# Import libraries and data

In [1]:
#import libraries
import pandas as pd
#pyarrow is never referenced by name in the visible cells; presumably it is imported so the parquet
#engine used by pd.read_parquet / DataFrame.to_parquet is known to be installed - TODO confirm
import pyarrow
import glob
import warnings
#silence every Python warning for the rest of the session
#NOTE(review): this blanket filter also hides any warning pandas raises in the cells below
warnings.filterwarnings('ignore')

In [2]:
#notebook-wide pandas display settings
#render up to 500 columns whenever a dataframe is displayed
pd.options.display.max_columns = 500
#render every float with exactly two decimals
pd.options.display.float_format = '{:.2f}'.format

In [3]:
#merge all files ending with parquet in "C:/Users/Anaïs WERNLE/Desktop/NYC trips/green" and store data in greendf_merged
#NOTE(review): the folder is an absolute path on one machine; anyone else running the notebook has to edit it
#sorted() makes the load order (and therefore the row order) identical on every run; glob itself guarantees no order
green = sorted(glob.glob("C:/Users/Anaïs WERNLE/Desktop/NYC trips/green/*.parquet"))
#fail with an explicit message rather than the generic error pd.concat raises when given an empty list
if not green:
    raise FileNotFoundError('No .parquet file found in "C:/Users/Anaïs WERNLE/Desktop/NYC trips/green"')
#ignore_index=True gives the merged frame one unique 0..n-1 index instead of repeating each file's own row labels
greendf_merged = pd.concat([pd.read_parquet(f) for f in green], ignore_index=True)

# Data cleaning : identify and delete anomalies
### e.g. rides that last longer than 4 hours, more than 4 passengers in a taxi ...

In [4]:
#summary statistics for every column (include='all' covers non-numeric columns as well)
#used to look for anomalies such as empty columns, minimums that are too low or maximums that are too high
greendf_merged.describe(include='all')

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,1068755.0,1068755,1068755,656321,656321.0,1068755.0,1068755.0,656321.0,1068755.0,1068755.0,1068755.0,1068755.0,1068755.0,1068755.0,0.0,1068755.0,1068755.0,656321.0,656321.0,656321.0
unique,,872520,873658,2,,,,,,,,,,,0.0,,,,,
top,,2021-08-11 12:37:00,2021-11-06 00:00:00,N,,,,,,,,,,,,,,,,
freq,,16,12,651335,,,,,,,,,,,,,,,,
first,,2008-12-31 19:16:53,2008-12-31 19:34:36,,,,,,,,,,,,,,,,,
last,,2021-12-31 23:59:00,2022-01-01 21:05:53,,,,,,,,,,,,,,,,,
mean,1.9,,,,1.17,109.14,132.84,1.27,147.57,20.14,1.11,0.29,1.15,0.58,,0.3,23.93,1.41,1.04,0.64
std,0.3,,,,0.93,70.34,76.83,0.9,4121.76,15.94,1.34,0.25,2.5,1.94,,0.03,17.48,0.51,0.19,1.16
min,1.0,,,,1.0,1.0,1.0,0.0,0.0,-300.0,-5.5,-0.5,-86.0,0.0,,-0.3,-300.8,1.0,1.0,-2.75
25%,2.0,,,,1.0,61.0,69.0,1.0,1.34,9.0,0.0,0.0,0.0,0.0,,0.3,11.8,1.0,1.0,0.0


In [5]:
#make sure lpep_pickup_datetime and lpep_dropoff_datetime are datetimes (necessary for the comparisons and .dt filters below)
#strftime directives are a single letter each (%Y-%m-%d); a pattern such as '%YYYY-%MM-%DD' is not a valid format
greendf_merged['lpep_pickup_datetime'] = pd.to_datetime(greendf_merged['lpep_pickup_datetime'], format='%Y-%m-%d %H:%M:%S')
greendf_merged['lpep_dropoff_datetime'] = pd.to_datetime(greendf_merged['lpep_dropoff_datetime'], format='%Y-%m-%d %H:%M:%S')

#keep only the rows where
# - pickup happens strictly before dropoff (rows with identical timestamps are removed too)
# - the dropoff falls in 2021; the filter is on the dropoff year only, so a ride that starts
#   late on 2020-12-31 and ends in 2021 is kept, and one that ends in 2022 is removed
#.copy() makes the result an independent dataframe, so the columns added in later cells are not written to a slice
greendf_merged = greendf_merged[
    (greendf_merged['lpep_pickup_datetime'] < greendf_merged['lpep_dropoff_datetime'])
    & (greendf_merged['lpep_dropoff_datetime'].dt.year == 2021)
].copy()

In [6]:
#add three columns: "pickup_time", "dropoff_time" and "trip_duration_min"
# - pickup_time / dropoff_time hold the time of day as datetime64 values anchored on 1900-01-01,
#   the date pandas uses when it parses a bare '%H:%M:%S' string. The time of day is computed
#   arithmetically (timestamp minus its own midnight) instead of converting every value to a
#   Python time object and parsing it back, which is slow on an object-dtype column.
# - trip_duration_min is the time between pick-up and drop-off, in minutes
#.assign returns a new dataframe, so nothing is written into a filtered slice
greendf_merged = greendf_merged.assign(
    pickup_time=lambda df: pd.Timestamp('1900-01-01')
    + (df['lpep_pickup_datetime'] - df['lpep_pickup_datetime'].dt.normalize()),
    dropoff_time=lambda df: pd.Timestamp('1900-01-01')
    + (df['lpep_dropoff_datetime'] - df['lpep_dropoff_datetime'].dt.normalize()),
    trip_duration_min=lambda df: df['lpep_dropoff_datetime']
    .sub(df['lpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60),
)

In [7]:
#cleaning thresholds, named and grouped here so they can be reviewed and tuned in one place
MIN_TRIP_DURATION_MIN = 1          #rides shorter than one minute are removed
MAX_TRIP_DURATION_MIN = 240        #4 hours; rides lasting this long or longer are removed
MIN_TOTAL_AMOUNT = 2.50            #minimum price of a ride in NY, in dollars
MAX_TRIP_DISTANCE = 50             #same unit as trip_distance
UNKNOWN_LOCATION_IDS = [264, 265]  #location ids corresponding to an unknown area
MAX_PRICE_BY_MIN = 5               #dollars per minute
MAX_AVG_SPEED = 100                #same unit as avg_speed_miles

#keep only the rides that last at least 1 minute and less than 4 hours
greendf_merged = greendf_merged[(greendf_merged['trip_duration_min'] >= MIN_TRIP_DURATION_MIN)
                                & (greendf_merged['trip_duration_min'] < MAX_TRIP_DURATION_MIN)]

#delete all rows with a ride that costs less than $2.50, which is the minimum price in NY
greendf_merged = greendf_merged[greendf_merged['total_amount'] >= MIN_TOTAL_AMOUNT]

#delete all rows with a ride with trip_distance > 50
#NOTE(review): trip_distance is treated as miles by the speed calculation below - confirm the unit of this limit
greendf_merged = greendf_merged[greendf_merged['trip_distance'] <= MAX_TRIP_DISTANCE]

#delete all rows with a pickup or dropoff location id corresponding to an unknown area (264 or 265)
greendf_merged = greendf_merged[~greendf_merged['PULocationID'].isin(UNKNOWN_LOCATION_IDS)
                                & ~greendf_merged['DOLocationID'].isin(UNKNOWN_LOCATION_IDS)]

#calculate the price by minute of each ride
#.assign returns a new dataframe instead of writing a column into a filtered slice
greendf_merged = greendf_merged.assign(price_by_min=lambda df: df['total_amount'] / df['trip_duration_min'])

#keep only the rides with a price by minute strictly between $0 and $5
#the median price of all rides is $1.45 by minute (to calculate it : median_price_minutes = (greendf_merged['total_amount']/greendf_merged['trip_duration_min']).median())
greendf_merged = greendf_merged[(greendf_merged['price_by_min'] > 0)
                                & (greendf_merged['price_by_min'] < MAX_PRICE_BY_MIN)]

#calculate average speed of each ride (trip_distance per hour)
greendf_merged = greendf_merged.assign(avg_speed_miles=lambda df: df['trip_distance'] / (df['trip_duration_min'] / 60))

#delete all rows with an average speed of 100 mph or more (the speed limit in NY state is 55 mph)
#NOTE(review): this step was previously described with a 70 mph limit while the code filtered at 100;
#the value used by the code is kept so the results below do not change - confirm the intended threshold
greendf_merged = greendf_merged[greendf_merged['avg_speed_miles'] < MAX_AVG_SPEED]

#keep only the rides with 1 to 4 passengers (passenger_count strictly between 0 and 5)
#rows where passenger_count is missing are removed by the same filter, because a comparison with NaN is False
#.copy() leaves an independent dataframe for the cells that follow
greendf_merged = greendf_merged[(greendf_merged['passenger_count'] > 0)
                                & (greendf_merged['passenger_count'] < 5)].copy()

In [8]:
#summary statistics of every column after cleaning, to check the effect of the filters above
greendf_merged.describe(include = 'all')

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,pickup_time,dropoff_time,trip_duration_min,price_by_min,avg_speed_miles
count,597787.0,597787,597787,597787,597787.0,597787.0,597787.0,597787.0,597787.0,597787.0,597787.0,597787.0,597787.0,597787.0,0.0,597787.0,597787.0,597787.0,597787.0,597787.0,597787,597787,597787.0,597787.0,597787.0
unique,,588875,589373,2,,,,,,,,,,,0.0,,,,,,78858,78426,,,
top,,2021-06-14 09:59:53,2021-06-03 18:10:21,N,,,,,,,,,,,,,,,,,1900-01-01 15:18:53,1900-01-01 00:00:00,,,
freq,,4,3,593551,,,,,,,,,,,,,,,,,30,166,,,
first,,2020-12-31 23:57:51,2021-01-01 00:04:56,,,,,,,,,,,,,,,,,,1900-01-01 00:00:00,1900-01-01 00:00:00,,,
last,,2021-12-31 23:54:19,2021-12-31 23:59:35,,,,,,,,,,,,,,,,,,1900-01-01 23:59:58,1900-01-01 23:59:59,,,
mean,1.83,,,,1.1,98.32,133.44,1.12,3.54,15.34,0.39,0.49,1.4,0.34,,0.3,18.82,1.4,1.02,0.66,,,17.35,1.25,11.84
std,0.38,,,,0.61,63.5,76.51,0.4,4.46,14.07,0.68,0.08,2.24,1.5,,0.01,15.51,0.5,0.14,1.18,,,16.59,0.47,6.09
min,1.0,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,2.75,1.0,1.0,0.0,,,1.0,0.02,0.0
25%,2.0,,,,1.0,55.0,74.0,1.0,1.2,7.5,0.0,0.5,0.0,0.0,,0.3,9.36,1.0,1.0,0.0,,,7.72,0.95,8.31


# Standardization : unify column names and change to the right data types 

### e.g. remove capital letters and spaces from column names, change ID columns to string format

In [9]:
#display the current column names, to see which ones need to be renamed or dropped
greendf_merged.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge',
       'pickup_time', 'dropoff_time', 'trip_duration_min', 'price_by_min',
       'avg_speed_miles'],
      dtype='object')

In [10]:
#standardize the columns in one chain that returns a new dataframe (no inplace modification):
# 1. lower the case of all column names
# 2. drop the columns that are not kept; the vendor and rate code ids are dropped
#    directly under their lowercased names, since renaming them first would serve no purpose
# 3. give the two location id columns readable names
greendf_merged = (
    greendf_merged
    .rename(columns=str.lower)
    .drop(columns=["vendorid", "store_and_fwd_flag", "payment_type",
                   "improvement_surcharge", "tolls_amount", "fare_amount", "extra",
                   "congestion_surcharge", "ratecodeid", "mta_tax"])
    .rename(columns={'pulocationid': 'pickup_location_id', 'dolocationid': 'dropoff_location_id'})
)

In [11]:
#display the data type of every column, to find the ones that need to be converted
greendf_merged.dtypes

lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
pickup_location_id                int64
dropoff_location_id               int64
passenger_count                 float64
trip_distance                   float64
tip_amount                      float64
ehail_fee                        object
total_amount                    float64
trip_type                       float64
pickup_time              datetime64[ns]
dropoff_time             datetime64[ns]
trip_duration_min               float64
price_by_min                    float64
avg_speed_miles                 float64
dtype: object

In [12]:
#convert the columns to their final types in a single call:
#passenger_count becomes an integer, the two location ids become strings
greendf_merged = greendf_merged.astype({
    "passenger_count": int,
    "pickup_location_id": str,
    "dropoff_location_id": str,
})

In [13]:
#export the cleaned data to the current working directory
#the file is written by to_parquet with gzip compression, so despite its '.gzip' name it is a
#Parquet file and has to be read back with pd.read_parquet
greendf_merged.to_parquet('green.gzip', compression='gzip')