In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from typing import List

In [3]:
def parquet_to_data_frame(data_folder_path: str) -> List[pd.DataFrame]:
    '''
    This function reads through all the parquet files in the provided folder path and converts them into a list of data frames.
    '''
    data_frame_list = []
    data_year = 2023
    month_name_number_dict_map = {1: 'January',
                              2: 'February',
                              3: 'March',
                              4: 'April',
                              5: 'May',
                              6: 'June',
                              7: 'July',
                              8: 'August',
                              9: 'September',
                              10: 'October',
                              11: 'November',
                              12: 'December'}
    
    for data_month in range(1, 13):
        print(f'Converting parquet file to data frame for month:{month_name_number_dict_map[data_month]} {data_year}!')
        parquet_file_path = data_folder_path / f'taxi_rides_ingested_{data_year}_{data_month:02}.parquet'
        data_table = pq.read_table(parquet_file_path)
        data_frame = data_table.to_pandas()
        print(f'Dimensions of data frame for {month_name_number_dict_map[data_month]} {data_year}: {data_frame.shape}')
        data_frame_list.append(data_frame)

    return data_frame_list

In [4]:
raw_data_folder_path = Path('..') / 'data' / 'raw'
taxi_rides_data = pd.concat(parquet_to_data_frame(raw_data_folder_path))
print(f'Dimensions of taxi rides data frame: {taxi_rides_data.shape}')

Converting parquet file to data frame for month:January 2023!
Dimensions of data frame for January 2023: (3066766, 19)
Converting parquet file to data frame for month:February 2023!
Dimensions of data frame for February 2023: (2913955, 19)
Converting parquet file to data frame for month:March 2023!
Dimensions of data frame for March 2023: (3403766, 19)
Converting parquet file to data frame for month:April 2023!
Dimensions of data frame for April 2023: (3288250, 19)
Converting parquet file to data frame for month:May 2023!
Dimensions of data frame for May 2023: (3513649, 19)
Converting parquet file to data frame for month:June 2023!
Dimensions of data frame for June 2023: (3307234, 19)
Converting parquet file to data frame for month:July 2023!
Dimensions of data frame for July 2023: (2907108, 19)
Converting parquet file to data frame for month:August 2023!
Dimensions of data frame for August 2023: (2824209, 19)
Converting parquet file to data frame for month:September 2023!
Dimensions o

In [5]:
taxi_rides_data.head()
taxi_rides_data['airport_fee'] = taxi_rides_data['airport_fee'].add(taxi_rides_data['Airport_fee'], fill_value = 0)
taxi_rides_data = taxi_rides_data.drop(['Airport_fee'], axis = 1)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,


In [6]:
taxi_rides_data.shape
taxi_rides_data.head()

(38310226, 19)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [7]:
taxi_rides_data_cp_1 = taxi_rides_data.copy()
taxi_rides_data_cp_1['ride_duration'] = taxi_rides_data_cp_1['tpep_dropoff_datetime'] - taxi_rides_data_cp_1['tpep_pickup_datetime']
taxi_rides_data_cp_1.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,ride_duration
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,0 days 00:08:26
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,0 days 00:06:19
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,0 days 00:12:45
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,0 days 00:09:37
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,0 days 00:10:50


In [8]:
taxi_rides_data_cp_1['ride_duration'].describe().T

count                  38310226
mean     0 days 00:16:43.010948
std      3 days 04:04:04.525103
min       -19617 days +03:01:32
25%             0 days 00:07:39
50%             0 days 00:12:40
75%             0 days 00:20:42
max             6 days 23:09:11
Name: ride_duration, dtype: object

In [9]:
taxi_rides_data_cp_1['ride_duration'].quantile(0)
taxi_rides_data_cp_1['ride_duration'].quantile(0.01)
taxi_rides_data_cp_1['ride_duration'].quantile(0.995)
taxi_rides_data_cp_1['ride_duration'].quantile(0.999)

Timedelta('-19617 days +03:01:32')

Timedelta('0 days 00:00:40')

Timedelta('0 days 01:18:10')

Timedelta('0 days 02:20:53')

In [10]:
taxi_rides_duration_filter = (taxi_rides_data_cp_1['ride_duration'] > pd.Timedelta(0)) & (taxi_rides_data_cp_1['ride_duration'] <= pd.Timedelta(hours = 6))
taxi_rides_duration_info_loss = (sum(~taxi_rides_duration_filter) / taxi_rides_data_cp_1.shape[0]) * 100
taxi_rides_duration_info_loss

0.12451244740764515

In [11]:
taxi_rides_data_cp_1['total_amount'].describe().T

count    3.831023e+07
mean     2.846194e+01
std      7.712821e+01
min     -1.094050e+03
25%      1.595000e+01
50%      2.100000e+01
75%      3.072000e+01
max      3.869876e+05
Name: total_amount, dtype: float64

In [12]:
taxi_rides_data_cp_1['total_amount'].quantile(0)
taxi_rides_data_cp_1['total_amount'].quantile(0.01)
taxi_rides_data_cp_1['total_amount'].quantile(0.995)
taxi_rides_data_cp_1['total_amount'].quantile(0.999)

np.float64(-1094.05)

np.float64(1.0)

np.float64(117.96)

np.float64(181.3577500000596)

In [13]:
taxi_rides_total_amount_filter = (taxi_rides_data_cp_1['total_amount'] > 0) & (taxi_rides_data_cp_1['total_amount'] <= 
                                                                               taxi_rides_data_cp_1['total_amount'].quantile(0.999))
taxi_rides_total_amount_info_loss = (sum(~taxi_rides_total_amount_filter) / taxi_rides_data_cp_1.shape[0]) * 100
taxi_rides_total_amount_info_loss

1.0994270824714008

In [14]:
taxi_rides_nyc_pu_locations_filter = ~(taxi_rides_data_cp_1['PULocationID'].isin((1, 264, 265)))
taxi_rides_nyc_pu_locations_info_loss = (sum(~taxi_rides_nyc_pu_locations_filter) / taxi_rides_data_cp_1.shape[0]) * 100
taxi_rides_nyc_pu_locations_info_loss

1.0240712231768092

In [15]:
taxi_rides_sorted_date_asce = taxi_rides_data_cp_1.sort_values(by = 'tpep_pickup_datetime', ascending = True)
taxi_rides_sorted_date_asce.head(10)
taxi_rides_sorted_date_asce.tail(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,ride_duration
1262241,2,2001-01-01 00:06:49,2001-01-01 15:42:11,2.0,7.41,1.0,N,161,231,1,35.9,0.0,0.5,7.0,0.0,1.0,46.9,2.5,0.0,0 days 15:35:22
3033658,2,2001-01-01 00:07:36,2001-01-01 00:16:31,1.0,1.68,1.0,N,239,43,2,10.7,0.0,0.5,0.0,0.0,1.0,14.7,2.5,0.0,0 days 00:08:55
2932889,2,2001-01-01 00:08:31,2001-01-01 16:01:27,1.0,18.63,2.0,N,132,264,1,70.0,0.0,0.5,16.46,6.55,1.0,98.76,2.5,1.75,0 days 15:52:56
3127128,2,2001-01-01 00:08:42,2001-01-01 14:13:51,1.0,17.68,2.0,N,132,68,2,70.0,0.0,0.5,0.0,6.55,1.0,81.8,2.5,1.25,0 days 14:05:09
3040040,2,2001-01-01 00:28:40,2001-01-01 01:11:09,1.0,10.23,1.0,N,239,97,2,48.5,0.0,0.5,0.0,0.0,1.0,52.5,2.5,0.0,0 days 00:42:29
1491898,2,2001-01-01 15:18:51,2001-01-01 22:08:40,2.0,9.13,1.0,N,246,243,1,38.0,1.0,0.5,0.0,0.0,1.0,43.0,2.5,0.0,0 days 06:49:49
1128759,2,2002-12-31 22:16:54,2003-01-01 15:02:08,1.0,3.25,1.0,N,132,10,1,14.9,1.75,0.5,3.63,0.0,1.0,21.78,0.0,0.0,0 days 16:45:14
1360996,2,2002-12-31 22:27:05,2002-12-31 22:46:37,1.0,11.77,1.0,N,132,138,2,44.3,6.0,0.5,0.0,0.0,1.0,51.8,0.0,0.0,0 days 00:19:32
2531205,2,2002-12-31 23:03:18,2003-01-01 21:09:44,1.0,28.19,2.0,N,132,13,1,70.0,0.0,0.5,16.54,6.94,1.0,99.23,2.5,1.75,0 days 22:06:26
466306,2,2002-12-31 23:03:19,2003-01-01 11:52:29,1.0,18.13,2.0,N,132,161,1,70.0,0.0,0.5,16.46,6.55,1.0,98.76,2.5,1.75,0 days 12:49:10


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,ride_duration
3195694,2,2023-12-31 23:59:58,2024-01-01 00:12:49,1.0,1.07,1.0,N,161,229,2,12.1,1.0,0.5,0.0,0.0,1.0,17.1,2.5,0.0,0 days 00:12:51
3196226,2,2023-12-31 23:59:59,2024-01-01 00:27:43,2.0,1.46,1.0,N,186,164,2,22.6,1.0,0.5,0.0,0.0,1.0,27.6,2.5,0.0,0 days 00:27:44
3194016,2,2023-12-31 23:59:59,2024-01-01 00:06:28,1.0,1.38,1.0,N,142,238,1,9.3,1.0,0.5,3.58,0.0,1.0,17.88,2.5,0.0,0 days 00:06:29
3195959,1,2023-12-31 23:59:59,2024-01-01 00:07:56,2.0,2.7,1.0,N,237,151,1,12.8,3.5,0.5,3.55,0.0,1.0,21.35,2.5,0.0,0 days 00:07:57
3195659,2,2024-01-01 00:01:34,2024-01-01 00:05:41,1.0,0.71,1.0,N,234,68,2,6.5,1.0,0.5,0.0,0.0,1.0,11.5,2.5,0.0,0 days 00:04:07
3195759,2,2024-01-01 00:01:58,2024-01-01 00:10:04,3.0,1.7,1.0,N,234,144,1,10.7,1.0,0.5,2.36,0.0,1.0,18.06,2.5,0.0,0 days 00:08:06
2808900,2,2024-01-03 10:00:04,2024-01-03 11:08:22,1.0,21.6,1.0,N,132,136,1,82.1,0.0,0.5,18.46,6.94,1.0,110.75,0.0,1.75,0 days 01:08:18
2808901,2,2024-01-03 17:00:52,2024-01-03 17:01:05,2.0,0.0,5.0,N,265,265,1,120.0,0.0,0.0,0.0,0.0,1.0,121.0,0.0,0.0,0 days 00:00:13
2808902,2,2024-01-03 18:43:26,2024-01-03 18:43:29,2.0,0.01,5.0,N,95,95,1,86.69,0.0,0.0,17.54,0.0,1.0,105.23,0.0,0.0,0 days 00:00:03
2808903,2,2024-01-03 19:42:57,2024-01-03 20:15:55,1.0,16.67,1.0,N,132,165,1,65.3,0.0,0.5,5.0,0.0,1.0,73.55,0.0,1.75,0 days 00:32:58


In [16]:
taxi_rides_date_range_filter = (taxi_rides_data_cp_1['tpep_pickup_datetime'] >= '2023-01-01') & (taxi_rides_data_cp_1['tpep_pickup_datetime'] < '2024-01-01')
taxi_rides_date_range_info_loss = (sum(~taxi_rides_date_range_filter) / taxi_rides_data_cp_1.shape[0]) * 100
taxi_rides_date_range_info_loss

0.00027146798872969323

In [17]:
taxi_rides_filters = taxi_rides_duration_filter & taxi_rides_total_amount_filter & taxi_rides_nyc_pu_locations_filter & taxi_rides_date_range_filter
taxi_rides_data_dropped = taxi_rides_filters.shape[0] - sum(taxi_rides_filters)
taxi_rides_dropped_data_info_loss = (taxi_rides_data_dropped / taxi_rides_filters.shape[0]) * 100
taxi_rides_data_dropped
taxi_rides_dropped_data_info_loss

845967

2.208201538670119

In [18]:
taxi_rides_data = taxi_rides_data[taxi_rides_filters]
taxi_rides_data = taxi_rides_data[['tpep_pickup_datetime', 'PULocationID']]
taxi_rides_data.rename(columns = {'tpep_pickup_datetime': 'pick_date_time',
                                  'PULocationID': 'pickup_location_id'}, inplace = True)
taxi_rides_data.head()
processed_data_folder_path = Path('..') / 'data' / 'processed'
processed_data_file_path = processed_data_folder_path / f'taxi_rides_processed_2023.parquet'
taxi_rides_data.to_parquet(processed_data_file_path, engine = 'pyarrow', index = False)

Unnamed: 0,pick_date_time,pickup_location_id
0,2023-01-01 00:32:10,161
1,2023-01-01 00:55:08,43
2,2023-01-01 00:25:04,48
3,2023-01-01 00:03:48,138
4,2023-01-01 00:10:29,107
