### Import Libraries

In [1]:
import pandas as pd
import os
import calendar
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from pandas_profiling import ProfileReport
import numpy as np

### Pre-Processing: EDA Dataset

Wrangle each raw file separately (one file per month and year) and return the transformed pandas DataFrame, aggregated to the hour level.

In [8]:
# TLC location IDs that this analysis treats as the three NYC airports.
_AIRPORT_LOCATION_IDS = [1, 132, 138]

# payment_type code -> label; any code not listed falls back to 'Unknown'.
_PAYMENT_TYPE_LABELS = {
    1: 'Credit Card',
    2: 'Cash',
    3: 'No Charge',
    4: 'Dispute',
    5: 'Unknown',
    6: 'Voided Trip',
}

# Hour-of-day band edges; each band is half-open: [edge[i], edge[i + 1]).
_CONGESTION_EDGES = [0, 5, 8, 13, 17, 24]
_CONGESTION_LABELS = [
    'After Midnight Congestion',
    'Early Morning Congestion',
    'Leading to Noon Congestion',
    'Afternoon Congestion',
    'Evening Congestion',
]
_DAY_EDGES = [0, 6, 12, 18, 24]
_DAY_LABELS = ['Early Morning', 'Late Morning', 'Afternoon', 'Evening']


def _label_hour_bands(hours, edges, labels):
    """Label each hour with the half-open band [edges[i], edges[i+1]) it falls in ('Unknown' otherwise)."""
    conditions = [
        hours.between(lower, upper, inclusive='left')
        for lower, upper in zip(edges[:-1], edges[1:])
    ]
    return np.select(conditions, labels, default='Unknown')


def wrangle_file(file_path, year_of_file):
    """Clean one raw trip-record parquet file and aggregate it to the hour level.

    Parameters
    ----------
    file_path : str
        Path of the parquet file to read; also stored in the ``source_file`` column.
    year_of_file : int
        Year the file belongs to. Only pickups dated in ``year_of_file - 1`` or
        ``year_of_file`` are kept.

    Returns
    -------
    pandas.DataFrame
        One row per (pickup hour, payment type, pickup_from, dropoff_at) with the
        mean passenger count (floored to int), mean distance, fare, tip, duration
        (seconds) and total amount, the number of trips, the two hour-of-day
        categories and the source file path.
    """
    ## Read the file
    rides = pd.read_parquet(file_path, engine='pyarrow')

    ## Data Cleansing
    columns_drop = [
        'VendorID', 'airport_fee', 'RatecodeID', 'store_and_fwd_flag', 'extra',
        'mta_tax', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge',
    ]
    # Drop unused columns first so their missing values do not cause dropna to discard rows.
    rides = rides.drop(columns=columns_drop).dropna()

    # Keep pickups dated in the file's year or the year before, and rows with plausible values.
    is_valid = (
        rides['tpep_pickup_datetime'].dt.year.between(year_of_file - 1, year_of_file, inclusive='both')
        & (rides['total_amount'] > 0)
        & (rides['passenger_count'] > 0) & (rides['passenger_count'] < 7)
        & (rides['trip_distance'] > 0) & (rides['trip_distance'] <= 50)
        & (rides['fare_amount'] > 0)
        & (rides['tip_amount'] <= 100)
    )
    rides = rides[is_valid].copy()

    ## Transformations
    # Dropoff must be after pickup. total_seconds() keeps the sign and the day
    # component; `.dt.seconds` would turn a negative duration into a large
    # positive one and wrap trips longer than a day.
    rides['trip_duration'] = (
        rides['tpep_dropoff_datetime'] - rides['tpep_pickup_datetime']
    ).dt.total_seconds()
    rides = rides[rides['trip_duration'] > 0].copy()

    # Round observations to the level of hour
    rides['pickup_datetime'] = rides['tpep_pickup_datetime'].dt.floor('h')

    # Pickup & dropoff zone: airport vs. everything else
    rides['pickup_from'] = np.where(
        rides['PULocationID'].isin(_AIRPORT_LOCATION_IDS), 'Airport', 'Other'
    )
    rides['dropoff_at'] = np.where(
        rides['DOLocationID'].isin(_AIRPORT_LOCATION_IDS), 'Airport', 'Other'
    )

    # Each row is one trip; summing this column yields the trip count per group.
    rides['trips'] = 1

    # Aggregate the data at the hour level
    df_hourly = (
        rides.groupby(['pickup_datetime', 'payment_type', 'pickup_from', 'dropoff_at'])
        .agg(
            passenger_count=('passenger_count', 'mean'),
            trip_distance=('trip_distance', 'mean'),
            fare_amount=('fare_amount', 'mean'),
            tip_amount=('tip_amount', 'mean'),
            trip_duration=('trip_duration', 'mean'),
            total_amount=('total_amount', 'mean'),
            trips=('trips', 'sum'),
        )
        .reset_index()
    )

    # Get the floor of the average of passengers
    df_hourly['passenger_count'] = np.floor(df_hourly['passenger_count']).astype('int')

    # New categorical variables derived from the pickup hour
    pickup_hour = df_hourly['pickup_datetime'].dt.hour
    df_hourly['congestion_category'] = _label_hour_bands(
        pickup_hour, _CONGESTION_EDGES, _CONGESTION_LABELS
    )
    df_hourly['day_category'] = _label_hour_bands(pickup_hour, _DAY_EDGES, _DAY_LABELS)

    # Replace the numeric payment code with its label
    df_hourly['payment_type'] = (
        df_hourly['payment_type'].map(_PAYMENT_TYPE_LABELS).fillna('Unknown')
    )

    df_hourly['source_file'] = file_path

    return df_hourly

Apply the function above to every raw file and concatenate the results into a single combined DataFrame.

In [5]:
## Read the files
# Zone lookup is loaded for reference only; the join against it in wrangle_file is commented out.
df_zones = pd.read_csv(r"F:\BFD Project Data\Complementary\taxi+_zone_lookup.csv")

# Raw data layout: <file_path>\<year>\<one parquet file per month>
file_path = r'F:\BFD Project Data\Raw'

# Wrangle every file, collecting the hourly frames in a list so they are
# concatenated once at the end (concatenating inside the loop re-copies all
# previously accumulated rows on every iteration).
wrangled_frames = []
for year in sorted(os.listdir(file_path)):
    year_dir = os.path.join(file_path, year)
    for file in sorted(os.listdir(year_dir)):
        path = os.path.join(year_dir, file)
        # The folder name is the year of the file.
        wrangled_frames.append(wrangle_file(path, year_of_file=int(year)))

# pd.concat raises on an empty list, so fall back to an empty frame when no files were found.
df_hourly_combined = pd.concat(wrangled_frames) if wrangled_frames else pd.DataFrame()

# df_keep = df_hourly_combined.copy()

Apply the final filters to the combined dataset to produce the EDA dataset, then export it for analysis.

In [9]:
## Final Processing
# NOTE: each row is an hourly aggregate, so the thresholds below apply to the
# hourly *mean* total amount / duration / passenger count, not to single trips.
hourly_means_in_range = (
    # Mean total_amount in (0, 150] dollars
    df_hourly_combined['total_amount'].between(0, 150, inclusive='right')
    # Mean trip_duration in (0, 5400] seconds, i.e. at most an hour and 30 mins
    & df_hourly_combined['trip_duration'].between(0, 5400, inclusive='right')
    # After flooring the mean passenger count, remove rows with no passengers
    & (df_hourly_combined['passenger_count'] > 0)
)

# Filter, then sort by pickup_datetime
df_hourly_combined = df_hourly_combined[hourly_means_in_range].sort_values('pickup_datetime')

## Write to File
# Sequential row id in sorted order. The previous `set_index(...)` call was
# dropped: its result was never assigned, and the CSV is written without the
# index anyway.
df_hourly_combined['id'] = np.arange(len(df_hourly_combined))
path_file_output = r'F:\BFD Project Data\Processed\yellow_taxi_data_hourly_v6.csv'
df_hourly_combined.to_csv(path_file_output, index=False)

### Pre-Processing: Modelling Dataset

In [2]:
## Post EDA Processing
# Reload the exported EDA dataset so this section can run without redoing the wrangling above.
path_file_output = r'F:\BFD Project Data\Processed\yellow_taxi_data_hourly_v6.csv'

# `infer_datetime_format` is deprecated in pandas 2.x; parse_dates alone parses the column.
df_hourly = pd.read_csv(path_file_output, parse_dates=['pickup_datetime'])

# Grouping keys for the modelling datasets: "<year>, <hour> hrs" and "<month name>, <year>"
df_hourly['pickup_hour_year'] = df_hourly['pickup_datetime'].dt.strftime('%Y, %H hrs')
df_hourly['pickup_month_year'] = df_hourly['pickup_datetime'].dt.strftime('%B, %Y')

In [23]:

## Write to File (Ready for Modelling Datasets)
# For each metric, average it per (year, hour-of-day) key and export the result.
# The column is selected explicitly before calling mean(): the first positional
# argument of GroupBy.mean is `numeric_only`, not a column name.
# NOTE(review): this is an unweighted mean over the hourly aggregate rows — confirm that is intended.
df_hourly_amount_per_year = (
    df_hourly.groupby('pickup_hour_year', as_index=False)['total_amount'].mean()
)
df_hourly_amount_per_year.to_csv(
    r'F:\BFD Project Data\Processed\hourly_amount_per_year.csv', index=False
)

df_hourly_trips_per_year = (
    df_hourly.groupby('pickup_hour_year', as_index=False)['trips'].mean()
)
df_hourly_trips_per_year.to_csv(
    r'F:\BFD Project Data\Processed\hourly_trips_per_year.csv', index=False
)

# df_hourly_amount_per_year_pickup = df_hourly[['pickup_hour_year', 'pickup_from', 'total_amount']]
# df_hourly_amount_per_year_pickup.to_csv('F:\BFD Project Data\Processed\hourly_amount_per_year_pickup.csv', index=False)

# df_monthly_trips_per_year = df_hourly[['pickup_month_year', 'trips']]
# df_monthly_trips_per_year.to_csv('F:\BFD Project Data\Processed\monthly_trips_per_year.csv', index=False)

This function removes boxplot (IQR) outliers from one column, computed separately within each partition of another column. It is not currently in use.

In [104]:
# def remove_outliers(df, column, partition_col):
#     df_result = pd.DataFrame()
#     partitions = df[partition_col].drop_duplicates().to_list()
#     for partition in partitions:
#         df_filtered = df[df[partition_col]==partition]
#         q1 = df_filtered[column].quantile(q=0.25)
#         q3 = df_filtered[column].quantile(q=0.75)
#         iqr = q3 - q1   
#         min = q1 - 1.5 * iqr
#         max = q3 + 1.5 * iqr
#         #df = df.drop(axis=0)
#         df_filtered = df_filtered[(df[partition_col]==partition) & (df[column].between(min, max))]
#         pd.concat(df_result, df_filtered)
        
#         print(partition)
#         print('Q1: {0}, Q3: {1}, IQR: {2}, Min: {3}, Max: {4}'.format(q1, q3, iqr, min, max))
#         print(df_filtered[column].describe())
#         print()
    
#     return df_filtered

### Pandas Profiler: EDA

In [3]:
# Build the profiling report for the hourly dataset and render it as notebook widgets.
report_title = 'Pandas Profiling Report'
profile = ProfileReport(df_hourly, title=report_title)
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  return func(*args, **kwargs)


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…