In [80]:
import numpy as np
import pandas as pd
import os

In [81]:
# Collect only the per-month "*_cleaned.parquet" inputs from the working directory.
# Matching every ".parquet" file would also pick up the merged output file that a
# later cell writes into this same directory, so re-running the notebook would
# ingest it as an input and duplicate every row.
# Note: the list is sorted alphabetically by file name, not chronologically.
files = sorted(f for f in os.listdir('.') if f.endswith('_cleaned.parquet'))

print('Files to process:', files)

Files to process: ['apr_2025_cleaned.parquet', 'aug_2025_cleaned.parquet', 'dec_2024_cleaned.parquet', 'feb_2025_cleaned.parquet', 'jan_2025_cleaned.parquet', 'jul_2025_cleaned.parquet', 'jun_2025_cleaned.parquet', 'mar_2025_cleaned.parquet', 'may_2025_cleaned.parquet', 'nov_2025_cleaned.parquet', 'oct_2025_cleaned.parquet', 'sep_2025_cleaned.parquet']


In [84]:
# Columns read from each monthly file; any other column is not loaded.
columns_to_load = [
    "pickup_datetime",
    "dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "pu_location_id",
    "do_location_id",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "total_amount",
    "airport_fee"
]

# Compact dtypes applied to every monthly frame before concatenation.
# NOTE(review): the int8 cast raises if passenger_count contains missing values;
# presumably the "cleaned" inputs have none -- confirm.
column_dtypes = {
    "trip_distance": "float32",
    "fare_amount": "float32",
    "extra": "float32",
    "mta_tax": "float32",
    "tip_amount": "float32",
    "tolls_amount": "float32",
    "total_amount": "float32",
    "passenger_count": "int8",
    "airport_fee": "float32",
}

# Defined before the loop so the merged file can be skipped when it is already
# present in the input list (it is written into the same directory the inputs
# are listed from, so a re-run could otherwise ingest it and duplicate rows).
# NOTE(review): the name says jan-nov 2025, but the inputs also include
# dec_2024_cleaned.parquet -- confirm whether the name or the inputs are intended.
output_file = 'nyc_taxi_jan_2025_nov_2025.parquet'

dfs = []

for f in files:
    if f == output_file:
        print(f'Skipping {f} (merged output of a previous run)')
        continue

    print(f'Processing {f} ...')

    temp = pd.read_parquet(f, columns=columns_to_load)
    temp['pickup_datetime'] = pd.to_datetime(temp['pickup_datetime'])
    temp['dropoff_datetime'] = pd.to_datetime(temp['dropoff_datetime'])

    # Single cast for all numeric columns instead of one assignment per column.
    temp = temp.astype(column_dtypes)

    # Derive additional columns useful for EDA/statistics
    # trip_duration is expressed in minutes (seconds / 60).
    temp['trip_duration'] = (temp['dropoff_datetime'] - temp['pickup_datetime']).dt.total_seconds() / 60
    temp['year'] = temp['pickup_datetime'].dt.year
    temp['month'] = temp['pickup_datetime'].dt.month
    temp['day_of_week'] = temp['pickup_datetime'].dt.dayofweek
    temp['hour'] = temp['pickup_datetime'].dt.hour
    # pandas dayofweek numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    temp['is_weekend'] = temp['day_of_week'].isin([5, 6])

    dfs.append(temp)
    del temp

# Fail with an explicit message rather than pd.concat's generic
# "No objects to concatenate" when no input file was found.
if not dfs:
    raise ValueError('No input parquet files to merge; check the working directory.')

print('Concatenating all months....')
df = pd.concat(dfs, ignore_index=True)
del dfs

df.to_parquet(output_file, engine='pyarrow', index=False, compression='snappy')
print(f'Final Parquet saved as : {output_file}')

Processing apr_2025_cleaned.parquet ...
Processing aug_2025_cleaned.parquet ...
Processing dec_2024_cleaned.parquet ...
Processing feb_2025_cleaned.parquet ...
Processing jan_2025_cleaned.parquet ...
Processing jul_2025_cleaned.parquet ...
Processing jun_2025_cleaned.parquet ...
Processing mar_2025_cleaned.parquet ...
Processing may_2025_cleaned.parquet ...
Processing nov_2025_cleaned.parquet ...
Processing oct_2025_cleaned.parquet ...
Processing sep_2025_cleaned.parquet ...
Concatenating all months....
Final Parquet saved as : nyc_taxi_jan_2025_nov_2025.parquet


In [85]:
# All monthly datasets have now been merged into a single Parquet file.