# Dependency

To read and write Parquet files in Python, first install pandas and a Parquet engine library (here, pyarrow),
e.g. with `pip install pandas pyarrow`.

In [None]:
!pip install pandas pyarrow



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the January 2024 yellow-taxi trip records and discard every row that
# contains at least one missing value, then preview the first two rows.
df = pd.read_parquet('./yellow_tripdata_2024-01.parquet').dropna()
df.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0


In [None]:
# Keep only trips strictly below the 99th percentile of trip distance.
# NOTE: `df` is reassigned here, so re-running this cell trims the data again.
distance_cutoff = df['trip_distance'].quantile(0.99)
df = df[df['trip_distance'] < distance_cutoff]

# Normalised histogram of the remaining trip distances.
ax = df['trip_distance'].plot(kind='hist', bins=100, density=True)
ax.set_xlabel('Trip Distance')
ax.set_ylabel('Density')
ax.set_title('Trip Distance Distribution')
plt.show()

In [None]:
# Load the taxi-zone shapefile into a GeoDataFrame; later cells read its
# 'LocationID' and 'zone' columns to translate location IDs into zone names.
taxi_zones = gpd.read_file('./taxi_zones.shp')
taxi_zones

In [None]:
# Draw the zone geometries as a bare map (no axes, ticks or frame).
ax = taxi_zones.plot()
ax.set_title('NYC Taxi Zones')
ax.set_axis_off()
plt.show()

In [None]:
# Number of trips for every (pickup zone, drop-off zone) pair.
route_sizes = df.groupby(['PULocationID', 'DOLocationID']).size()
df_grouped = route_sizes.reset_index(name='count')

# LocationID -> zone name lookup taken from the taxi-zone table.
map_name = taxi_zones.set_index('LocationID')['zone'].to_dict()

# Attach readable names for both ends of each route, then drop routes with
# any missing value (e.g. a location ID that has no entry in the lookup).
df_grouped = df_grouped.assign(
    PULocation_name=df_grouped['PULocationID'].map(map_name),
    DOLocation_name=df_grouped['DOLocationID'].map(map_name),
).dropna()
df_grouped

## Randomly select K departure and destination locations to simplify the problem.

In [None]:
# Route sample #1 (seed 0): draw k routes from the band between the 80th and
# 99th percentile of trip count (both bounds inclusive).
np.random.seed(0)
k = 5
route_counts = df_grouped['count']
count_lower = route_counts.quantile(0.8)
count_upper = route_counts.quantile(0.99)
df_grouped_middle = df_grouped[route_counts.between(count_lower, count_upper)]

random_rows = df_grouped_middle.sample(k)
random_k_PU = random_rows['PULocationID'].values
random_k_DO = random_rows['DOLocationID'].values
random_k_PU_name = random_rows['PULocation_name'].values
random_k_DO_name = random_rows['DOLocation_name'].values
random_k_count = random_rows['count'].values

# Keep the sampled IDs under stable names; the random_k_* variables are
# reassigned by the next sampling cell.
set_1_pu, set_1_do = random_k_PU, random_k_DO
random_k_PU, random_k_DO, random_k_PU_name, random_k_DO_name, random_k_count

In [None]:
# Route sample #2 (seed 1): same procedure as the seed-0 sample, i.e. draw k
# routes whose trip count lies between the 80th and 99th percentile (inclusive).
np.random.seed(1)
k = 5
route_counts = df_grouped['count']
count_lower = route_counts.quantile(0.8)
count_upper = route_counts.quantile(0.99)
df_grouped_middle = df_grouped[route_counts.between(count_lower, count_upper)]

random_rows = df_grouped_middle.sample(k)
random_k_PU = random_rows['PULocationID'].values
random_k_DO = random_rows['DOLocationID'].values
random_k_PU_name = random_rows['PULocation_name'].values
random_k_DO_name = random_rows['DOLocation_name'].values
random_k_count = random_rows['count'].values

# Keep the sampled IDs under stable names for the classification experiments.
set_2_pu, set_2_do = random_k_PU, random_k_DO
random_k_PU, random_k_DO, random_k_PU_name, random_k_DO_name, random_k_count

In [None]:
# Show the pickup / drop-off location IDs of both sampled route sets side by side.
set_1_pu, set_1_do, set_2_pu, set_2_do

# Classification

In [None]:
def preset(df_raw, set_pu, set_do):
    """Build one-week train and test splits restricted to the given zones.

    A re-indexed copy of ``df_raw`` is used: the parsed
    ``tpep_pickup_datetime`` becomes the index (named ``pickup_datetime``)
    and rows are sorted chronologically. ``df_raw`` itself is not modified.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Trip records; the columns ``tpep_pickup_datetime``, ``PULocationID``
        and ``DOLocationID`` are read.
    set_pu, set_do : list-like
        Pickup and drop-off location IDs to keep. They are applied as two
        independent membership filters, not as (pickup, drop-off) pairs.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``: trips picked up on 2024-01-15..2024-01-21 and on
        2024-01-22..2024-01-28 respectively. The test split is further
        limited to drop-off IDs that occur in the training split.

    Side effects
    ------------
    Prints the sizes of both splits and the training columns.
    """
    indexed = (
        df_raw
        .assign(pickup_datetime=pd.to_datetime(df_raw['tpep_pickup_datetime']))
        .set_index('pickup_datetime')
        .sort_index()
    )

    def _restrict(window):
        # Keep trips whose drop-off AND pickup zones are both in the chosen sets.
        keep = window['DOLocationID'].isin(set_do) & window['PULocationID'].isin(set_pu)
        return window[keep]

    train = _restrict(indexed.loc['2024-01-15':'2024-01-21'])
    test = _restrict(indexed.loc['2024-01-22':'2024-01-28'])

    # Drop test rows whose label never appears in the training data.
    seen_in_train = test['DOLocationID'].isin(train['DOLocationID'])
    test = test[seen_in_train]

    print(len(train), len(test))
    print(train.columns)
    return train, test

def extend_data(df_train, df_test):
    """Add time-of-day / weekday features and one-hot encode the categoricals.

    Both frames must have a ``DatetimeIndex`` and a ``PULocationID`` column.
    The two frames are encoded together so that they end up with exactly the
    same dummy columns, then split back into train and test by row position.

    Time-of-day buckets are left-closed, so every hour 0..23 lands in one:
    ``night`` = 00-05, ``morning`` = 06-11, ``afternoon`` = 12-17,
    ``evening`` = 18-23. (With right-closed bins starting at 0, hour 0 would
    fall outside every bin and be encoded as all zeros.)

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Feature frames. They are not modified; copies are used internally.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)`` with ``hour_*``, ``weekday_*`` and
        ``PULocationID_*`` indicator columns in place of the raw values.
    """
    hour_bins = [0, 6, 12, 18, 24]
    hour_labels = ['night', 'morning', 'afternoon', 'evening']

    def _with_time_features(frame):
        # Work on a copy so the caller's frame (possibly a slice of a larger
        # frame) is never written to.
        frame = frame.copy()
        frame['hour'] = pd.cut(frame.index.hour, bins=hour_bins,
                               labels=hour_labels, right=False)
        frame['weekday'] = frame.index.weekday
        return frame

    n_train = len(df_train)
    merged = pd.concat(
        [_with_time_features(df_train), _with_time_features(df_test)], axis=0
    )
    merged = pd.get_dummies(merged, columns=['hour', 'weekday', 'PULocationID'])

    # Split by position: the first n_train rows came from the training frame.
    return merged.iloc[:n_train], merged.iloc[n_train:]

def conduct_exp(train, test, exp_name):
    """Train Gaussian naive Bayes to predict the drop-off zone and print accuracy.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Splits that contain the feature columns listed below plus the
        ``DOLocationID`` label column. They are passed through
        ``extend_data``, which reads the hour and weekday from the index.
    exp_name : str
        Label prefixed to the printed accuracy line.

    Returns
    -------
    None
        The encoded feature columns and the test accuracy are printed.
    """
    feature_columns = ['PULocationID', 'passenger_count', 'trip_distance', 'fare_amount',
    'tip_amount', 'congestion_surcharge', 'tolls_amount', 'total_amount']
    target_column = 'DOLocationID'

    # Explicit copies: the feature frames are handed to extend_data, which may
    # add columns to them; without the copy that is an assignment onto a
    # column selection of the caller's frame (SettingWithCopy pattern).
    X_train = train[feature_columns].copy()
    X_test = test[feature_columns].copy()
    y_train = train[target_column]
    y_test = test[target_column]

    X_train, X_test = extend_data(X_train, X_test)

    print(X_train.columns)

    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f'{exp_name} Accuracy: {accuracy_score(y_test, y_pred)}')

In [None]:
# Experiment on the second sampled route set (seed 1): build the splits, then
# fit the classifier and print its accuracy.
train, test = preset(df, set_2_pu, set_2_do)
conduct_exp(train, test, 'Set 2')

In [None]:
# Experiment on the first sampled route set (seed 0); note that this
# overwrites the `train` / `test` variables of the Set 2 run.
train, test = preset(df, set_1_pu, set_1_do)
conduct_exp(train, test, 'Set 1')