In [1]:
import pandas as pd
import sklearn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
def read_dataframe(filename, categorical=None):
    """Load a trip parquet file and add a filtered ``duration`` column.

    Parameters
    ----------
    filename : str or path-like
        Location handed to ``pd.read_parquet`` (read with the ``fastparquet`` engine).
    categorical : str, list of str, or None, optional
        Column name(s) to cast to ``str``. ``None`` or an empty value (such as
        the former default ``''``) skips the cast instead of raising ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A copy holding only the rows whose ``duration`` lies between 1 and 60
        (inclusive). ``duration`` is drop-off minus pick-up time, in minutes,
        rounded to two decimals.
    """
    df = pd.read_parquet(filename, engine='fastparquet')

    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = (elapsed.dt.total_seconds() / 60).round(2)

    # `between` is inclusive on both ends, i.e. 1 <= duration <= 60.
    df = df[df['duration'].between(1, 60)].copy()

    # Only cast when columns were actually requested; indexing with '' would fail.
    if categorical is not None and len(categorical) > 0:
        df[categorical] = df[categorical].astype(str)
    return df

In [3]:
def train_pipe(df_train, df_val, categorical, target):
    """Vectorize the categorical columns of both splits and extract the training target.

    A single ``DictVectorizer`` is fitted on the training records and then
    applied, without refitting, to the validation records.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Training and validation frames; both must contain ``categorical``.
    categorical : list of str
        Columns turned into per-row dicts and fed to the vectorizer.
    target : str
        Column of ``df_train`` returned as the training target.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train)``: the two vectorized matrices and the
        training target values. The validation target is not returned.
    """
    def as_records(frame):
        # One {column: value} dict per row, limited to the categorical columns.
        return frame[categorical].copy().to_dict('records')

    dv = DictVectorizer(sparse=True)
    train_matrix = dv.fit_transform(as_records(df_train))
    val_matrix = dv.transform(as_records(df_val))

    return train_matrix, val_matrix, df_train[target].values

In [4]:
# Training split: the yellow_tripdata parquet file for 2025-01.
filename_train = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet'

# Columns that read_dataframe casts to str and train_pipe later vectorizes.
categorical = ['PULocationID', 'DOLocationID']

# Reads the file and keeps only trips whose duration is between 1 and 60 minutes.
df_train = read_dataframe(filename_train, categorical)



In [5]:
# Validation split: the yellow_tripdata parquet file for 2024-02, loaded with the
# same duration filter and string casting as the training split.
# NOTE(review): the training file is dated 2025-01 while this one is 2024-02 —
# confirm the year mismatch is intentional (validation data predates training data).
filename_val = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet'
df_val = read_dataframe(filename_val, categorical)


In [6]:
# Preview the first rows of the validation frame, including the derived `duration` column.
df_val.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2.0,2024-02-01 00:04:45,2024-02-01 00:19:58,1.0,4.39,1.0,N,68.0,236.0,1.0,20.5,1.0,0.5,1.28,0.0,1.0,26.78,2.5,0.0,15.22
1,2.0,2024-02-01 00:56:31,2024-02-01 01:10:53,1.0,7.71,1.0,N,48.0,243.0,1.0,31.0,1.0,0.5,9.0,0.0,1.0,45.0,2.5,0.0,14.37
2,2.0,2024-02-01 00:07:50,2024-02-01 00:43:12,2.0,28.69,2.0,N,132.0,261.0,2.0,70.0,0.0,0.5,0.0,6.94,1.0,82.69,2.5,1.75,35.37
3,1.0,2024-02-01 00:01:49,2024-02-01 00:10:47,1.0,1.1,1.0,N,161.0,163.0,1.0,9.3,3.5,0.5,2.85,0.0,1.0,17.15,2.5,0.0,8.97
4,1.0,2024-02-01 00:37:35,2024-02-01 00:51:15,1.0,2.6,1.0,N,246.0,79.0,2.0,15.6,3.5,0.5,0.0,0.0,1.0,20.6,2.5,0.0,13.67


In [None]:
# Build the feature matrices for both splits and the training target in one call.
# NOTE(review): train_pipe does not return the validation target, so it has to be
# taken from df_val['duration'] separately before any validation scoring.
X_train, X_val, y_train = train_pipe(df_train, df_val, categorical, 'duration')

In [None]:
# NOTE(review): this cell repeats the first step of train_pipe at notebook scope —
# candidate for removal once the function call above is trusted.
# Keep only the categorical columns of each split.
cat_train = df_train[categorical].copy()
cat_val = df_val[categorical].copy()



In [None]:
# One {column: value} dict per row, the record format passed to DictVectorizer below.
# Depends on cat_train / cat_val from the previous cell.
train_dicts = cat_train.to_dict('records')
val_dicts = cat_val.to_dict('records')



In [None]:
# Fit a vectorizer at notebook scope; this overwrites the X_train returned by train_pipe.
# NOTE(review): X_val is not recomputed here, so it still comes from the separate
# vectorizer fitted inside train_pipe — confirm this duplication is intended.
vectorizer = DictVectorizer(sparse=True)
X_train = vectorizer.fit_transform(train_dicts)


In [17]:
# Fit a plain linear regression on the training matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score on the validation split. Predictions come from X_val, so they must be
# compared with the validation target, not y_train (different rows and length).
y_val = df_val['duration'].values
y_pred = lr.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
rmse

In [None]:
# Combined pickup/drop-off feature: one "<PULocationID>_<DOLocationID>" string per trip.
# The `+ '_' +` concatenation relies on both ID columns having been cast to str by
# read_dataframe. Each frame is built from its OWN columns: assigning a Series taken
# from df_train into df_val would align on the index and give validation rows
# training values (or NaN where the index has no match).
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']