In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
import pickle

In [64]:
# Read the 2021-01 trip data file and show how many rows it contains.
filename = 'data/fhv_tripdata_2021-01.parquet'
df = pd.read_parquet(filename)
len(df)

1154112

Number of records is 1154112 

In [65]:
# Trip duration in minutes. The vectorised ``.dt`` accessor converts the whole
# timedelta column at once instead of calling a Python lambda for every row.
df['duration'] = (df['dropOff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60
df['duration'].mean()

19.1672240937939

Average trip duration is 19.17 minutes

In [66]:
# Count trips shorter than 1 minute or longer than 60 minutes by summing the
# boolean mask instead of building the filtered frame.
int((df['duration'].lt(1) | df['duration'].gt(60)).sum())

44286

Number of outlier records to drop is 44286

In [67]:
# Keep trips lasting between 1 and 60 minutes (both bounds inclusive), narrow
# the frame to the two location IDs plus the duration, and replace missing
# values with the -1 placeholder.
df = df.loc[
    df['duration'].between(1, 60),
    ['PUlocationID', 'DOlocationID', 'duration'],
].fillna(value=-1)

# Share of rows whose pickup location ID equals the -1 placeholder.
df['PUlocationID'].eq(-1).mean()

0.8352732770722617

Fraction of missing values for the pickup location ID is about 83.5%

In [68]:
def read_dataframe(filename):
    """Load a trip-data parquet file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str or path-like
        Parquet file that provides at least the columns ``pickup_datetime``,
        ``dropOff_datetime``, ``PUlocationID`` and ``DOlocationID``.

    Returns
    -------
    pandas.DataFrame
        Columns ``PUlocationID``, ``DOlocationID`` and ``duration``.
        ``duration`` is the trip length in minutes; only rows with
        ``1 <= duration <= 60`` are kept and the original row index is
        preserved. Missing location IDs are replaced by ``-1`` and both ID
        columns are then cast to ``str``.
    """
    categorical = ['PUlocationID', 'DOlocationID']

    trips = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion; avoids running a Python
    # lambda once per row through ``.apply``.
    elapsed = trips['dropOff_datetime'] - trips['pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Both bounds are inclusive; rows with a missing duration compare False
    # and are dropped.
    in_range = (trips['duration'] >= 1) & (trips['duration'] <= 60)
    trips = trips.loc[in_range, categorical + ['duration']].fillna(value=-1)

    # IDs are cast to strings after the fill, so the placeholder is cast too.
    trips[categorical] = trips[categorical].astype(str)
    return trips

In [69]:
# Load the 2021-01 file as the training frame and the 2021-02 file as the
# validation frame, in that order.
df_train, df_val = (
    read_dataframe(path)
    for path in (
        './data/fhv_tripdata_2021-01.parquet',
        './data/fhv_tripdata_2021-02.parquet',
    )
)

In [70]:
# Feature columns fed to the vectorizer, and the regression target.
categorical = ['PUlocationID', 'DOlocationID']
target = 'duration'

# One {column: value} dict per row for each split.
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# The vectorizer is fitted on the training rows only; the validation rows are
# transformed with that already-fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Targets as plain NumPy arrays.
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()


In [71]:
# Number of columns in the training feature matrix.
X_train.shape[-1]

525

Dimensionality of this matrix is 525

In [72]:
# Fit a linear regression on the training matrix.
l = LinearRegression()
l.fit(X_train, y_train)

# RMSE on the training data. The square root is taken explicitly because the
# `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in
# 1.6; arguments follow the documented (y_true, y_pred) order.
y_pred = l.predict(X_train)
mean_squared_error(y_train, y_pred) ** 0.5

10.528519107204893

RMSE on train is 10.53

In [73]:
# RMSE on the validation data. The square root is taken explicitly because the
# `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in
# 1.6; arguments follow the documented (y_true, y_pred) order.
y_pred = l.predict(X_val)
mean_squared_error(y_val, y_pred) ** 0.5

11.014283140085958

RMSE on validation is 11.01