In [1]:
import pandas as pd

import matplotlib.pyplot as plt
import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [3]:
# Load the January 2021 FHV trip records used for training.
# Data source: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

df = pd.read_parquet('Data/fhv_tripdata_2021-01.parquet')


In [4]:
# Check the DataFrame's dimensions as (rows, columns).

df.shape

(1154112, 7)

In [5]:
# Inspect the first rows of the DataFrame to see the column names and sample values.
df.head()


Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [6]:
# Add the drop-off month, pickup month and trip duration (in minutes) in one step.
df = df.assign(
    dropOff_month=df['dropOff_datetime'].dt.month,
    pickup_month=df['pickup_datetime'].dt.month,
    duration=(df['dropOff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60,
)

In [7]:
# Mean trip duration in minutes over all rows (no duration filtering has been applied yet).
f"The average trip duration in January is {df['duration'].mean()}"

'The average trip duration in January is 19.1672240937939'

In [9]:
# Share of missing values per column, expressed as a percentage (0-100) of all rows.
null_df = df.isna().sum() * 100 / len(df)
f"The fractions of missing values for the pickup location ID is {null_df['PUlocationID']}"

'The fractions of missing values for the pickup location ID is 83.03067639882438'

In [13]:
# helper function
def prepare_data(df, train=True, model=None):
    """Turn raw trip records into one-hot location features and a duration target.

    Trips are kept only when their duration lies between 1 and 60 minutes
    (inclusive). Missing pickup/drop-off location IDs are replaced with -1 and
    both IDs are converted to strings so that a ``DictVectorizer`` one-hot
    encodes them. The caller's DataFrame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns ``pickup_datetime`` and
        ``dropOff_datetime`` plus ``PUlocationID`` and ``DOlocationID``.
    train : bool, default True
        When True, fit a new ``DictVectorizer`` on ``df``. When False, reuse
        the already fitted vectorizer passed as ``model``.
    model : object, optional
        Fitted vectorizer exposing ``transform(list_of_dicts)``. Required when
        ``train`` is False; ignored otherwise.

    Returns
    -------
    tuple
        ``(vectorizer, feature_matrix, durations)`` when ``train`` is True,
        otherwise ``(feature_matrix, durations)``. ``durations`` is an array of
        trip durations in minutes for the rows that passed the filter.

    Raises
    ------
    ValueError
        If ``train`` is False and no ``model`` is supplied.
    """
    # Fail early with a clear message instead of an AttributeError on None later.
    if not train and model is None:
        raise ValueError('A fitted vectorizer must be passed as `model` when train=False')

    total_records = df.shape[0]
    print(f'Total records are {total_records}')

    # Work on a standalone Series so the caller's frame does not gain extra columns.
    duration = (df['dropOff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60

    # The message is month-agnostic: the function is also run on non-January files.
    print(f"The average trip duration is {duration.mean()}")

    keep = (duration >= 1) & (duration <= 60)
    # Count kept rows from the boolean mask rather than materialising a filtered copy.
    print(f'Total records dropped are {total_records - int(keep.sum())}')

    categorical = ['PUlocationID', 'DOlocationID']

    # -1 marks an unknown location; string values make DictVectorizer treat the
    # IDs as categories (one column per distinct value) rather than as numbers.
    features = df.loc[keep, categorical].fillna(-1).astype(str)
    records = features.to_dict(orient='records')
    target = duration[keep].values

    if train:
        dv = DictVectorizer()
        dv_matrix = dv.fit_transform(records)

        print(f'The dimensionality of this matrix is {dv_matrix.shape[1]}')

        return dv, dv_matrix, target

    dv_matrix = model.transform(records)

    return dv_matrix, target

In [14]:
# Prepare the training set: fits the DictVectorizer and returns it together with
# the encoded feature matrix and the trip-duration target.

dvmodel, dv_matrix, y_train = prepare_data(df, train=True)

Total records are 1154112
The average trip duration in January is 19.1672240937939
Total records dropped are 44286
The dimensionality of this matrix is 525


In [15]:
# Fit a linear regression that predicts trip duration from the encoded training features.

lr = LinearRegression()
lr.fit(dv_matrix, y_train)

LinearRegression()

In [16]:
# Training-set RMSE: the predictions are made on the same matrix the model was fitted on.
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the square root of the MSE is taken explicitly; this works on every version.

y_pred = lr.predict(dv_matrix)
mean_squared_error(y_train, y_pred) ** 0.5

10.528519107212672

In [17]:
# Load the February 2021 FHV trip records used for validation.

valid_data = pd.read_parquet('Data/fhv_tripdata_2021-02.parquet')

In [18]:
# Prepare the validation set, reusing the vectorizer that was fitted on the training data.

valid_dv_matrix, y_valid = prepare_data(valid_data, train=False, model=dvmodel)

Total records are 1037692
The average trip duration in January is 20.70698622520125
Total records dropped are 47579
