In [54]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [55]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error

In [63]:
def load(file):
    """Read a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only trips lasting between 1 and 60 minutes inclusive, and casts the
    pick-up / drop-off location IDs to strings. Diagnostic information is
    printed along the way.

    Parameters
    ----------
    file : str or path-like
        Location of the parquet file. It must provide the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    df : pandas.DataFrame
        The filtered trips, including the new ``duration`` column.
    data_dict : list of dict
        One ``{'PULocationID': str, 'DOLocationID': str}`` record per
        remaining trip.
    """
    df = pd.read_parquet(file)
    print(df.columns)
    print(df.columns.size)
    print('shape at beginning ', df.shape)

    # Vectorised conversion to minutes; avoids one Python-level call per row.
    df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
    print('std deviation of duration', df.duration.std())

    # Build the 1-60 minute mask once and reuse it for the report and the filter.
    in_range = (df.duration >= 1) & (df.duration <= 60)
    total = len(df)
    # An empty file has no meaningful fraction; report NaN instead of dividing by zero.
    fraction_kept = round(int(in_range.sum()) / total, 2) if total else float('nan')
    # This is the share of trips that survive the filter, not the share of outliers.
    print('fraction of records kept', fraction_kept)

    # Copy so the dtype change below acts on an independent frame rather than
    # on a slice of the unfiltered data (avoids SettingWithCopyWarning).
    df = df[in_range].copy()
    print('shape after removing outliers', df.shape)

    # Location IDs are cast to str so each record carries them as string values.
    categories = ['PULocationID', 'DOLocationID']
    df[categories] = df[categories].astype(str)
    data_dict = df[categories].to_dict(orient='records')
    print('categories shape ' , df[categories].shape)
    return df, data_dict

# Load training data (January 2023)
### Q1. Downloading the data
### Q2. Computing duration
### Q3. Dropping outliers
### Q4. One-hot encoding

In [65]:
# Training set: `load` returns the duration-filtered frame together with the
# PULocationID/DOLocationID records that are later fed to the vectorizer.
df_train, train_data_dict = load('../data/yellow_tripdata_2023-01.parquet')

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')
19
shape at beginning  (3066766, 19)
std deviation of duration 42.59435124195458
percentage of outliers 0.98
shape after removing outliers (3009173, 20)
categories shape  (3009173, 2)


## Fit the vectorizer on the training data

In [69]:
# Fit the vectorizer on the training records and encode them in one step.
# `dv` is kept so the same fitted mapping can be reused on other data.
dv = DictVectorizer()
x_train = dv.fit_transform(train_data_dict)
# Displayed as the cell output: (number of trips, number of encoded features).
x_train.shape

(3009173, 515)

# Load validation data (February 2023)

In [67]:
# Validation set: same preparation as the training data, applied to a second file.
df_val, val_data_dict = load('../data/yellow_tripdata_2023-02.parquet')

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'Airport_fee'],
      dtype='object')
19
shape at beginning  (2913955, 19)
std deviation of duration 42.84210176105113
percentage of outliers 0.98
shape after removing outliers (2855951, 20)
categories shape  (2855951, 2)


## Apply the fitted vectorizer to the validation data

In [70]:
# transform (not fit_transform): reuse the mapping already fitted on the training
# records so the validation matrix has the same columns as x_train.
x_val = dv.transform(val_data_dict)

## Q5. Training the model

In [76]:
# Regression target: trip duration in minutes for every training row.
y_train = df_train.duration.to_numpy()

In [74]:
# Fit the linear model on the training features, then predict on those same
# rows to obtain the in-sample predictions.
lr = LinearRegression().fit(x_train, y_train)
y_pred = lr.predict(X=x_train)

In [75]:
# RMSE on the training data (y_pred holds the training-set predictions here).
root_mean_squared_error(y_true=y_train, y_pred=y_pred)

7.649261929201487

## Q6. Evaluating the model

In [77]:
# Validation target and predictions; note that y_pred is overwritten here and
# no longer refers to the training-set predictions.
y_val = df_val.duration.to_numpy()
y_pred = lr.predict(X=x_val)

In [78]:
# RMSE on the validation data (y_pred holds the validation predictions here).
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

7.811819793542861