In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso # Linear Regression with regularization
from sklearn.linear_model import Ridge # regularised linear regression
from sklearn.metrics import mean_squared_error

In [2]:
# !pip install pyarrow

In [3]:
# df = pd.read_parquet('../data/green_tripdata_2021-01.parquet')

In [4]:
# df

In [5]:
# df.info()

In [6]:
# plt.figure(figsize=(10,7))
# sns.heatmap(df == df.isna(), cbar = False, yticklabels = False)
# plt.title("Missing Values in the Dataset")

In [7]:
def read_data(filename, min_duration=1, max_duration=60):
    """Load a trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only the rows whose duration lies in ``[min_duration, max_duration]``
    and casts the pick-up / drop-off location IDs to strings.

    Parameters
    ----------
    filename
        Path handed straight to ``pd.read_parquet``.
    min_duration, max_duration
        Inclusive duration bounds in minutes. The defaults (1 and 60)
        reproduce the previously hard-coded filter.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (replaces a per-row apply).
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the full frame, so the column
    # assignment below does not write to a slice (SettingWithCopyWarning).
    in_range = df['duration'].between(min_duration, max_duration)
    df = df[in_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
# Training frame comes from the 2021-01 file, validation frame from 2021-02.
train_path = '../data/green_tripdata_2021-01.parquet'
val_path = '../data/green_tripdata_2021-02.parquet'

df_train, df_val = read_data(train_path), read_data(val_path)

In [9]:
# Number of rows in the training and validation frames.
df_train.shape[0], df_val.shape[0]

(73908, 61921)

In [10]:
# Feature columns and prediction target.
target = 'duration'
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# One {column: value} dict per row, which is what DictVectorizer consumes.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# The vectorizer is fitted on the training rows only and reused for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

y_train, y_val = df_train[target].values, df_val[target].values

In [22]:
def train_model(lr=None):
    """Fit ``lr`` on the training split and print train / validation RMSE.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and
    ``y_val`` variables; nothing is returned.

    Parameters
    ----------
    lr
        Estimator exposing ``fit`` and ``predict``. A fresh ``Ridge()`` is
        created when the argument is omitted.
    """
    # A `Ridge()` default in the signature would be built once at definition
    # time and then shared (and re-fitted) by every call without an argument.
    if lr is None:
        lr = Ridge()

    # For rapid and clean experimentation
    lr.fit(X_train, y_train)

    # The root is taken explicitly: the `squared=False` keyword is deprecated
    # in recent scikit-learn releases and later removed. The printed label now
    # says "rmse" because that is the quantity being reported.
    for split, features, actual in (('train', X_train, y_train),
                                    ('val', X_val, y_val)):
        rmse = mean_squared_error(actual, lr.predict(features)) ** 0.5
        print(split + ' rmse', rmse)

<IPython.core.display.Javascript object>

In [21]:
# Fit and score a LinearRegression created with default settings.
train_model(LinearRegression())

train mse 9.775464279699293
val mse 10.473870318481026


In [24]:
# Fit and score a Lasso created with default settings.
train_model(Lasso())

train mse 11.562050466293025
val mse 12.212583224318818


In [25]:
# Fit and score a Ridge created with default settings.
train_model(Ridge())

<IPython.core.display.Javascript object>

train mse 10.236288428970283
val mse 10.860753390355725


**Creating duration from dropoff and pickup times**

In [None]:
# Load the raw trips here: every cell below works on `df`, and the only
# earlier cell that reads it is commented out, so on a fresh kernel `df`
# would otherwise be undefined.
df = pd.read_parquet('../data/green_tripdata_2021-01.parquet')

# Trip duration in minutes, converted from timedelta with the vectorised
# `.dt` accessor instead of a per-row apply.
trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
df['duration'] = trip_time.dt.total_seconds() / 60

In [None]:
# Summary statistics of the duration column at the chosen percentiles.
df['duration'].describe(percentiles=[0.25, 0.46, 0.97, 0.98, 0.99])

**Exploring rides that are under 1 minute**

In [None]:
# short_df = df[df.duration<1]
# short_df

In [None]:
# sns.displot(short_df.duration)

In [None]:
# sns.scatterplot(x = short_df.duration, y = short_df.trip_distance)

In [None]:
# sns.scatterplot(x = short_df.duration, y = short_df.fare_amount)

In [None]:
# short_df.trip_type.value_counts()
# All of them are from dispatch

In [None]:
# Filtering dataset by taking only rides from dispatch
# df = df[df.trip_type == 2]
# df

In [None]:
# df.duration.describe(percentiles=[0.25,0.46,0.97,0.98,0.99])

**Filtering rides that are too long and too short in duration**

In [None]:
# Cleaning: keep only trips lasting between 1 and 60 minutes (inclusive).
# .copy() makes the filtered frame independent of the original, so later
# column assignments do not write to a slice (SettingWithCopyWarning).
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
df.info()

In [None]:
# Distribution of trip duration after filtering.
sns.displot(df['duration'])

In [None]:
# Duration against fare amount. There are a few rides with negative fares.
sns.scatterplot(x=df['duration'], y=df['fare_amount'])

In [None]:
# Duration against trip distance.
sns.scatterplot(x=df['duration'], y=df['trip_distance'])

**Selecting features to build a regression model to predict duration**

In [None]:
# Model input columns, split by kind.
numerical = ['trip_distance']
categorical = ['PULocationID', 'DOLocationID']

In [None]:
# Cast the categorical columns to str by rebinding `df` to a new frame.
# Assigning the columns in place on a filtered frame can raise
# SettingWithCopyWarning; DataFrame.astype returns a fresh object instead.
df = df.astype({column: str for column in categorical})

In [None]:
# df[categorical + numerical].head().to_dict(orient='records')

In [None]:
# One {column: value} dict per row for the selected feature columns.
train_dicts = df.loc[:, categorical + numerical].to_dict(orient='records')

In [None]:
# Convert the list of row dicts into a feature matrix.
# fit_transform both fits the vectorizer on train_dicts and returns the
# transformed training matrix.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [None]:
# Display the vectorized training features.
X_train

In [None]:
# Each pick-up and drop-off location becomes its own feature (i.e. one-hot encoded)
# dv.feature_names_

In [None]:
# Regression target: the duration column as an array.
target = 'duration'
y_train = df[target].to_numpy()

In [None]:
# Training: fit a LinearRegression on the vectorized features and durations.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# Predictions on the same rows the model was fitted on (training data).
y_pred = lr.predict(X_train)

In [None]:
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is
# its documented replacement. The labels only become visible once a legend
# is drawn, so one is added explicitly.
fig, ax = plt.subplots()
sns.histplot(y_pred, label='Prediction', kde=True, stat='density', ax=ax)
sns.histplot(y_train, label='Actual', kde=True, stat='density', ax=ax)
ax.set(xlabel='duration (minutes)', ylabel='density',
       title='Predicted vs. actual duration (training data)')
ax.legend();

In [None]:
# RMSE on the training data. The root is taken explicitly because the
# `squared=False` keyword is deprecated in recent scikit-learn releases
# and later removed.
mean_squared_error(y_train, y_pred) ** 0.5

In [None]:
# Training with Lasso
las = Lasso()
las.fit(X_train, y_train)

y_pred = las.predict(X_train)

# `sns.distplot` is deprecated; histplot with kde=True / stat='density' is
# its documented replacement. A legend is drawn so the labels are visible.
fig, ax = plt.subplots()
sns.histplot(y_pred, label='Prediction', kde=True, stat='density', ax=ax)
sns.histplot(y_train, label='Actual', kde=True, stat='density', ax=ax)
ax.set(xlabel='duration (minutes)', ylabel='density',
       title='Lasso: predicted vs. actual duration (training data)')
ax.legend()

# RMSE on the training data. The root is taken explicitly because the
# `squared=False` keyword is deprecated in recent scikit-learn releases
# and later removed.
mean_squared_error(y_train, y_pred) ** 0.5