In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [3]:
def read_dataframe(filename):
   
    df = pd.read_parquet(filename)

    print(f"No. of Columns in the dataset",df.shape[1])

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    
    return df

In [4]:
df_train = read_dataframe('../data/yellow_tripdata_2022-01.parquet')
df_val = read_dataframe('../data/yellow_tripdata_2022-02.parquet')

No. of Columns in the dataset 19
No. of Columns in the dataset 19


In [5]:
print(f"standard deviation for duration:",df_train['duration'].std())

standard deviation for duration: 46.44530513776499


In [6]:
# filtering outlier in the duration
# df = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)]

# categorical = ['PULocationID', 'DOLocationID']
# df[categorical] = df[categorical].astype(str)

In [7]:
def outlier(df):
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    return df

In [8]:
df_train_cleaned = outlier(df_train)
df_val_cleaned = outlier(df_val)

In [9]:
print(f"Fraction of records left in training dataset after dropping outliers:", df_train_cleaned.shape[0]/df_train.shape[0])

Fraction of records left in training dataset after dropping outliers: 0.9827547930522406


In [10]:
categorical = ['PULocationID', 'DOLocationID']
df_train_cleaned[categorical] = df_train_cleaned[categorical].astype(str)
df_val_cleaned[categorical] = df_val_cleaned[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_cleaned[categorical] = df_train_cleaned[categorical].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val_cleaned[categorical] = df_val_cleaned[categorical].astype(str)


In [15]:
dv = DictVectorizer()

train_dicts = df_train_cleaned[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val_cleaned[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)



In [16]:
target = 'duration'
y_train = df_train_cleaned[target].values
y_val = df_val_cleaned[target].values

In [17]:
print(f"X_train Dimensionality after OHE:", X_train.shape[1])

X_train Dimensionality after OHE: 515


In [19]:
## training Linear Regression Model
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

6.986190135963553

In [20]:
y_pred_val = lr.predict(X_val)
mean_squared_error(y_val, y_pred_val, squared=False)

7.786389499163578