In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

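Answers:
1. Number of columns: 19
2. Standard deviation of trip duration (minutes): 46.45
3. Share of trips kept after dropping outliers: 98%
4. Feature-matrix columns after one-hot encoding: 515
5. RMSE on the training data (January 2022): 6.99
6. RMSE on the validation data (February 2022): 7.79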


In [2]:
# Load the January 2022 yellow-taxi trips and report the width of the frame.
df = pd.read_parquet('./data/yellow_tripdata_2022-01.parquet')
print(f"Number of columns: {df.shape[1]}")

Number of columns: 19


In [3]:
# Trip duration in minutes. The vectorised .dt accessor converts the whole
# timedelta column at once instead of calling a Python lambda for every row.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
print(f"Std for duration is: {df.duration.std():.2f}")

Std for duration is: 46.45


In [4]:
# Share of trips lasting between 1 and 60 minutes; between() includes both
# endpoints by default, and the mean of the boolean mask is that share.
fraction_left = df.duration.between(1, 60).mean()
print(f"Fraction left after dropping outliers: {fraction_left * 100:.2f}")

Fraction left after dropping outliers: 98.28


In [5]:
# Keep only trips lasting 1 to 60 minutes (inclusive). The explicit .copy()
# makes the filtered frame independent of the original, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

# Location IDs are cast to strings so that DictVectorizer one-hot encodes them
# instead of treating them as numeric features.
categorical = ['PULocationID', 'DOLocationID']
df[categorical] = df[categorical].astype(str)

dv = DictVectorizer()

# One dict per trip: {'PULocationID': ..., 'DOLocationID': ...}
train_dicts = df[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
print(f"Dimensionality after OHE: {X_train.shape[1]}")

Dimensionality after OHE: 515


In [7]:
def read_data(file):
    """Load a taxi-trip parquet file and keep trips lasting 1 to 60 minutes.

    Parameters
    ----------
    file : str or path-like
        Passed unchanged to ``pd.read_parquet``. The data must contain the
        ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` columns.

    Returns
    -------
    pandas.DataFrame
        The loaded rows plus a ``duration`` column (drop-off minus pick-up, in
        minutes), restricted to rows where ``1 <= duration <= 60``. The result
        is an independent copy, so callers may assign columns on it without
        triggering SettingWithCopyWarning.
    """
    df = pd.read_parquet(file)

    # Vectorised conversion; avoids one Python-level lambda call per row.
    elapsed = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Both bounds are inclusive.
    keep = (df.duration >= 1) & (df.duration <= 60)
    return df[keep].copy()

def preprocess(df):
    """Build feature records and the target array from a trips DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PULocationID``, ``DOLocationID`` and ``duration``
        columns. The frame is not modified.

    Returns
    -------
    tuple
        ``(train_dicts, y_train)``: ``train_dicts`` is a list with one dict per
        row holding the two location IDs as strings, and ``y_train`` is the
        ``duration`` column as an array.
    """
    categorical = ['PULocationID', 'DOLocationID']
    target = 'duration'

    # Cast on a derived frame instead of assigning back into ``df``: the
    # caller's frame keeps its original dtypes, and no SettingWithCopyWarning
    # is raised when ``df`` is a filtered slice of another frame.
    train_dicts = df[categorical].astype(str).to_dict(orient='records')
    y_train = df[target].values

    return train_dicts, y_train

In [8]:
# Fit the baseline on January 2022 trips.
train_df = read_data('./data/yellow_tripdata_2022-01.parquet')
train_dicts, y_train = preprocess(train_df)

# The vectorizer is fitted here and reused unchanged for the validation data.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE on the training data. The square root is taken explicitly because the
# `squared=False` argument is deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_train, y_pred) ** 0.5

6.986190769549357

In [9]:
# Evaluate on February 2022 trips, using the vectorizer and model fitted on
# the training data (transform only, no refitting).
val_df = read_data('./data/yellow_tripdata_2022-02.parquet')
val_dicts, y_val = preprocess(val_df)
X_val = dv.transform(val_dicts)

y_pred = lr.predict(X_val)

# RMSE on the validation data. The square root is taken explicitly because the
# `squared=False` argument is deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5

7.786407870395467