In [None]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [10]:
def df_prepare(df):
    """Return a new frame with parsed timestamps and a ``duration`` column.

    The pickup and dropoff columns are converted with ``pd.to_datetime`` and
    the trip duration is derived as (dropoff - pickup) expressed in minutes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` (the two timestamp
        columns parsed) plus a trailing ``duration`` column in minutes.
        The input frame is left untouched, so re-running a cell that calls
        this function always starts from the same raw data.
    """
    pickup = pd.to_datetime(df['tpep_pickup_datetime'])
    dropoff = pd.to_datetime(df['tpep_dropoff_datetime'])
    # total_seconds() / 60 keeps fractional minutes instead of truncating.
    duration_minutes = (dropoff - pickup).dt.total_seconds() / 60
    # assign() builds a new DataFrame rather than writing into the caller's.
    return df.assign(
        tpep_pickup_datetime=pickup,
        tpep_dropoff_datetime=dropoff,
        duration=duration_minutes,
    )

def filter_duration(df, min_duration=1, max_duration=60):
    """Select the rows whose ``duration`` falls inside a closed interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``duration`` column.
    min_duration, max_duration : number
        Lower and upper bounds; both ends are inclusive.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index kept).
    """
    # between() is inclusive on both ends by default, i.e. min <= x <= max.
    within_bounds = df['duration'].between(min_duration, max_duration)
    return df.loc[within_bounds].copy()

def transform_features(df, dv=None, fit=True):
    """Vectorize the pickup/dropoff location IDs of ``df``.

    Both ``PULocationID`` and ``DOLocationID`` are cast to ``str`` and turned
    into one dict per row, which is then passed to a ``DictVectorizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PULocationID`` and ``DOLocationID``. It is not
        modified; the string cast is applied to a two-column selection.
    dv : DictVectorizer, optional
        An already fitted vectorizer. Required when ``fit`` is False; ignored
        when ``fit`` is True (a fresh vectorizer is always created then).
    fit : bool, default True
        If True, fit a new vectorizer on ``df``; otherwise reuse ``dv``.

    Returns
    -------
    (X, dv) when ``fit`` is True, otherwise just X, where X is the feature
    matrix produced by the vectorizer.

    Raises
    ------
    ValueError
        If ``fit`` is False and no vectorizer was supplied.
    """
    if not fit and dv is None:
        # Fail early with a clear message instead of an AttributeError on None.
        raise ValueError(
            "transform_features(fit=False) requires a fitted vectorizer in `dv`"
        )

    location_columns = ['PULocationID', 'DOLocationID']
    # Cast on the selection only, so the caller's frame keeps its dtypes and
    # no SettingWithCopyWarning is raised when `df` is itself a slice.
    dicts = df[location_columns].astype(str).to_dict(orient='records')

    if fit:
        dv = DictVectorizer()
        X = dv.fit_transform(dicts)
        return X, dv
    return dv.transform(dicts)

def train_and_evaluate(X, y):
    """Fit a linear regression on (X, y) and report its RMSE on that same data.

    Parameters
    ----------
    X : feature matrix accepted by ``LinearRegression.fit``.
    y : target values aligned with the rows of ``X``.

    Returns
    -------
    (model, rmse)
        The fitted ``LinearRegression`` and the root-mean-squared error of its
        predictions on ``X``. This is a training error, not a held-out score.
    """
    model = LinearRegression()
    model.fit(X, y)
    preds = model.predict(X)
    # The `squared=False` keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6, where passing it raises TypeError. Taking the square
    # root of the plain MSE gives the same value for a single target and
    # works on every scikit-learn version.
    rmse = mean_squared_error(y, preds) ** 0.5
    return model, rmse

In [None]:
# Load the trip records used for training (the file name indicates January 2023).
df = pd.read_parquet('yellow_tripdata_2023-01.parquet')
# shape[1] is the column count of the freshly loaded frame.
print("Q1: Number of columns: ", df.shape[1])

Question 1, number of columns:  19


In [None]:
# Parse the timestamps and derive the per-trip duration in minutes.
df_train = df_prepare(df)
# Sample standard deviation; ddof=1 is the pandas default, spelled out here.
std_january_duration = df_train['duration'].std(ddof=1)
print(f"Q2; Standard Deviation: {std_january_duration:.2f} minutes")

Question 2, standard deviation: 42.59 minutes


Removing outliers

In [None]:
# Keep trips lasting between 1 and 60 minutes inclusive; the bounds are
# written out here and match the helper's defaults.
df_train_filtered = filter_duration(df_train, min_duration=1, max_duration=60)
# Share of rows that survived the filter, as a percentage.
print(f"Q3: Fraction of records: {(len(df_train_filtered) *100 / len(df_train)):.2f}%")

Question 3, Fraction of records left after removing outliers: 98.12%


In [None]:
# Fit a new vectorizer on the filtered training rows; `dv` is kept so the
# validation data can later be encoded with the same feature space.
X_train, dv = transform_features(df=df_train_filtered, dv=None, fit=True)
# The second axis of the matrix is the number of encoded features.
print(f"Q4: Dimensionality of Feature Matrix: {X_train.shape[1]}")

Question 4, dimensionality of feature matrix: 515


In [None]:
# Target vector: trip duration in minutes for the filtered training rows.
y_train = df_train_filtered['duration'].to_numpy()
# The RMSE reported here is measured on the training data itself.
model, rmse_train = train_and_evaluate(X=X_train, y=y_train)
print(f"Q5: RMSE - train: {rmse_train:.2f} minutes")

Question 5, RMSE on train: 7.65 minutes




In [None]:
# Validation: run the second file (named 2023-02) through the same
# preparation and filtering steps that were applied to the training data.
val = pd.read_parquet('yellow_tripdata_2023-02.parquet')
df_val = df_prepare(val)
df_val_filtered = filter_duration(df_val)
# Reuse the vectorizer fitted on the training rows (fit=False) so the
# validation matrix has the same columns the model was trained on.
X_val = transform_features(df_val_filtered, dv=dv, fit=False)
y_val = df_val_filtered['duration'].values
y_val_pred = model.predict(X_val)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# the square root of the plain MSE is the same RMSE on every version.
rmse_val = mean_squared_error(y_val, y_val_pred) ** 0.5
print(f"Q6: RMSE - val: {rmse_val:.2f} minutes")

Question 6, validation RMSE: 7.81 minutes


