In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Load and clean the dataset
def load_and_clean(parquet_url, min_minutes=1, max_minutes=60):
    """Read a trip-record parquet file and keep trips of plausible duration.

    A ``trip_duration_minutes`` column is added, computed as the difference
    between ``tpep_dropoff_datetime`` and ``tpep_pickup_datetime`` in minutes.

    Parameters
    ----------
    parquet_url : str
        Path or URL handed to ``pd.read_parquet`` (pyarrow engine).
    min_minutes, max_minutes : float, optional
        Inclusive bounds on the trip duration. The defaults (1 and 60)
        reproduce the original fixed filter.

    Returns
    -------
    pandas.DataFrame
        The rows whose duration lies within ``[min_minutes, max_minutes]``,
        including the added ``trip_duration_minutes`` column.
    """
    df = pd.read_parquet(parquet_url, engine='pyarrow')

    # Coerce both timestamp columns to datetimes before subtracting them
    pickup = pd.to_datetime(df['tpep_pickup_datetime'])
    dropoff = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['tpep_pickup_datetime'] = pickup
    df['tpep_dropoff_datetime'] = dropoff
    df['trip_duration_minutes'] = (dropoff - pickup).dt.total_seconds() / 60

    # `between` is inclusive on both ends, matching `>= min` and `<= max`
    in_range = df['trip_duration_minutes'].between(min_minutes, max_minutes)
    return df[in_range]

In [3]:
# Monthly trip-record files: 2025-01 is used for training, 2025-02 for validation
TRAIN_URL = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    "yellow_tripdata_2025-01.parquet"
)
VAL_URL = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    "yellow_tripdata_2025-02.parquet"
)

In [4]:
# Download and filter each month's trips (training set first, then validation)
train_df = load_and_clean(parquet_url=TRAIN_URL)
val_df = load_and_clean(parquet_url=VAL_URL)

In [5]:
# Keep only the pickup/drop-off location IDs, cast them to strings, and turn
# every row into a {column: value} dict for the vectorizer
train_df_selected = (
    train_df[['PULocationID', 'DOLocationID']]
    .astype(str)
    .to_dict(orient='records')
)

In [6]:
# One-hot encode the location dicts; the fitted vectorizer is kept in `dv`
# so the same encoding can be applied to other data later
dv = DictVectorizer()
X_train = dv.fit_transform(X=train_df_selected)

In [8]:
# Regression target: trip duration in minutes, as a plain array
y_train = train_df['trip_duration_minutes'].values

In [7]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [9]:
# Fit a linear regression on the one-hot encoded location features
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score the model on the data it was fitted on.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises TypeError on current releases.
y_pred = lr.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print(f"RMSE on training data: {rmse:.2f}")

RMSE on training data: 7.71


In [10]:
# Encode the February data with the DictVectorizer that was fitted on the
# training data (transform only, no re-fitting)
X_val = dv.transform(
    val_df[['PULocationID', 'DOLocationID']].astype(str).to_dict(orient='records')
)
y_val = val_df['trip_duration_minutes'].values

# Predict and evaluate.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises TypeError on current releases.
y_pred = lr.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE on validation data (February): {rmse_val:.2f}")

RMSE on validation data (February): 7.95
