In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Set display format for pandas
pd.set_option('display.float_format', '{:.10f}'.format)

# Utility function to print the current state
def print_state(message):
    """Write ``message`` to stdout, prefixed with the ``[INFO]`` tag."""
    line = f'[INFO] {message}'
    print(line)

In [2]:
def load_data(filepath):
    """Read a parquet file into a DataFrame, logging the path first.

    Args:
        filepath: Location of the parquet file, passed straight to
            ``pd.read_parquet``.

    Returns:
        The DataFrame produced by ``pd.read_parquet``.
    """
    print_state(f'Loading data from {filepath}')
    return pd.read_parquet(filepath)

In [3]:
def preprocess_data(df):
    """Add a ``duration`` column holding each trip's length in minutes.

    The column is written onto ``df`` itself (no copy is made) and the same
    object is returned, so the caller's frame is modified.

    Args:
        df: DataFrame exposing ``tpep_dropoff_datetime`` and
            ``tpep_pickup_datetime`` columns whose difference supports the
            ``.dt.total_seconds()`` accessor.

    Returns:
        ``df``, now carrying the ``duration`` column.
    """
    trip_length = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    seconds_elapsed = trip_length.dt.total_seconds()
    df['duration'] = seconds_elapsed / 60  # seconds -> minutes
    return df

In [4]:
def filter_data(df, min_duration=1, max_duration=60):
    """Keep only the rows whose ``duration`` lies within the given bounds.

    Both bounds are inclusive. Rows with a missing ``duration`` fail the
    comparison and are dropped.

    Args:
        df: DataFrame with a ``duration`` column.
        min_duration: Smallest duration to keep.
        max_duration: Largest duration to keep.

    Returns:
        An independent copy of the matching rows; ``df`` is left unchanged.
    """
    print_state('Filtering data based on duration')
    within_bounds = df['duration'].between(min_duration, max_duration)
    return df[within_bounds].copy()

In [5]:
def prepare_data(df):
    """Fit a DictVectorizer on the location IDs and build the training inputs.

    The pickup and drop-off location IDs are cast to strings so that
    ``DictVectorizer`` treats them as categorical values rather than numbers.
    The cast is done on a separate two-column selection, so the caller's
    DataFrame is not modified.

    Args:
        df: DataFrame with ``PULocationID``, ``DOLocationID`` and ``duration``
            columns.

    Returns:
        A tuple ``(X, y, dv)``: the feature matrix returned by
        ``dv.fit_transform``, the ``duration`` values as an array, and the
        fitted ``DictVectorizer`` for reuse on other data.
    """
    print_state('Preparing data for model training')
    # Cast a selection instead of assigning back into ``df``: the input keeps
    # its original dtypes and no SettingWithCopyWarning is raised for slices.
    categorical = df[['PULocationID', 'DOLocationID']].astype(str)
    data_dicts = categorical.to_dict(orient='records')
    dv = DictVectorizer()
    X = dv.fit_transform(data_dicts)
    y = df['duration'].values
    return X, y, dv

In [6]:
def train_model(X, y):
    """Fit an ordinary least-squares ``LinearRegression`` on ``X`` and ``y``.

    Args:
        X: Feature matrix.
        y: Target values aligned with the rows of ``X``.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    print_state('Training the model')
    regressor = LinearRegression()
    # ``fit`` returns the estimator itself, so the fitted model is handed back.
    return regressor.fit(X, y)

In [7]:
def evaluate_model(model, X, y):
    """Return the root-mean-squared error of ``model``'s predictions on ``X``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix accepted by ``model.predict``.
        y: Ground-truth targets aligned with the rows of ``X``. For a single
            target column the result matches what ``squared=False`` used to
            return.

    Returns:
        The RMSE between ``y`` and the predictions, as a scalar.
    """
    print_state('Evaluating the model')
    y_pred = model.predict(X)
    # ``mean_squared_error(..., squared=False)`` was deprecated in
    # scikit-learn 1.4 and removed in 1.6. Taking the square root of the plain
    # MSE gives the same RMSE and works on every scikit-learn version.
    mse = mean_squared_error(y, y_pred)
    return mse ** 0.5

In [8]:
def transform_validation_data(dv, df):
    """Encode a held-out DataFrame with an already-fitted DictVectorizer.

    Mirrors the feature construction used for training: the pickup and
    drop-off location IDs are cast to strings and turned into per-row dicts.
    Only ``dv.transform`` is called, so the vectorizer is not refitted and the
    output has the same columns as the training matrix. The cast is done on a
    separate two-column selection, so the caller's DataFrame is not modified.

    Args:
        dv: ``DictVectorizer`` that has already been fitted.
        df: DataFrame with ``PULocationID``, ``DOLocationID`` and ``duration``
            columns.

    Returns:
        A tuple ``(X_val, y_val)``: the encoded feature matrix and the
        ``duration`` values as an array.
    """
    print_state('Transforming validation data')
    # Cast a selection instead of assigning back into ``df``: the input keeps
    # its original dtypes and no SettingWithCopyWarning is raised for slices.
    categorical = df[['PULocationID', 'DOLocationID']].astype(str)
    data_dicts_val = categorical.to_dict(orient='records')
    # Feature values never seen while fitting are silently ignored by
    # DictVectorizer.transform.
    X_val = dv.transform(data_dicts_val)
    y_val = df['duration'].values
    return X_val, y_val

In [9]:
# Load and preprocess training data
train_filepath = '../data/yellow_tripdata_2023-01.parquet'
df_train = load_data(train_filepath)

# Capture the raw row count now, before any filtering, so the retained
# fraction can be computed against it further down.
df_train_len = len(df_train)
print_state(f'Read the data for January. How many columns are there?: {df_train.shape[1]}')

# Adds the 'duration' column; the standard deviation below is therefore taken
# over all trips, outliers included, because filtering has not happened yet.
df_train = preprocess_data(df_train)
df_train_duration_std = df_train['duration'].std()
print_state(f'What\'s the standard deviation of the trips duration in January?: {df_train_duration_std}')

# Drop outliers using filter_data's default duration bounds.
df_train_filtered = filter_data(df_train)

# Share of the original rows that survived the duration filter.
fraction_left = len(df_train_filtered) / df_train_len
print_state(f'What fraction of the records left after you dropped the outliers?: {fraction_left:.2f}')

# `dv` is the fitted vectorizer; it is kept at module level so the validation
# cell can encode its data with the same feature columns.
X_train, y_train, dv = prepare_data(df_train_filtered)

print_state(f'What\'s the dimensionality of this matrix (number of columns)?: {X_train.shape[1]}')

# Train the model
model = train_model(X_train, y_train)
# Scored on the same data the model was fitted on, so this is the training error.
rmse_train = evaluate_model(model, X_train, y_train)
print_state(f'What\'s the RMSE on train?: {rmse_train}')

[INFO] Loading data from ../data/yellow_tripdata_2023-01.parquet
[INFO] Read the data for January. How many columns are there?: 19
[INFO] What's the standard deviation of the trips duration in January?: 42.59435124195458
[INFO] Filtering data based on duration
[INFO] What fraction of the records left after you dropped the outliers?: 0.98
[INFO] Preparing data for model training
[INFO] What's the dimensionality of this matrix (number of columns)?: 515
[INFO] Training the model
[INFO] Evaluating the model
[INFO] What's the RMSE on train?: 7.649261929201487


In [10]:
# Load and preprocess validation data
# NOTE: this cell relies on `dv` and `model` created by the training cell, so
# that cell must have been run first.
val_filepath = '../data/yellow_tripdata_2023-02.parquet'
df_val = load_data(val_filepath)
df_val = preprocess_data(df_val)
# Same default duration bounds as were applied to the training data.
df_val_filtered = filter_data(df_val)
# Encode with the vectorizer fitted on the training data so the validation
# matrix has the same columns the model was trained on.
X_val, y_val = transform_validation_data(dv, df_val_filtered)

# Evaluate the model on validation data
rmse_val = evaluate_model(model, X_val, y_val)
print_state(f'What\'s the RMSE on validation?: {rmse_val}')

[INFO] Loading data from ../data/yellow_tripdata_2023-02.parquet


[INFO] Filtering data based on duration
[INFO] Transforming validation data
[INFO] Evaluating the model
[INFO] What's the RMSE on validation?: 7.811819793542861
