In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [2]:
# Load the trip records from the 2023-01 parquet file and remember how many
# rows were read, so later cells can report what share survives filtering.
df = pd.read_parquet('./datasets/yellow_tripdata_2023-01.parquet')
prev_row_counts = len(df)

# Show the schema: the column count, then one "- name" bullet per column.
print("Number of columns:", len(df.columns))
print("Columns:", *[f"- {column}" for column in df.columns], sep="\n")

Number of columns: 19
Columns:
- VendorID
- tpep_pickup_datetime
- tpep_dropoff_datetime
- passenger_count
- trip_distance
- RatecodeID
- store_and_fwd_flag
- PULocationID
- DOLocationID
- payment_type
- fare_amount
- extra
- mta_tax
- tip_amount
- tolls_amount
- improvement_surcharge
- total_amount
- congestion_surcharge
- airport_fee


In [3]:
# Trip duration in minutes: drop-off minus pick-up, converted from seconds.
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
# Spread of the raw durations, measured before any rows are removed.
print(f"Standard deviation: {df['duration'].std():.2f}")

# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
# Share of the originally loaded rows that REMAIN after the filter. This is
# rows kept / rows loaded, i.e. the complement of the outlier share, so it is
# labelled as the kept fraction rather than the outlier fraction.
cleaned_pct = df.shape[0] / prev_row_counts
print(f"Fraction of records kept: {cleaned_pct:1.0%}")

Standard deviation: 42.59
Outlier fraction: 98%


In [4]:
def read_dataframe(filename):
    """Load a trip parquet file and prepare it for modelling.

    Steps performed:
      1. Read ``filename`` with ``pd.read_parquet``.
      2. Add a ``duration`` column: ``tpep_dropoff_datetime`` minus
         ``tpep_pickup_datetime``, expressed in minutes.
      3. Keep only rows whose duration is between 1 and 60 minutes
         (both bounds inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Args:
        filename: Path (or anything ``pd.read_parquet`` accepts) of the file.

    Returns:
        A new DataFrame containing the filtered rows, with the extra
        ``duration`` column and string-typed location ID columns.
    """
    df = pd.read_parquet(filename)
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    # Take an explicit copy of the filtered rows: assigning columns on the
    # result of a boolean selection otherwise triggers SettingWithCopyWarning
    # on pandas versions without copy-on-write.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df

# The 2023-01 file is used for training and the 2023-02 file for validation;
# both go through the same read_dataframe preparation.
train_path = './datasets/yellow_tripdata_2023-01.parquet'
val_path = './datasets/yellow_tripdata_2023-02.parquet'

df_train = read_dataframe(train_path)
df_val = read_dataframe(val_path)

In [5]:
# The two location ID columns are the only model inputs.
categorical = ['PULocationID', 'DOLocationID']

# Fit the vectorizer on the training records only...
dv = DictVectorizer()
train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# ...then reuse the fitted vectorizer to encode the validation records.
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [6]:
# Second entry of the matrix shape = number of encoded feature columns.
_, encoded_width = X_train.shape
print("Encoded matrix dimensionality:", encoded_width)

Encoded matrix dimensionality: 515


In [7]:
# Name of the column the model predicts; pulled out of each split as an array.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [8]:
# Fit an ordinary linear regression on the encoded training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score the fitted model on both splits using root mean squared error.
rmse_train = root_mean_squared_error(y_train, lr.predict(X_train))
rmse_val = root_mean_squared_error(y_val, lr.predict(X_val))

print(f"RMSE train: {rmse_train:.2f}")
print(f"RMSE validation: {rmse_val:.2f}")

RMSE train: 7.65
RMSE validation: 7.81
