Download the January and February 2023 yellow taxi trip data (Parquet files) using:
* wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
* wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [2]:
def read_dataframe(filename):
    """Load a yellow-taxi trip file and prepare it for duration modelling.

    The file is read as CSV or Parquet depending on its extension. The
    pickup/dropoff columns are parsed to datetimes, a ``duration`` column
    (trip length in minutes) is added, a few summary statistics are printed,
    and only trips lasting between 1 and 60 minutes (inclusive) are kept.
    ``PULocationID`` and ``DOLocationID`` are cast to strings on the result.

    Parameters
    ----------
    filename : str or path-like
        Path to a ``.csv`` or ``.parquet`` file containing the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips, including the new ``duration`` column.

    Raises
    ------
    ValueError
        If ``filename`` does not end in ``.csv`` or ``.parquet``.
    """
    path = str(filename)

    if path.endswith('.csv'):
        df = pd.read_csv(filename)
    elif path.endswith('.parquet'):
        df = pd.read_parquet(filename)
        # The column count is only reported for Parquet input.
        print('How many columns are there?', len(df.columns))
    else:
        # Fail early with a clear message instead of hitting an unbound `df` below.
        raise ValueError(
            f"Unsupported file type for {path!r}: expected a '.csv' or '.parquet' file"
        )

    # Parse both timestamp columns once, regardless of the input format.
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

    # Trip length in minutes, computed vectorised rather than row by row.
    trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_length.dt.total_seconds() / 60

    # NOTE: this message says "January" whichever file is loaded; the text is
    # kept verbatim so previously recorded outputs remain comparable.
    print('Whats the standard deviation of the trips duration in January?', df['duration'].std())

    # Trips shorter than 1 minute or longer than 60 minutes are treated as outliers.
    keep = (df['duration'] >= 1) & (df['duration'] <= 60)
    print('What fraction of the records left after you dropped the outliers?', keep.mean())

    # Copy the filtered rows so the cast below writes to an independent frame
    # instead of a slice of the original (avoids SettingWithCopyWarning).
    df = df[keep].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# Load the 2023-01 file as the training frame and the 2023-02 file as the
# validation frame; the files are read in that order.
df_train, df_val = (
    read_dataframe(trip_file)
    for trip_file in (
        './yellow_tripdata_2023-01.parquet',
        './yellow_tripdata_2023-02.parquet',
    )
)

How many columns are there? 19
Whats the standard deviation of the trips duration in January? 42.59435124195458
What fraction of the records left after you dropped the outliers? 0.9812202822125979
How many columns are there? 19
Whats the standard deviation of the trips duration in January? 42.84210176105113
What fraction of the records left after you dropped the outliers? 0.9800944077722545


In [4]:
# Features: pickup and dropoff location IDs, one-hot encoded by DictVectorizer
# (read_dataframe has already cast them to strings).
categorical = ['PULocationID', 'DOLocationID']

dv = DictVectorizer(sparse=True)

# Fit the vectorizer on the training data only, then reuse it for validation
# so both matrices share the same feature columns.
train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values


lr = LinearRegression()
lr.fit(X_train, y_train)

# RMSE is taken as the square root of the MSE rather than via `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
# this form works on every release.
y_pred = lr.predict(X_train)
print('Training:', mean_squared_error(y_train, y_pred) ** 0.5)

y_pred = lr.predict(X_val)
print('Validation:', mean_squared_error(y_val, y_pred) ** 0.5)

Training: 7.649261932106969
Validation: 7.811818743246608


