In [6]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [7]:
def read_dataframe(filename: str, categorical: list[str]) -> pd.DataFrame:
    """Load a trip parquet file and prepare it for modelling.

    Adds a ``duration`` column holding the trip length in minutes (dropoff
    minus pickup timestamp), keeps only trips lasting between 1 and 60
    minutes inclusive, prints the percentage of rows that survived the
    filter, and casts the ``categorical`` columns to strings.

    Args:
        filename: Path handed to ``pd.read_parquet``. The file must contain
            ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` columns.
        categorical: Names of the columns to convert to ``str``.

    Returns:
        A new DataFrame, independent of the frame that was read, containing
        only the rows that passed the duration filter.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes; avoids one Python call per row.
    trip_time = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = trip_time.dt.total_seconds() / 60

    len_before = df.shape[0]
    # .copy() detaches the filtered rows from the unfiltered frame so the
    # column assignment below is not a chained assignment on a slice.
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()
    len_after = df.shape[0]
    # 3
    # An empty input has no meaningful share; report NaN instead of raising
    # ZeroDivisionError.
    kept_share = (len_after / len_before) * 100 if len_before else float("nan")
    print("%.2f %%" % kept_share)

    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
# Model input columns, grouped by how they are treated downstream:
# `numerical` values are used as-is, `categorical` values are cast to str
# by read_dataframe.
numerical = ["trip_distance"]
categorical = ["PULocationID", "DOLocationID"]

In [9]:
# Training and validation frames come from two separate monthly files
# (2023-01 and 2023-02 according to the file names). Each call prints the
# percentage of rows kept by the duration filter.
df_train = read_dataframe("../data/yellow_tripdata_2023-01.parquet", categorical)
df_val = read_dataframe("../data/yellow_tripdata_2023-02.parquet", categorical)

98.12 %
98.01 %


In [10]:
# 1 — number of columns in the prepared training frame
# (this count includes the `duration` column added by read_dataframe).
df_train.shape[1]

20

In [11]:
# 2 — spread of trip duration (minutes) in the filtered training frame;
# pandas' Series.std() is the sample standard deviation (ddof=1).
df_train.loc[:, "duration"].std()

9.939385620151036

In [12]:
target = "duration"

# A single DictVectorizer is fitted on the training records only; the
# validation records are transformed with that fitted vocabulary so both
# matrices share the same feature columns.
dv = DictVectorizer()

# Training split: one dict per row -> feature matrix, plus target vector.
train_dicts = df_train[categorical + numerical].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)
y_train = df_train[target].to_numpy()

# Validation split: transform only, never fit.
val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)
y_val = df_val[target].to_numpy()

In [13]:
# 4
# (rows, feature columns) of the vectorized training matrix produced by
# dv.fit_transform.
X_train.shape

(3009173, 516)

In [None]:
# Ordinary least-squares model with default settings, fitted on the
# training matrix only. The fit call stays the last expression so the
# fitted estimator is still displayed as the cell output.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [14]:
# 5
# RMSE on the training data. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
y_pred = lr.predict(X_train)
mean_squared_error(y_train, y_pred) ** 0.5



7.65839726314459

In [16]:
# 6
# RMSE on the validation data. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
y_pred = lr.predict(X_val)
mean_squared_error(y_val, y_pred) ** 0.5


7.820259863004852