In [1]:
!python -V

Python 3.10.13


In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [5]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [6]:
df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet')
df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-02.parquet')

In [7]:
len(df_train), len(df_val)

(59603, 66097)

In [13]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [19]:
from sklearn.pipeline import make_pipeline

In [22]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')


In [20]:
pipeline = make_pipeline(
    DictVectorizer(),
    LinearRegression()
)

In [23]:
pipeline.fit(train_dicts, y_train)

In [27]:
y_pred = pipeline.predict(val_dicts)

mean_squared_error(y_val, y_pred, squared=False)

8.19383255249626

In [29]:
with open('lin_reg.bin', 'wb') as f_out:
    pickle.dump(pipeline, f_out)

In [1]:
import pickle

In [3]:
with open('lin_reg.bin', 'rb') as f_in:
    pipeline = pickle.load(f_in)

In [7]:
pipeline.predict({'PULocation': '37', 'DOLocation': '10', 'distance': 10})

array([20.62459046])