In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
def read_dataframe(link):
    """Load a yellow-taxi trip parquet file and prepare it for modelling.

    Parameters
    ----------
    link : str or path-like
        Location (local path or URL) handed straight to ``pd.read_parquet``.
        The file must provide ``tpep_pickup_datetime``,
        ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The trips whose duration is between 1 and 60 minutes (inclusive),
        with an added ``duration`` column (minutes, float) and with
        ``PULocationID`` / ``DOLocationID`` converted to strings.
    """
    df = pd.read_parquet(link)

    # Vectorised timedelta -> minutes conversion; avoids one Python-level
    # function call per row on a multi-million-row frame.
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame rather than to a slice of the
    # raw one (which triggers SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location IDs are categorical labels, not quantities: store as strings.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# Training split: yellow-taxi trips from January 2023.
df_train = read_dataframe(
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
)

In [4]:
# Validation split: yellow-taxi trips from February 2023.
df_val = read_dataframe(
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"
)

In [5]:
# Number of training trips that survived the duration filter.
df_train.shape[0]

3009173

In [6]:
# Number of validation trips that survived the duration filter.
df_val.shape[0]

2855951

In [7]:
# Combine pickup and drop-off location IDs into one "<pickup>_<dropoff>" key.
df_train['PUDO'] = (
    df_train['PULocationID'] + '_' + df_train['DOLocationID']
)

In [8]:
# Same "<pickup>_<dropoff>" key for the validation split.
df_val['PUDO'] = (
    df_val['PULocationID'] + '_' + df_val['DOLocationID']
)

In [9]:
# Feature columns fed to the models.
categorical = ['PUDO']
numerical = ['trip_distance']

# Turn each row into a {column: value} record, the input format that
# DictVectorizer consumes.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse the fitted
# vectorizer to transform the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)



In [10]:
# Regression targets: the `duration` column of each split as an array.
y_train, y_val = (
    split['duration'].values for split in (df_train, df_val)
)

In [11]:
# Baseline model: ordinary least squares on the vectorized features.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
mean_squared_error(y_val, y_pred) ** 0.5

5.198295479690269

# L1-regularised comparison model. It gets its own names so that the
# baseline `lr` (the object pickled to Models/lin_reg.bin further down)
# and its predictions are not silently replaced on a full re-run.
lasso = Lasso(alpha=0.001)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_val)

# Validation RMSE; explicit square root instead of the removed
# `squared=False` flag.
mean_squared_error(y_val, y_pred_lasso) ** 0.5

# L2-regularised comparison model. It gets its own names so that the
# baseline `lr` (the object pickled to Models/lin_reg.bin further down)
# and its predictions are not silently replaced on a full re-run.
ridge = Ridge(alpha=0.001)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_val)

# Validation RMSE; explicit square root instead of the removed
# `squared=False` flag.
mean_squared_error(y_val, y_pred_ridge) ** 0.5

In [12]:
# Show the working directory that the relative 'Models/...' path below is
# resolved against. Plain Python rather than the bare `pwd` automagic, which
# needs a single-line cell with automagic enabled and no variable named `pwd`.
os.getcwd()

'/workspaces/mlops/01-intro'

In [13]:
# Persist the fitted vectorizer together with the model so that both can be
# loaded as one (dv, model) tuple.
# Create the target directory first; open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs('Models', exist_ok=True)
with open('Models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)