# duration prediction model
2023-05-21  
[mlops-zoomcamp](https://github.com/DataTalksClub/mlops-zoomcamp/tree/main)

### imports & check python version

In [None]:
!python -V

import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

#### load data

In [None]:
# Q1. how many columns in jan data?
# Load the trip records from the parquet file, then print the frame summary;
# the column listing in that summary answers the question.
jan = pd.read_parquet(path='../data/yellow_tripdata_2022-01.parquet')
jan.info()

In [None]:
# Trip duration in minutes. Drop-off minus pick-up gives a timedelta Series;
# the vectorized `.dt.total_seconds()` accessor converts the whole column at
# once instead of calling a Python lambda for every row.
jan['duration'] = (
    jan['tpep_dropoff_datetime'] - jan['tpep_pickup_datetime']
).dt.total_seconds() / 60

# Q2. stdev of jan trip duration (the `std` row of the summary below)
jan['duration'].describe()

In [None]:
# Q3. drop outliers, what fraction of full dataset is left?
# Keep only trips lasting from 1 to 60 minutes (both bounds inclusive).
# `.copy()` makes the result an independent frame rather than a slice of
# `jan`, so its columns can be reassigned without SettingWithCopyWarning.
jan_dropped_outliers = jan[jan['duration'].between(1, 60)].copy()

# Fraction of the original rows that survive the filter.
len(jan_dropped_outliers) / len(jan)

In [None]:
# Q4. one-hot encoding - dimensionality of resulting matrix?
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']  # defined for reference; not used as a feature here

# DictVectorizer one-hot encodes string values, so the location IDs are cast
# to str. `astype` returns a new frame that is rebound to the same name; this
# avoids assigning columns in place on a frame that may be a slice of another
# frame (the cause of SettingWithCopyWarning).
jan_dropped_outliers = jan_dropped_outliers.astype(dict.fromkeys(categorical, str))

# Only the categorical columns are turned into feature dicts
# (select `categorical + numerical` instead to also include trip distance).
train_dicts = jan_dropped_outliers[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Number of columns in the encoded feature matrix.
X_train.shape[1]

In [None]:
# Q5. what is the RMSE on training data?
target = 'duration'
y_train = jan_dropped_outliers[target].values

# Fit an ordinary least-squares model on the one-hot encoded features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the same data the model was fitted on (training error).
y_pred = lr.predict(X_train)

# RMSE = sqrt(MSE). The `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly;
# this form works on every scikit-learn version.
mean_squared_error(y_train, y_pred) ** 0.5

In [None]:
# Q6. what is the RMSE on validation data?
feb = pd.read_parquet('../data/yellow_tripdata_2022-02.parquet')

# Trip duration in minutes, computed with the vectorized `.dt` accessor
# rather than a per-row Python lambda.
feb['duration'] = (
    feb['tpep_dropoff_datetime'] - feb['tpep_pickup_datetime']
).dt.total_seconds() / 60

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']  # defined for reference; not used as a feature here

# Keep trips lasting from 1 to 60 minutes (inclusive) and cast the location
# IDs to str so DictVectorizer one-hot encodes them. `astype` returns a new
# frame, so no columns are assigned on a filtered slice of `feb`
# (which would trigger SettingWithCopyWarning).
feb_dropped_outliers = (
    feb[feb['duration'].between(1, 60)]
    .astype(dict.fromkeys(categorical, str))
)

# Only the categorical columns are turned into feature dicts
# (select `categorical + numerical` instead to also include trip distance).
test_dicts = feb_dropped_outliers[categorical].to_dict(orient='records')

# `dv` and `lr` are the vectorizer and model fitted in the earlier cells;
# `transform` (not `fit_transform`) reuses the already-fitted encoding.
X_test = dv.transform(test_dicts)
y_test = feb_dropped_outliers['duration'].values

validation_preds = lr.predict(X_test)

# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so the root is taken explicitly.
mean_squared_error(y_test, validation_preds) ** 0.5

In [None]:
# Model persistence is currently disabled. Uncommenting the two lines below
# would pickle the (dv, lr) tuple to '../models/lin_reg.bin'.
# with open('../models/lin_reg.bin', 'wb') as f_out:
#     pickle.dump((dv, lr), f_out)