### [mlops-zoomcamp](https://github.com/DataTalksClub/mlops-zoomcamp)

In [1]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [2]:
df_jan = pd.read_parquet('../data/fhv_tripdata_2021-01.parquet')

df_feb = pd.read_parquet('../data/fhv_tripdata_2021-02.parquet')

In [3]:
df_jan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1154112 non-null  object        
 1   pickup_datetime         1154112 non-null  datetime64[ns]
 2   dropOff_datetime        1154112 non-null  datetime64[ns]
 3   PUlocationID            195845 non-null   float64       
 4   DOlocationID            991892 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1153227 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(3)
memory usage: 61.6+ MB


In [4]:
df_jan.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [5]:
df_jan['duration'] = df_jan['dropOff_datetime'] - df_jan['pickup_datetime']

df_feb['duration'] = df_feb['dropOff_datetime'] - df_feb['pickup_datetime']

In [6]:
df_jan.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,0 days 00:17:00
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,0 days 00:17:00
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,0 days 01:50:00
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,0 days 00:08:17
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,0 days 00:15:13


In [7]:
df_jan['duration'] = df_jan['duration'].apply(lambda x: x.total_seconds() / 60)

df_feb['duration'] = df_feb['duration'].apply(lambda x: x.total_seconds() / 60)

In [8]:
df_jan['duration'].mean()

19.1672240937939

In [9]:
outliers_index = df_jan.loc[(df_jan['duration'] < 1) | (df_jan['duration'] > 60)].index

df_jan.iloc[outliers_index]

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.000000
16,B00111,2021-01-01 00:40:00,2021-01-01 01:41:00,,,,B03234,61.000000
19,B00112,2021-01-01 00:28:12,2021-01-01 00:28:18,,14.0,,B00112,0.100000
21,B00112,2021-01-01 00:25:56,2021-01-01 00:26:00,,14.0,,B00112,0.066667
24,B00131,2021-01-01 00:44:17,2021-01-01 00:44:20,,255.0,,B00131,0.050000
...,...,...,...,...,...,...,...,...
1153883,B01871,2021-01-31 23:44:00,2021-02-01 01:20:00,,,,B00837,96.000000
1153910,B02311,2021-01-31 23:04:37,2021-01-31 23:04:45,,155.0,,B02311,0.133333
1153929,B02563,2021-01-31 23:49:29,2021-02-01 02:08:46,,102.0,,B02563,139.283333
1154012,B02849,2021-01-31 23:48:26,2021-01-31 23:48:38,,256.0,,B02849,0.200000


In [10]:
df_jan.drop(index=outliers_index, axis=0, inplace=True)

In [11]:
df_jan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1109826 entries, 0 to 1154111
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1109826 non-null  object        
 1   pickup_datetime         1109826 non-null  datetime64[ns]
 2   dropOff_datetime        1109826 non-null  datetime64[ns]
 3   PUlocationID            182818 non-null   float64       
 4   DOlocationID            961919 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1109053 non-null  object        
 7   duration                1109826 non-null  float64       
dtypes: datetime64[ns](2), float64(3), object(3)
memory usage: 76.2+ MB


In [12]:
df_jan['PUlocationID'].isna().sum() / len(df_jan)

0.8352732770722617

In [13]:
df_jan['PUlocationID'].fillna(value=-1, inplace=True)
df_jan['DOlocationID'].fillna(value=-1, inplace=True)

In [14]:
df_jan.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,-1.0,-1.0,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,-1.0,-1.0,,B00009,17.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,-1.0,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,-1.0,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,-1.0,71.0,,B00037,9.05


In [15]:
categorical = ['PUlocationID', 'DOlocationID']

df_jan[categorical] = df_jan[categorical].astype(str)

In [29]:
train_dicts = df_jan[categorical].to_dict(orient='records')

In [30]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = 'duration'
y_train = df_jan[target].values

In [33]:
X_train.get_shape()

(1109826, 525)

In [31]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

10.528519107204998

### Validation dataset

In [34]:
df_feb.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014,92.0
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021,10.666667
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021,14.566667
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021,7.95
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037,13.8


In [35]:
df_feb['PUlocationID'].fillna(value=-1, inplace=True)
df_feb['DOlocationID'].fillna(value=-1, inplace=True)

In [36]:
categorical = ['PUlocationID', 'DOlocationID']

df_feb[categorical] = df_feb[categorical].astype(str)

In [37]:
val_dicts = df_feb[categorical].to_dict(orient='records')

In [38]:
X_val = dv.transform(val_dicts)
y_val = df_feb[target].values

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

160.9855747770896