# DataTalkClub Mlops Notebooks
## Introduction Home Work

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
data = pd.read_parquet("/kaggle/input/zoomcamp-mlops-data/nyc-2022-jan-01.parquet")
data.shape

(2463931, 19)

In [None]:
data_prev_rows = data.shape[0]
data_prev_cols = data.shape[1]

In [5]:
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [6]:
data_columns = list(data.columns)
data_columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'airport_fee']

In [7]:
data['trip_duration'] = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']

In [8]:
data.shape

(2463931, 20)

In [15]:
data_prev_rows = data.shape[0]
data_prev_cols = data.shape[1]

print(f"Total Rows: {data_prev_rows}")
print(f"Total Cols: {data_prev_cols}")

Total Rows: 2463931
Total Cols: 20


In [10]:
data['trip_duration'].head(10)

0   0 days 00:17:49
1   0 days 00:08:24
2   0 days 00:08:58
3   0 days 00:10:02
4   0 days 00:37:32
5   0 days 00:29:33
6   0 days 00:14:08
7   0 days 00:09:41
8   0 days 00:14:47
9   0 days 00:04:36
Name: trip_duration, dtype: timedelta64[ns]

In [12]:
data['trip_duration'] = data.trip_duration.dt.total_seconds() /60
data['trip_duration'].head()

0    17.816667
1     8.400000
2     8.966667
3    10.033333
4    37.533333
Name: trip_duration, dtype: float64

## Mean of trip duration

In [13]:
data['trip_duration'].mean()

14.212202918831741

## Standard Deviation

In [14]:
data['trip_duration'].std()

46.44530513776802

## Outlier detection and remval from the main data

In [18]:
lower_lim = 1
upper_lim = 60
data_filtered = data[(data['trip_duration']) >=lower_lim & (data['trip_duration'] <= upper_lim)].copy()

In [19]:
data_filtered_rows = data_filtered.shape[0]
data_filtered_cols = data_filtered.shape[1]

print(f"Total Rows: {data_filtered_rows}")
print(f"Total Cols: {data_filtered_cols}")

Total Rows: 2433928
Total Cols: 20


In [20]:
# percentage of data left

(data_filtered_rows/data_prev_rows) * 100

98.78231167999428

## One Hot Encoding

In [24]:
from sklearn.feature_extraction import DictVectorizer

In [25]:
categorical = ["PULocationID", "DOLocationID"]
data_filtered[categorical] = data_filtered[categorical].fillna(-1).astype("int")

data_filtered[categorical] = data_filtered[categorical].astype("str")

In [26]:
dict_list = data_filtered[categorical].to_dict(orient="records")

vect = DictVectorizer()


In [27]:
X_train = vect.fit_transform(dict_list)

X_train.shape

(2433928, 517)

In [33]:
y_train = data_filtered.trip_duration.values

## Number of feature

In [30]:
len(vect.feature_names_)

517

In [31]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [34]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

In [35]:
y_preds = linear_reg.predict(X_train)

In [38]:
mse_train = mean_squared_error(y_train, y_preds, squared=False)
mse_train

46.157942474248735

## Root Mean Squred Error in Train data

In [39]:
np.sqrt(mse_train)

6.793963679197051

## Validation data preprocessing

In [51]:
categorical = ["PULocationID", "DOLocationID"]
data_val = pd.read_parquet("/kaggle/input/zoomcamp-mlops-data/nyc-2022-fab-02.parquet")
data_val.shape

(2979431, 19)

In [52]:
data_val['trip_duration'] = data_val['tpep_dropoff_datetime'] - data_val['tpep_pickup_datetime']

In [53]:
data_val['trip_duration'] = data_val.trip_duration.dt.total_seconds() /60

data_val['trip_duration'].head()

0    12.433333
1    17.550000
2    23.650000
3    20.083333
4    26.316667
Name: trip_duration, dtype: float64

In [54]:
data_val.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,trip_duration
0,1,2022-02-01 00:06:58,2022-02-01 00:19:24,1.0,5.4,1.0,N,138,252,1,17.0,1.75,0.5,3.9,0.0,0.3,23.45,0.0,1.25,12.433333
1,1,2022-02-01 00:38:22,2022-02-01 00:55:55,1.0,6.4,1.0,N,138,41,2,21.0,1.75,0.5,0.0,6.55,0.3,30.1,0.0,1.25,17.55
2,1,2022-02-01 00:03:20,2022-02-01 00:26:59,1.0,12.5,1.0,N,138,200,2,35.5,1.75,0.5,0.0,6.55,0.3,44.6,0.0,1.25,23.65
3,2,2022-02-01 00:08:00,2022-02-01 00:28:05,1.0,9.88,1.0,N,239,200,2,28.0,0.5,0.5,0.0,3.0,0.3,34.8,2.5,0.0,20.083333
4,2,2022-02-01 00:06:48,2022-02-01 00:33:07,1.0,12.16,1.0,N,138,125,1,35.5,0.5,0.5,8.11,0.0,0.3,48.66,2.5,1.25,26.316667


In [55]:
data_val[categorical] = data_val[categorical].fillna(-1).astype("int")

data_val[categorical] = data_val[categorical].astype("str")

In [56]:
data_val_dict = data_val[categorical].to_dict(orient="records")

In [57]:
X_validation = vect.transform(data_val_dict)

In [58]:
y_validation = data_val.trip_duration.values

In [60]:
y_val_preds = linear_reg.predict(X_validation)

In [62]:
mean_sqr_val_error = mean_squared_error(y_validation, y_val_preds, squared=False)


In [63]:
mean_sqr_val_error

46.81526960443868

In [65]:
np.sqrt(mean_sqr_val_error)

6.8421684869958215