# MLOps Zoomcamp - Homework 1

## Question 1

In [1]:
#Import libraries
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
#Load the January and February 2022 yellow-taxi trip records
jan_path = "data/yellow_tripdata_2022-01.parquet"
feb_path = "data/yellow_tripdata_2022-02.parquet"

df_01 = pd.read_parquet(jan_path)
df_02 = pd.read_parquet(feb_path)

In [3]:
#Inspect the January DataFrame (only df_01 is shown here): index range, column names and dtypes
df_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2463931 entries, 0 to 2463930
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [4]:
#Shape of the January DataFrame as (rows, columns); the second entry is the number of columns
df_01.shape

(2463931, 19)

## Question 2

In [5]:
#Add a trip duration column (dropoff time minus pickup time) to both DFs
df_01 = df_01.assign(
    duration=lambda trips: trips["tpep_dropoff_datetime"] - trips["tpep_pickup_datetime"]
)
df_02 = df_02.assign(
    duration=lambda trips: trips["tpep_dropoff_datetime"] - trips["tpep_pickup_datetime"]
)

In [6]:
#Convert duration from a timedelta to minutes (float).
#The vectorized .dt accessor does the conversion in one pass instead of calling
#a Python lambda once per row, which is very slow on millions of trips.
#Note: this cell expects duration to still be a timedelta, so it must run
#exactly once after the cell that computes duration.
df_01["duration"] = df_01["duration"].dt.total_seconds() / 60
df_02["duration"] = df_02["duration"].dt.total_seconds() / 60

In [7]:
#Standard deviation of the January trip duration (in minutes)
df_01["duration"].std()

46.44530513776499

## Question 3

In [8]:
#Percentage of January trips whose duration lies within [1, 60] minutes.
#Despite the variable name, this is the share of records KEPT after
#outlier removal, not the share of outliers.
in_range = df_01["duration"].between(1, 60)
outliers_p = df_01.loc[in_range, "duration"].count() / df_01["duration"].count() * 100
outliers_p

98.27547930522405

In [9]:
#Keep only January trips lasting between 1 and 60 minutes (both bounds inclusive)
jan_in_range = df_01["duration"].between(1, 60)
df_01_no_outliers = df_01.loc[jan_in_range]
df_01_no_outliers.shape

(2421440, 20)

## Question 4

In [10]:
#Select the pickup/dropoff location IDs and cast them to strings so that
#the vectorizer treats them as categories rather than numbers
categorical = ['PULocationID', 'DOLocationID']
df_train = df_01_no_outliers.loc[:, categorical].astype(str)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2421440 entries, 0 to 2463930
Data columns (total 2 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   PULocationID  object
 1   DOLocationID  object
dtypes: object(2)
memory usage: 55.4+ MB


In [11]:
#Turn every training row into a {column: value} dictionary
train_dicts = df_train.to_dict('records')

#Fit the vectorizer on the training dictionaries and one-hot encode them
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

#Dimensions of the encoded training matrix: (rows, one-hot columns)
X_train.shape

(2421440, 515)

## Question 5

In [12]:
#Fit a linear regression on the January features and report the training RMSE
target = "duration"
y_train = df_01_no_outliers[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

#Take the square root of the MSE explicitly: the `squared` keyword of
#mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
#so squared=False fails on current releases
print("Root Mean Squared Error:", mean_squared_error(y_train, y_pred) ** 0.5)

Root Mean Squared Error: 6.98619123059128


## Question 6

In [13]:
#Keep only February trips lasting between 1 and 60 minutes (both bounds inclusive)
#and cast the categorical location IDs to strings, as was done for the training set
feb_in_range = df_02["duration"].between(1, 60)
df_02_no_outliers = df_02.loc[feb_in_range]
df_val = df_02_no_outliers.loc[:, categorical].astype(str)
df_val.shape

(2918187, 2)

In [14]:
#Create feature dictionary
val_dicts = df_val.to_dict(orient='records')

#Apply one-hot encoding to create feature matrix
X_val = dv.fit_transform(val_dicts)

In [None]:
y_val = df_02_no_outliers[target].values

lr.fit(X_val, y_val)

y_pred = lr.predict(X_val)

print("Root Mean Squared Error:", mean_squared_error(y_val, y_pred, squared=False))