In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/cohorts/2023/01-intro/homework.md
# https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
df_yellow_jan22 = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet")
df_yellow_feb22 = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet")

In [2]:
print(df_yellow_jan22.shape)
print(df_yellow_feb22.shape)

(2463931, 19)
(2979431, 19)


In [3]:
# Q1. Downloading the data
# Read the data for January. How many columns are there?
len(df_yellow_jan22.columns)

19

In [4]:
# Q2. Computing duration
# What's the standard deviation of the trips duration in January?
(df_yellow_jan22.tpep_dropoff_datetime - df_yellow_jan22.tpep_pickup_datetime).astype('timedelta64[m]').std().round(2)

46.45

In [29]:
# Next, we need to check the distribution of the duration variable. 
# There are some outliers. Let's remove them and keep only the records
# where the duration was between 1 and 60 minutes (inclusive).
# What fraction of the records left after you dropped the outliers?
df_yellow_jan22["trip_duration_minutes"] = (df_yellow_jan22.tpep_dropoff_datetime - df_yellow_jan22.tpep_pickup_datetime).astype('timedelta64[m]')
len(df_yellow_jan22["trip_duration_minutes"][(df_yellow_jan22["trip_duration_minutes"]>=1) & (df_yellow_jan22["trip_duration_minutes"]<=60)]) / len(df_yellow_jan22)

0.9835198307095451

In [31]:
df_yellow_jan22 = df_yellow_jan22[(df_yellow_jan22["trip_duration_minutes"]>=1) & (df_yellow_jan22["trip_duration_minutes"]<=60)]

df_yellow_feb22["trip_duration_minutes"] = (df_yellow_feb22.tpep_dropoff_datetime - df_yellow_feb22.tpep_pickup_datetime).astype('timedelta64[m]')
df_yellow_feb22 = df_yellow_feb22[(df_yellow_feb22["trip_duration_minutes"]>=1) & (df_yellow_feb22["trip_duration_minutes"]<=60)]

df_yellow_jan22.shape, df_yellow_feb22.shape

((2423325, 20), (2921396, 20))

In [32]:
from sklearn.feature_extraction import DictVectorizer

# Q4. One-hot encoding
# Let's apply one-hot encoding to the pickup and dropoff location IDs. 
# We'll use only these two features for our model.
# Turn the dataframe into a list of dictionaries
# Fit a dictionary vectorizer
# Get a feature matrix from it
# What's the dimensionality of this matrix (number of columns)?

# https://github.com/particle1331/mlops-zoomcamp-project/blob/main/homework/01-homework/01.ipynb
categorical = ['PULocationID', 'DOLocationID']

df_yellow_jan22[categorical] = df_yellow_jan22[categorical].astype(int).astype(str)
df_yellow_feb22[categorical] = df_yellow_feb22[categorical].astype(int).astype(str)

train_dicts = df_yellow_jan22[categorical].to_dict(orient='records')
valid_dicts = df_yellow_feb22[categorical].to_dict(orient='records')

dv = DictVectorizer()
dv.fit(train_dicts)

X_train = dv.transform(train_dicts)
y_train = df_yellow_jan22.trip_duration_minutes.values

X_valid = dv.transform(valid_dicts)
y_valid = df_yellow_feb22.trip_duration_minutes.values

X_train.shape, X_valid.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_yellow_feb22[categorical] = df_yellow_feb22[categorical].astype(int).astype(str)


((2423325, 515), (2921396, 515))

In [58]:
print("Descriptive Statistics Trip Duration Jan 22")
print(pd.Series(y_train).describe().round(2))
print()
print("Descriptive Statistics Trip Duration Feb 22")
print(pd.Series(y_valid).describe().round(2))

Descriptive Statistics Trip Duration Jan 22
count    2423325.00
mean          12.22
std            9.10
min            1.00
25%            6.00
50%           10.00
75%           16.00
max           60.00
dtype: float64

Descriptive Statistics Trip Duration Feb 22
count    2921396.00
mean          13.36
std            9.77
min            1.00
25%            7.00
50%           11.00
75%           17.00
max           60.00
dtype: float64


In [61]:
# Q5. Training a model
# Now let's use the feature matrix from the previous step to train a model.
# Train a plain linear regression model with default parameters
# Calculate the RMSE of the model on the training data
# What's the RMSE on train?

# https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

In [67]:
# Make predictions using the testing set
y_train_pred = regr.predict(X_train)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_train, y_train_pred))
# The mean squared error
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_train, y_train_pred)))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_train, y_train_pred))

Mean squared error: 49.23
Root mean squared error: 7.02
Coefficient of determination: 0.41


In [69]:
# Make predictions using the testing set
y_valid_pred = regr.predict(X_valid)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_valid, y_valid_pred))
# The root mean squared error
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_valid, y_valid_pred)))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_valid, y_valid_pred))

Mean squared error: 61.27
Root mean squared error: 7.83
Coefficient of determination: 0.36
