In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import root_mean_squared_error

## QUESTION 1 : Downloading the data
Read the data for January. How many columns are there?

In [2]:
# Yellow-taxi trip records: January 2023 for training, February 2023 for validation
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
val_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

df_train = pd.read_parquet(train_url)
df_val = pd.read_parquet(val_url)

In [3]:
# Column labels of the January frame, together with how many there are
df_train.columns, df_train.shape[1]

(Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
        'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
        'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
        'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
        'total_amount', 'congestion_surcharge', 'airport_fee'],
       dtype='object'),
 19)

## QUESTION 2 : Computing Duration
Now let's compute the duration variable. It should contain the duration of a ride in minutes.<br>
What's the standard deviation of the trip durations in January?


In [4]:
# Trip duration in minutes. The vectorized `.dt.total_seconds()` accessor replaces a
# per-row Python lambda: same values, but no Python-level call for each of the rows.
df_train['duration'] = (
    df_train.tpep_dropoff_datetime - df_train.tpep_pickup_datetime
).dt.total_seconds() / 60

# Standard deviation of the durations, before any outlier filtering
df_train['duration'].std()

42.59435124195458

## QUESTION 3 : Dropping Outliers
Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).<br>

What fraction of the records is left after you drop the outliers?

In [5]:
# Share of trips lasting between 1 and 60 minutes (inclusive), expressed as a percentage
in_range = (df_train.duration >= 1) & (df_train.duration <= 60)
len(df_train[in_range]) / len(df_train) * 100

98.1220282212598

In [6]:
# Keep only trips lasting 1-60 minutes (inclusive). `.copy()` makes the result an
# independent frame, so later column assignments on df_train do not raise
# SettingWithCopyWarning.
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)].copy()

## QUESTION 4 : One-hot Encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.<br>

<ul><li>Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise DictVectorizer will treat them as numeric features instead of one-hot encoding them)</li>
<li>Fit a dictionary vectorizer</li>
<li>Get a feature matrix from it</li></ul>
What's the dimensionality of this matrix (number of columns)?

In [7]:
categorical = ['PULocationID', 'DOLocationID']

# Cast the location IDs to strings once, store them back on the frame,
# and build the per-row dictionaries from the same string-typed columns.
train_location_ids = df_train[categorical].astype(str)
df_train[categorical] = train_location_ids
train_dicts = train_location_ids.to_dict(orient='records')

# Fit the vectorizer on the training dictionaries and build the feature matrix
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [8]:
# (rows, columns) of the feature matrix returned by dv.fit_transform
X_train.shape

(3009173, 515)

## QUESTION 5 : Training a Model
Now let's use the feature matrix from the previous step to train a model.

<ul><li>Train a plain linear regression model with default parameters, where duration is the response variable</li>
<li>Calculate the RMSE of the model on the training data</li></ul>
What's the RMSE on train?

In [9]:
# Response variable for the regression: the trip duration column as a NumPy array
target = 'duration'
y_train = df_train[target].to_numpy()

In [None]:
# Plain linear regression with scikit-learn's default parameters
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# RMSE of the model on the same data it was trained on
y_train_pred = lr.predict(X=X_train)
root_mean_squared_error(y_true=y_train, y_pred=y_train_pred)

## QUESTION 6 : Evaluating the Model

Now let's apply this model to the validation dataset (February 2023).<br>

What's the RMSE on validation?

In [None]:
# Trip duration in minutes for the validation month, computed with the vectorized
# `.dt.total_seconds()` accessor rather than a per-row Python lambda.
df_val['duration'] = (
    df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime
).dt.total_seconds() / 60

# Keep only trips lasting 1-60 minutes (inclusive). `.copy()` makes the result an
# independent frame, so later column assignments on df_val do not raise
# SettingWithCopyWarning.
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)].copy()


In [None]:
# Cast the location IDs to strings, store them back on the frame,
# and build the per-row dictionaries from the same string-typed columns.
val_location_ids = df_val[categorical].astype(str)
df_val[categorical] = val_location_ids
val_dicts = val_location_ids.to_dict(orient='records')

# Reuse the already-fitted vectorizer: transform only, no refitting
X_val = dv.transform(val_dicts)

In [None]:
# Response values for the validation set, as a NumPy array
y_val = df_val[target].to_numpy()

In [None]:
# RMSE of the fitted model on the validation data
y_val_pred = lr.predict(X=X_val)
root_mean_squared_error(y_true=y_val, y_pred=y_val_pred)