In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#### Q1. Downloading the data

Read the data for January. How many records are there?

In [2]:
# Load the January 2021 trip records from the local parquet file.
january_path = "./data/fhv_tripdata_2021-01.parquet"
df = pd.read_parquet(january_path)

In [3]:
# (rows, columns) of the freshly loaded frame; the row count is the Q1 answer.
df.shape

(1154112, 7)

#### Q2. Computing duration

Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the average trip duration in January?

In [4]:
# Trip duration in minutes. Subtracting the timestamps yields a timedelta
# column; the vectorized `.dt` accessor converts it in one pass instead of
# invoking a Python lambda once per row (and matches how the validation data
# is prepared further down).
df['duration'] = (df['dropOff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60

In [5]:
# Mean trip length in minutes over every record (no outlier filtering yet).
df['duration'].mean()

19.167224093791006

In [6]:
# Average duration split by the calendar month of the drop-off timestamp.
dropoff_month = df['dropOff_datetime'].dt.month
df.groupby(dropoff_month)['duration'].mean()

dropOff_datetime
1         18.755341
2        105.907463
3      40347.600000
11    423371.050000
Name: duration, dtype: float64

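Answer is `19.16` (the mean over all records). Restricting to trips whose drop-off falls in January gives `18.755341`, which is arguably the real answer.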

#### Data preparation

Check the distribution of the duration variable. There are some outliers.

Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

How many records did you drop?

In [7]:
# Keep only the two location ID columns and the duration column.
kept_columns = ['PUlocationID', 'DOlocationID', 'duration']
df = df[kept_columns]

In [8]:
# Keep trips lasting between 1 and 60 minutes; `between` is inclusive on both ends.
within_range = df["duration"].between(1, 60)
df = df[within_range]

In [9]:
# Summary statistics for the duration column only.
df[['duration']].describe()

Unnamed: 0,duration
count,1109826.0
mean,16.24725
std,11.5515
min,1.0
25%,7.85
50%,13.23333
75%,21.46667
max,60.0


In [10]:
# Shape after the duration filter; subtracting this row count from the
# original row count gives the number of dropped records.
df.shape

(1109826, 3)

Answer is `1154112 - 1109826 = 44286`

#### Q3. Missing values

The features we'll use for our model are the pickup and dropoff location IDs.

However, these columns have a lot of missing values. Let's replace them with "-1".

What's the fraction of missing values for the pickup location ID? I.e., the fraction of "-1"s after you filled the NAs.

In [11]:
# Replace every missing value in the frame with the sentinel -1.
df = df.fillna(-1)

In [12]:
# Share of rows whose pickup location was missing (now the -1 sentinel).
# Counting the boolean mask avoids building a filtered copy of the frame.
missing_pickups = int((df['PUlocationID'] == -1).sum())
missing_pickups / len(df)

0.8352732770722617

Answer is `83%`

#### Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries
- Fit a dictionary vectorizer
- Get a feature matrix from it

What's the dimensionality of this matrix? (The number of columns).

In [None]:
# The two location ID columns are the only model features. Casting them to
# strings makes DictVectorizer one-hot encode them instead of treating them
# as numeric values.
categorical = ['PUlocationID', 'DOlocationID']
df = df.astype(dict.fromkeys(categorical, str))

# One {column: value} dict per row, the input format DictVectorizer expects.
feature_frame = df[categorical]
df_dicts = feature_frame.to_dict(orient='records')

In [None]:
# Fit the vectorizer on the training dicts and build the feature matrix in one
# step; `dv` is reused later to transform the validation data.
dv = DictVectorizer()

X_train = dv.fit_transform(df_dicts)
# Display the matrix summary as the cell output.
X_train

In [None]:
# (rows, columns) of the feature matrix; the column count is the Q4 answer.
X_train.shape

Answer is `525`

#### Q5. Training a model

Now let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters
- Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [None]:
# Fit a plain linear regression on the one-hot features and report the
# training RMSE.
target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE = sqrt(MSE). The `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the root explicitly; this works
# on every scikit-learn version.
mean_squared_error(y_train, y_pred) ** 0.5

Answer is `10.52`

#### Q6. Evaluating the model

Now let's apply this model to the validation dataset (Feb 2021).

What's the RMSE on validation?

In [None]:
# Prepare the February 2021 data with the same steps used for training:
# duration in minutes, 1-60 minute filter, -1 sentinel for missing location
# IDs, and string-typed categorical features.
df_val = pd.read_parquet("./data/fhv_tripdata_2021-02.parquet")
df_val['duration'] = (df_val.dropOff_datetime - df_val.pickup_datetime).dt.total_seconds() / 60
# `.copy()` so the column assignment below does not write into a slice.
df_val = df_val[(df_val["duration"] >= 1) & (df_val["duration"] <= 60)].copy()
df_val[categorical] = df_val[categorical].fillna(-1).astype('str')
val_dicts = df_val[categorical].to_dict(orient='records')

# Only transform here: the vectorizer must keep the feature space learned on
# the training data.
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

y_pred_val = lr.predict(X_val)

# RMSE = sqrt(MSE). The `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the root explicitly.
mean_squared_error(y_val, y_pred_val) ** 0.5

Answer is `11.01`