In [25]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error
import pickle

The homework looks into the yellow taxi records for January and February of 2023, so we will use those instead.

# Q1. Downloading the data. 
For the January 2023 yellow taxi data, how many columns are there?

In [26]:
# Load the January 2023 yellow-taxi trips and report how wide the table is.
df = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')
print('Number of columns: ', df.shape[1])


Number of columns:  19


# Q2. Computing duration
What is the standard deviation of the trip durations in January?
We calculate the duration of each trip in minutes.

In [27]:
# Trip duration in minutes: drop-off minus pick-up gives a timedelta column,
# and the vectorized .dt accessor converts it to seconds in one pass instead
# of calling a Python lambda once per row.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
print(f'The standard deviation of the trips before we downselect is: {df.duration.std():.2f}')


The standard deviation of the trips before we downselect is: 42.59


# Q3. Dropping outliers
We will filter so that we only keep trips lasting between 1 and 60 minutes (inclusive).
What fraction of the records is left after dropping the outliers?

In [28]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
mask = df.duration.between(1, 60)
total_n = len(mask)       # rows before filtering
filtered_n = mask.sum()   # rows that satisfy the duration bounds

# .copy() detaches the filtered frame from the original so that later column
# assignments on `df` do not raise SettingWithCopyWarning.
df = df[mask].copy()
print(f'percentage left after dropping outliers is {filtered_n/total_n*100:.0f}%')

percentage left after dropping outliers is 98%


# Q4. One hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
- Fit a dictionary vectorizer
- Get a feature matrix from it
   
What's the dimensionality of this matrix (number of columns)?

In [29]:
categorical = ['PULocationID', 'DOLocationID'] # pickup and dropoff location
numerical = ['trip_distance'] # distance of trip

# Cast the location IDs to strings so DictVectorizer one-hot encodes them
# instead of treating them as numbers. assign() builds a new frame rather than
# writing into the filtered slice, which avoids SettingWithCopyWarning.
df = df.assign(**{col: df[col].astype(str) for col in categorical})

# Fit a dictionary vectorizer on the training records and get the feature matrix.
# NOTE: the matrix holds the one-hot columns plus one column per numerical feature.
dv = DictVectorizer()
train_dicts = df[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Each numerical feature occupies exactly one column, so subtracting their
# count leaves the number of one-hot columns (no hard-coded offset).
n_one_hot = X_train.shape[1] - len(numerical)
print(f'no of columns on one hot: {n_one_hot}')

no of columns on one hot: 515


# Q5. Training a model
Now let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters
- Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [30]:
# Target vector: the trip duration (in minutes) of every training row.
target = 'duration'
y_train = df[target].to_numpy()

# Plain linear regression with default parameters; fit() returns the estimator.
lr = LinearRegression().fit(X_train, y_train)

# Score the model on the very data it was trained on.
y_pred = lr.predict(X_train)
train_rmse = root_mean_squared_error(y_true=y_train, y_pred=y_pred)
print(f'RMSE from train data is : {train_rmse:.2f}')


RMSE from train data is : 7.66


# Q6. Evaluating the model
Now let's apply this model to the validation dataset (February 2023).

What's the RMSE on validation?


Now we should check how the model performs on the validation dataset. We will create a function that we can use to preprocess the data.

In [31]:
def read_dataframe(filename, min_duration=1, max_duration=60,
                   categorical=('PULocationID', 'DOLocationID')):
    """Load a trip parquet file and prepare it for modelling.

    Steps:
      1. Read the parquet file.
      2. Add a ``duration`` column (minutes) computed as
         ``tpep_dropoff_datetime - tpep_pickup_datetime``.
      3. Keep only rows whose duration lies in
         ``[min_duration, max_duration]`` (both bounds inclusive).
      4. Cast the ``categorical`` columns to strings so a DictVectorizer
         one-hot encodes them instead of treating them as numbers.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.
    min_duration, max_duration : float, default 1 and 60
        Inclusive duration bounds, in minutes.
    categorical : sequence of str, default ('PULocationID', 'DOLocationID')
        Columns to convert to strings.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame with the extra ``duration`` column.
    """
    trips = pd.read_parquet(filename)

    # Vectorized timedelta -> minutes conversion (no per-row Python lambda).
    trips['duration'] = (
        trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    ).dt.total_seconds() / 60

    # Series.between is inclusive on both ends by default.
    keep = trips.duration.between(min_duration, max_duration)

    # .copy() so the column assignment below acts on an independent frame
    # and does not raise SettingWithCopyWarning.
    trips = trips[keep].copy()

    id_columns = list(categorical)
    trips[id_columns] = trips[id_columns].astype(str)

    return trips

Now we read in the validation data using this function:

In [34]:
# February 2023 trips, preprocessed by read_dataframe, serve as the validation set.
df_val = read_dataframe(filename='./data/yellow_tripdata_2023-02.parquet')

We repeat the one-hot encoding, predict with the fitted model, and calculate the RMSE.

In [35]:
# Encode the validation rows with the vectorizer already fitted on the
# training data -- transform only, no re-fitting.
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Actual durations versus the fitted linear regression's predictions.
y_val = df_val['duration'].to_numpy()
y_pred_val = lr.predict(X_val)

# Validation RMSE.
val_rmse = root_mean_squared_error(y_true=y_val, y_pred=y_pred_val)
print(f'RMSE from validation data is : {val_rmse:.2f}')


RMSE from validation data is : 7.82
