In [12]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error
import pickle

The homework looks into the YELLOW taxi records for January and February of 2023, so we will use those instead.

# Q1. Downloading the data. 
For the January 2023 yellow taxi data, how many columns are there?

In [13]:
# Load the January 2023 yellow-taxi trip records and report how many columns the frame has.
df = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')
print('Number of columns: ', df.shape[1])


Number of columns:  19


# Q2. Computing duration
What is the standard deviation of the trip durations in January?
We calculate the duration of each trip in minutes.

In [14]:
# Trip duration in minutes. Dropoff minus pickup yields a timedelta column; the
# vectorized `.dt.total_seconds()` accessor converts it in one pass instead of
# calling a Python lambda once per row.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
print(f'The standard deviation of the trips before we downselect is: {df.duration.std():.2f}')


The standard deviation of the trips before we downselect is: 42.59


# Q3. Dropping outliers
We will filter so that we only keep trips with a duration between 1 and 60 minutes (inclusive).
What fraction of the records is left after dropping the outliers?

In [15]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive) and
# report what share of the records survives the filter.
mask = df.duration.between(1, 60)
total_n = mask.size
filtered_n = mask.sum()
df = df.loc[mask]
print(f'percentage left after dropping outliers is {filtered_n/total_n*100:.0f}%')

percentage left after dropping outliers is 98%


# Q4. One hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
- Fit a dictionary vectorizer
- Get a feature matrix from it
   
What's the dimensionality of this matrix (number of columns)?

In [None]:
categorical = ['PULocationID', 'DOLocationID']  # pickup and dropoff location
numerical = ['trip_distance']  # distance of trip
# Cast the location IDs to strings so the DictVectorizer one-hot encodes them.
# `astype` with a column -> dtype mapping returns a new frame, so nothing is
# written into the filtered slice of `df` (avoids pandas' SettingWithCopyWarning)
# and re-running the cell gives the same result.
df = df.astype({column: str for column in categorical})


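Now we will do one-hot encoding (turning each category into its own column of 0s and 1s).
In the cell above we cast the categorical columns to strings; below we check that their dtype is now `object`.
The reason to make the categoricals strings is that otherwise the DictVectorizer will treat the integer IDs as ordinary numerical features instead of one-hot encoding them.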

In [None]:
# Inspect the dtypes of the two location-ID columns after the string cast.
df[categorical].dtypes #object

We convert each row to a dictionary, and we will use the dictionary vectorizer to convert the dictionary to just vectors.

In [None]:
# One {column: value} dict per trip, restricted to the categorical and numerical feature columns.
train_dicts = df.loc[:, categorical + numerical].to_dict('records')

In [None]:
# Show the first ten records to sanity-check the dict layout.
train_dicts[:10]

In [None]:
# Learn the feature-name -> column mapping from the training dicts and encode
# them into the feature matrix in the same call.
dv = DictVectorizer()
X_train = dv.fit_transform(X=train_dicts)

we can check the no of columns and the feature names for each column

In [None]:
# First twenty feature names learned by the fitted vectorizer.
dv.feature_names_[:20]

In [None]:
# Width of the encoded matrix. train_dicts was built from categorical + numerical,
# so this count includes the trip_distance column alongside the location-ID columns.
print(f'no of columns on one hot: {X_train.shape[1]}')

Now that we have the X variables, we also need to set the target

In [None]:
# The regression target is the trip duration (in minutes) computed earlier.
target = 'duration'
y_train = df[target].to_numpy()

In [None]:
# Display the target array to confirm it holds the durations.
y_train

Now that we have the x and y for training, we can fit a model to this.
Remember, this is basically:
duration = f(dropoff location, pickup location, trip distance)

We will fit a simple linear regression model:

In [None]:
# Ordinary least squares on the encoded features; the fitted estimator is the cell's output.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

Now we will predict the duration based on the training data

In [None]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
y_pred = lr.predict(X_train)


In [None]:
# Overlay the predicted and actual duration distributions.
# `sns.distplot` is deprecated (slated for removal in seaborn 0.14); seaborn's
# recommended replacement is `histplot` with a KDE drawn on a density scale.
fig, ax = plt.subplots()
sns.histplot(y_pred, bins=50, kde=True, stat='density', label='prediction', ax=ax)
sns.histplot(y_train, bins=50, kde=True, stat='density', label='actual', ax=ax)
ax.set(
    xlabel='duration (minutes)',
    ylabel='density',
    title='Predicted vs. actual trip duration (training data)',
)
ax.legend();

The predicted and the actual values look pretty different. This means that the model is probably not particularly good. We can calculate the performance of this model using the root mean squared error (RMSE).

In [None]:
# RMSE of the linear model on the data it was trained on (same units as the target: minutes).
train_rmse = root_mean_squared_error(y_true=y_train, y_pred=y_pred)
print(f'RMSE from train data is : {train_rmse:.2f}')

We should check now how it performs with the validation dataset. We will create a function that we can use to preprocess the data

In [None]:
def read_dataframe(filename):
    """Load a taxi trip-record parquet file and prepare it for modelling.

    The pickup/dropoff timestamp columns are detected automatically, so both
    yellow-taxi files (``tpep_*`` columns) and green-taxi files (``lpep_*``
    columns) are supported.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The trips whose duration is between 1 and 60 minutes (inclusive), with
        an added ``duration`` column in minutes and ``PULocationID`` /
        ``DOLocationID`` cast to strings.

    Raises
    ------
    KeyError
        If the file has neither the ``tpep_*`` nor the ``lpep_*`` timestamp columns.
    """
    df = pd.read_parquet(filename)  # read data

    # Yellow-taxi files name their timestamps tpep_*, green-taxi files lpep_*.
    for prefix in ('lpep', 'tpep'):
        pickup = f'{prefix}_pickup_datetime'
        dropoff = f'{prefix}_dropoff_datetime'
        if pickup in df.columns and dropoff in df.columns:
            break
    else:
        raise KeyError(
            'expected lpep_* or tpep_* pickup/dropoff datetime columns in '
            f'{filename!r}'
        )

    # Duration in minutes, computed with the vectorized timedelta accessor.
    df['duration'] = (df[dropoff] - df[pickup]).dt.total_seconds() / 60

    # Select only plausible trips; .copy() makes the result an independent frame
    # so the assignment below does not write into a slice of the full frame.
    mask = ((df.duration >= 1) & (df.duration <= 60))
    df = df[mask].copy()

    categorical = ['PULocationID', 'DOLocationID']  # pickup and dropoff location
    # Strings, so the DictVectorizer one-hot encodes the IDs instead of treating them as numbers.
    df[categorical] = df[categorical].astype(str)

    return df

Now we read in both the training and the validation data using this function:

In [None]:
# January is the training month, February the validation month.
# NOTE(review): these are the green-taxi files, while the notebook intro says the
# yellow-taxi records are used - confirm which dataset is intended here.
df_train, df_val = (
    read_dataframe(path)
    for path in (
        './data/green_tripdata_2023-01.parquet',
        './data/green_tripdata_2023-02.parquet',
    )
)

We check the length of each dataframe

In [None]:
# Row counts of the prepared training and validation frames.
len(df_train),len(df_val)

We repeat the one-hotting and calculate RMSE. Note that it got worse, and if looking at the video, lasso and ridge models actually do not fare any better.

In [None]:
# Baseline: separate pickup/dropoff IDs plus trip distance, fitted on the
# training frame and scored on both the training and validation frames.
categorical = ['PULocationID', 'DOLocationID']  # pickup and dropoff location
numerical = ['trip_distance']  # distance of trip

# Row dicts for both splits.
train_dicts = df_train.loc[:, categorical + numerical].to_dict('records')
val_dicts = df_val.loc[:, categorical + numerical].to_dict('records')

# The vectorizer is fitted on the training data only; validation rows are
# encoded with that same fitted mapping.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

y_train = df_train['duration'].to_numpy()
y_val = df_val['duration'].to_numpy()

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
y_pred_val = lr.predict(X_val)

train_rmse = root_mean_squared_error(y_true=y_train, y_pred=y_pred_train)
val_rmse = root_mean_squared_error(y_true=y_val, y_pred=y_pred_val)

print(f'RMSE from train data is : {train_rmse:.2f}')
print(f'RMSE from validation data is : {val_rmse:.2f}')


To improve things, we combine the dropoff/pickup locations into one variable and repeat the process

In [None]:
# Same pipeline as the baseline, but with the pickup and dropoff IDs merged into
# a single "<pickup>_<dropoff>" feature so the model sees each route as one category.
print('combining pickup location ID and dropoff location ID...')
# `.assign` returns new frames instead of writing a column into the filtered
# frames returned by read_dataframe, which avoids pandas' SettingWithCopyWarning
# and keeps the cell safe to re-run.
df_train = df_train.assign(PU_DO=df_train['PULocationID'] + '_' + df_train['DOLocationID'])
df_val = df_val.assign(PU_DO=df_val['PULocationID'] + '_' + df_val['DOLocationID'])

dv = DictVectorizer()
lr = LinearRegression()
categorical = ['PU_DO']  # combined pickup and dropoff location
numerical = ['trip_distance']  # distance of trip

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)  # no need to fit since it is already fitted on the training data

y_train = df_train['duration'].values
y_val = df_val['duration'].values

lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
y_pred_val = lr.predict(X_val)

train_rmse = root_mean_squared_error(y_train, y_pred_train)
val_rmse = root_mean_squared_error(y_val, y_pred_val)

print(f'RMSE from train data is : {train_rmse:.2f}')
print(f'RMSE from validation data is : {val_rmse:.2f}')

Linear regression works best for the data. Lasso and ridge tend to fare same or worse.

In [None]:
# L1-regularised linear model on the same features, with a small penalty (alpha=0.001).
lr_lasso = Lasso(alpha=0.001)
lr_lasso.fit(X=X_train, y=y_train)

# Score on both splits.
y_pred_train = lr_lasso.predict(X=X_train)
y_pred_val = lr_lasso.predict(X=X_val)

train_rmse = root_mean_squared_error(y_true=y_train, y_pred=y_pred_train)
val_rmse = root_mean_squared_error(y_true=y_val, y_pred=y_pred_val)

print(f'RMSE from train data using Lasso is : {train_rmse:.2f}')
print(f'RMSE from validation data using Lasso is : {val_rmse:.2f}')

In [None]:
# L2-regularised linear model on the same features, with a small penalty (alpha=0.001).
lr_ridge = Ridge(alpha=0.001)
lr_ridge.fit(X=X_train, y=y_train)

# Score on both splits.
y_pred_train = lr_ridge.predict(X=X_train)
y_pred_val = lr_ridge.predict(X=X_val)

train_rmse = root_mean_squared_error(y_true=y_train, y_pred=y_pred_train)
val_rmse = root_mean_squared_error(y_true=y_val, y_pred=y_pred_val)

print(f'RMSE from train data using Ridge is : {train_rmse:.2f}')
print(f'RMSE from validation data using Ridge is : {val_rmse:.2f}')

Note that lasso takes the longest because we're looking at 60 rows.

Now we can save the fitted DictVectorizer together with the linear regression model using pickle, so both can be reloaded later.

In [None]:
# Persist the fitted vectorizer and the linear model together as one tuple, so
# the same encoding can be reapplied when the model is loaded.
# open() does not create missing directories: ./models must already exist.
with open('./models/lin_reg.bin', mode='wb') as f_out:
    pickle.Pickler(f_out).dump((dv, lr))