In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Load the yellow_tripdata_2024-01 parquet file directly from its remote URL.
df = pd.read_parquet(
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"
)

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


In [4]:
# Question 1: How many columns are in the DataFrame?
# The column index has one entry per column, so its length is the answer.
len(df.columns)

19

In [5]:
# Question 2: What's the standard deviation of the trips duration in January?
# Duration = dropoff timestamp minus pickup timestamp, converted from seconds to minutes.
df['duration'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)
df['duration'].std()

34.851053592192876

In [6]:
# Question 3: What fraction of the records left after you dropped the outliers?
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive) in an
# independent frame, leaving `df` untouched.
df_1 = df.loc[df['duration'].between(1, 60)].copy()
# Share of rows that survive the filter, expressed as a percentage.
len(df_1) / len(df) * 100

97.78326020432945

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

* Turn the dataframe into a list of dictionaries (remember to re-cast the IDs to strings; otherwise they will be label-encoded)
* Fit a dictionary vectorizer
* Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

In [7]:
# Cast every column whose name contains "id" (case-insensitive) to the pandas
# string dtype. Besides the pickup/dropoff location IDs this also matches
# VendorID and RatecodeID.
id_columns = [name for name in df_1.columns if "id" in name.lower()]
df_1 = df_1.astype({name: "string" for name in id_columns})
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2898906 entries, 0 to 2964623
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               string        
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             string        
 6   store_and_fwd_flag     object        
 7   PULocationID           string        
 8   DOLocationID           string        
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee            floa

In [8]:
# Give the location columns friendlier names and make sure both are strings so
# that each distinct ID is treated as a category.
df_2 = (
    df_1
    .rename(columns={"PULocationID": "pickup_id", "DOLocationID": "dropoff_id"})
    .astype({"pickup_id": "string", "dropoff_id": "string"})
)
data = df_2[['pickup_id', 'dropoff_id']]

# handle_unknown='ignore' lets the fitted encoder transform IDs it has not seen
# during fit without raising.
encoder = OneHotEncoder(handle_unknown='ignore')
feature_matrix = encoder.fit_transform(data)
print(f"Feature matrix dimensions: {feature_matrix.shape[1]}")

Feature matrix dimensions: 518


Now let's use the feature matrix from the previous step to train a model.

* Train a plain linear regression model with default parameters, where duration is the response variable
* Calculate the RMSE of the model on the training data

What's the RMSE on train?

* 3.64
* 7.64
* 11.64
* 16.64

In [9]:
# Features are the one-hot matrix; the target is the trip duration in minutes.
X = feature_matrix
y = df_2['duration'].to_numpy()

In [10]:
# Fit a linear regression on the one-hot features, then score it on the very
# same rows it was trained on (training RMSE).
model = LinearRegression(n_jobs=-1).fit(X, y)
y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
print(f"RMSE score: {rmse}")

RMSE score: 7.946174399033673


In [11]:
# Build the validation set with the same preparation steps used for training:
# rename the location columns, cast them to string, derive the trip duration in
# minutes, and keep only trips lasting between 1 and 60 minutes (inclusive).
# NOTE(review): this file is 2023-02 while the training file loaded earlier is
# 2024-01 -- confirm the year mismatch is intentional.
df_validation = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')
df_validation = df_validation.rename(columns={"PULocationID":"pickup_id","DOLocationID":"dropoff_id"})
for col in ['pickup_id','dropoff_id']:
    df_validation[col] = df_validation[col].astype("string")
df_validation["duration"] = df_validation["tpep_dropoff_datetime"] - df_validation["tpep_pickup_datetime"]
df_validation['duration'] = df_validation['duration'].dt.total_seconds() / 60
df_validation = df_validation[(df_validation['duration'] >= 1) & (df_validation['duration'] <= 60)]
validation_data = df_validation[['pickup_id','dropoff_id']].copy()

# One-hot encoding: reuse the encoder fitted on the training data (transform
# only, no refit) so the validation columns line up with what the model saw.
X_validation = encoder.transform(validation_data)

# Target variable for validation set
y_validation = df_validation['duration'].values

# Predict on the validation data
y_validation_pred = model.predict(X_validation)

# Calculate RMSE on validation set (previously computed twice in a row; once is enough)
rmse_validation = np.sqrt(mean_squared_error(y_validation, y_validation_pred))
print(f"RMSE on validation: {rmse_validation}")

RMSE on validation: 7.816005281596206
