In [1]:
# Smoke test — presumably only confirms that the kernel executes code; verify before removing.
print("Hello")

Hello


In [2]:
! pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (42.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-20.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [1]:
# Reading a parquet file
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

# Pull the January 2023 yellow-taxi trip records directly from the public URL
taxi_df = pd.read_parquet(
    r'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
)

# 1. Shape as (rows, columns), followed by the column names
print(taxi_df.shape, taxi_df.columns, sep='\n')

(3066766, 19)
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')


In [2]:
# Trip duration: drop-off time minus pick-up time, stored as a Timedelta column
pickup_times = taxi_df['tpep_pickup_datetime']
dropoff_times = taxi_df['tpep_dropoff_datetime']
taxi_df['duration'] = dropoff_times - pickup_times
display(taxi_df.head())

# 2. Standard deviation of the trip duration, converted from seconds to minutes
duration_std_seconds = taxi_df['duration'].dt.total_seconds().std()
print(duration_std_seconds / 60)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,0 days 00:08:26
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,0 days 00:06:19
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,0 days 00:12:45
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,0 days 00:09:37
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,0 days 00:10:50


42.594351241954534


- `OneHotEncoder` vs. `pd.get_dummies` vs. `DictVectorizer` (easier to deploy for MLOps)
- https://github.com/fonsecagabriella/data_science/blob/main/00_general_notes/encoding_summary.ipynb

In [3]:
# Remove duration outliers: keep only trips lasting strictly more than 1 minute
# and strictly less than 60 minutes.
# The mask is built once from a single frame (the original mixed `taxi_df` and
# `taxi_df_new` in the same condition), and .copy() is taken AFTER filtering so
# that later column assignments on `taxi_df_new` write to an independent frame
# instead of a filtered slice (which raises SettingWithCopyWarning).
duration_minutes = taxi_df['duration'].dt.total_seconds() / 60
taxi_df_new = taxi_df[(duration_minutes > 1) & (duration_minutes < 60)].copy()

# 3. Rows kept, rows before filtering, and the percentage of rows kept
print(taxi_df_new.shape[0])
print(taxi_df.shape[0])
print((taxi_df_new.shape[0] / taxi_df.shape[0]) * 100)

3008849
3066766
98.11146334607858


In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

# One-hot encode the pickup and dropoff location IDs.
# The IDs are re-cast to strings first so that DictVectorizer encodes each ID
# as its own indicator column rather than passing it through as a number.
categorical = ['PULocationID', 'DOLocationID']
for column in categorical:
    taxi_df_new[column] = taxi_df_new[column].astype(str)

# DictVectorizer takes a list of dictionaries (one per row, keyed by column
# name) and returns a one-hot encoded matrix. With sparse=True only the
# non-zero entries are stored, which keeps memory manageable when there are
# millions of rows and hundreds of indicator columns.
dv = DictVectorizer(sparse=True)  # sparse=False would return a dense matrix instead

# Feature matrix: to_dict(orient='records') yields one {column: value}
# dictionary per row, which is the input format DictVectorizer expects.
X = dv.fit_transform(
    taxi_df_new[categorical].to_dict(orient='records')
)

# Target variable: trip duration in minutes
y = taxi_df_new['duration'].dt.total_seconds() / 60

In [None]:
# 4. Dimensions of the feature matrix, reported as (rows, columns)
print(X.shape)

(3008849, 515)


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit a Linear Regression with default parameters, using the one-hot feature
# matrix as input and the trip duration in minutes as the target.
model = LinearRegression().fit(X, y)

# 5. RMSE of the model on the training data
# RMSE is the square root of the mean squared difference between predicted and
# actual values; here it measures how well the model fits the data it was
# trained on, not how well it generalises.
y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(rmse)

7.6475120377560195


In [7]:
# Test data: the February 2023 yellow-taxi trip records
test_url = r'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
test_df = pd.read_parquet(test_url)
display(test_df.head(n=5))

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.3,1.0,N,142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
1,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
2,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.8,1.0,N,132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0


### Why Do We Use `transform()` Instead of `fit_transform()` on Test Data?

When evaluating a machine learning model on test data, it is critical to avoid **data leakage** — that is, letting information from the test set influence the training process.

#### ✅ `fit_transform()`:
- This method **fits** the vectorizer (learns the feature mapping) and then **transforms** the data.
- If we run `fit_transform()` on the test data, we allow it to learn a new feature space based on unseen data, which **violates the principles of proper model evaluation**.
- This leads to **inconsistent feature mappings** between training and testing data, potentially causing incorrect or misleading results.

#### ✅ `transform()`:
- This method uses the **already learned feature mapping** from the training data.
- It ensures the model evaluates test data **under the same conditions** as it was trained.
- This is essential for a **fair and valid evaluation** of model performance.

In short, we use:

- `fit_transform()` on training data
- `transform()` on test data

to ensure the model is evaluated realistically and without bias.


Always call `.copy()` after filtering a DataFrame if you plan to modify it later.

In [12]:
# Compute trip duration (drop-off minus pick-up) as a Timedelta column
test_df['duration'] = test_df['tpep_dropoff_datetime'] - test_df['tpep_pickup_datetime']

# Filter unreasonable durations: keep trips lasting strictly between 1 and 60
# minutes, the same rule applied to the training data.
# .copy() is called AFTER the boolean filter so the column assignments below
# write to an independent frame rather than a filtered slice (copying before
# the filter, as the cell originally did, does not prevent
# SettingWithCopyWarning).
test_minutes = test_df['duration'].dt.total_seconds() / 60
test_df = test_df[(test_minutes > 1) & (test_minutes < 60)].copy()

# Convert location IDs to strings so they are encoded the same way as in training
test_df['PULocationID'] = test_df['PULocationID'].astype(str)
test_df['DOLocationID'] = test_df['DOLocationID'].astype(str)

# Transform features using the already-fitted DictVectorizer (transform, not
# fit_transform, so the columns line up with what the model was trained on)
X_test = dv.transform(test_df[['PULocationID', 'DOLocationID']].to_dict(orient='records'))

# Extract target values: trip duration in minutes
y_test = test_df['duration'].dt.total_seconds() / 60

# Predict and calculate RMSE on the held-out month
y_pred_test = model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print('Test RMSE:', rmse_test)

Test RMSE: 7.808398466759152
