In [1]:
!python -V

Python 3.12.9


In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [6]:
import mlflow



In [7]:
# mlflow.set_tracking_uri("sqlite:///mlflow.db")
# mlflow.set_experiment("nyc-taxi-experiment")

In [8]:
mlflow.set_experiment("My_Taxi_DataTalk_Experiment")

2025/05/22 11:53:30 INFO mlflow.tracking.fluent: Experiment with name 'My_Taxi_DataTalk_Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///media/bonisadar/MY%20DATA/Github%20repository/DataTalks/MLOps_2025/notebook/mlruns/871025750041304529', creation_time=1747893210392, experiment_id='871025750041304529', last_update_time=1747893210392, lifecycle_stage='active', name='My_Taxi_DataTalk_Experiment', tags={}>

In [9]:
df = pd.read_parquet('../data/raw/yellow_tripdata_2023-01.parquet')

In [10]:
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.30,1.00,0.5,0.00,0.0,1.0,14.30,2.5,0.00
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.10,1.0,N,43,237,1,7.90,1.00,0.5,4.00,0.0,1.0,16.90,2.5,0.00
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.90,1.00,0.5,15.00,0.0,1.0,34.90,2.5,0.00
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.90,1.0,N,138,7,1,12.10,7.25,0.5,0.00,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.40,1.00,0.5,3.28,0.0,1.0,19.68,2.5,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066761,2,2023-01-31 23:58:34,2023-02-01 00:12:33,,3.05,,,107,48,0,15.80,0.00,0.5,3.96,0.0,1.0,23.76,,
3066762,2,2023-01-31 23:31:09,2023-01-31 23:50:36,,5.80,,,112,75,0,22.43,0.00,0.5,2.64,0.0,1.0,29.07,,
3066763,2,2023-01-31 23:01:05,2023-01-31 23:25:36,,4.67,,,114,239,0,17.61,0.00,0.5,5.32,0.0,1.0,26.93,,
3066764,2,2023-01-31 23:40:00,2023-01-31 23:53:00,,3.15,,,230,79,0,18.15,0.00,0.5,4.43,0.0,1.0,26.58,,


In [11]:
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [12]:
df["PULocationID"].nunique(), df["DOLocationID"].nunique()

(257, 261)

In [13]:
df["PULocationID"].nunique() + df["DOLocationID"].nunique()

518

## Calculating Trip duration in minutes

In [14]:
df["trip_duration_us"] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime

In [15]:
df[['trip_duration_us']]

Unnamed: 0,trip_duration_us
0,0 days 00:08:26
1,0 days 00:06:19
2,0 days 00:12:45
3,0 days 00:09:37
4,0 days 00:10:50
...,...
3066761,0 days 00:13:59
3066762,0 days 00:19:27
3066763,0 days 00:24:31
3066764,0 days 00:13:00


In [16]:
df['trip_duration_us'] = df['trip_duration_us'].dt.total_seconds() / 60

In [17]:
df[['trip_duration_us']]

Unnamed: 0,trip_duration_us
0,8.433333
1,6.316667
2,12.750000
3,9.616667
4,10.833333
...,...
3066761,13.983333
3066762,19.450000
3066763,24.516667
3066764,13.000000


In [18]:
std_yellow_january_2023 = df['trip_duration_us'].std()

In [19]:
std_yellow_january_2023

42.594351241920904

## Removing outliers

In [20]:
df_outliers_removed = df[(df['trip_duration_us'] >= 1) & (df['trip_duration_us'] <= 60)]

In [21]:
print(df_outliers_removed["trip_duration_us"].min())  # should be ≥ 1.0
print(df_outliers_removed["trip_duration_us"].max())  # should be ≤ 60.0

1.0
60.0


In [22]:
remaining_fraction = len(df_outliers_removed) / len(df) * 100

In [23]:
print(f'Remaining fraction = {remaining_fraction:.4f}')

Remaining fraction = 98.1220


## One-hot encoding

In [24]:
X_cat = df_outliers_removed[['PULocationID', 'DOLocationID']].astype(str)

In [25]:
X_dict = X_cat.to_dict(orient='records')

In [26]:
vec = DictVectorizer(sparse=True)

In [27]:
X = vec.fit_transform(X_dict)

In [28]:
print(f"Feature matrix shape: {X.shape}")

Feature matrix shape: (3009173, 515)


In [29]:
n_columns = len(vec.get_feature_names_out())
print(f"Number of columns: {n_columns}")

Number of columns: 515


In [30]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6018346 stored elements and shape (3009173, 515)>

In [31]:
y = df_outliers_removed['trip_duration_us'].values

## Training a simple model

In [32]:
model_lr = LinearRegression()

In [33]:
model_lr.fit(X, y)

In [34]:
y_pred = model_lr.predict(X)

In [35]:
root_mean_squared_error(y, y_pred)

7.649262059978435

# Lesso

In [36]:
model_la = Lasso(0.1)

In [37]:
model_la.fit(X,y)

In [38]:
y_pred = model_la.predict(X)

In [39]:
root_mean_squared_error(y, y_pred)

8.599336126346433

# Logging using MLflow


In [40]:
# Start MLflow run
with mlflow.start_run():

    mlflow.set_tag("developer", "Boni")

    mlflow.log_param("path", "This is the path")

    alpha = 0.01
    mlflow.log_param('alpha', alpha)
    la = Lasso(alpha)
    la.fit(X, y)

    y_pred = la.predict(X)
    rmse = root_mean_squared_error(y, y_pred)
    mlflow.log_metric('rmse', rmse)

## Validation on February data

In [41]:
df_val = pd.read_parquet('../data/raw/yellow_tripdata_2023-02.parquet')

In [42]:
df_val.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.3,1.0,N,142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
1,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
2,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.8,1.0,N,132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0


In [43]:
df_val["trip_duration_us"] = df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime

In [44]:
df_val[['trip_duration_us']]

Unnamed: 0,trip_duration_us
0,0 days 00:01:41
1,0 days 00:00:14
2,0 days 00:00:14
3,0 days 00:32:05
4,0 days 00:13:18
...,...
2913950,0 days 00:19:00
2913951,0 days 00:11:08
2913952,0 days 00:14:00
2913953,0 days 00:07:00


In [45]:
df_val['trip_duration_us'] = df_val['trip_duration_us'].dt.total_seconds() / 60

In [46]:
df_val[['trip_duration_us']]

Unnamed: 0,trip_duration_us
0,1.683333
1,0.233333
2,0.233333
3,32.083333
4,13.300000
...,...
2913950,19.000000
2913951,11.133333
2913952,14.000000
2913953,7.000000


In [47]:
std_yellow_february_2023 = df_val['trip_duration_us'].std()

In [48]:
std_yellow_february_2023

42.84210176105097

## Removing outliers

In [49]:
df_outliers_removed = df_val[(df_val['trip_duration_us'] >= 1) & (df_val['trip_duration_us'] <= 60)]

In [50]:
print(df_outliers_removed["trip_duration_us"].min())  # should be ≥ 1.0
print(df_outliers_removed["trip_duration_us"].max())  # should be ≤ 60.0

1.0
60.0


In [51]:
remaining_fraction = len(df_outliers_removed) / len(df) * 100

In [52]:
print(f'Remaining fraction = {remaining_fraction:.4f}')

Remaining fraction = 93.1258


## One-hot encoding

In [53]:
X_cat = df_outliers_removed[['PULocationID', 'DOLocationID']].astype(str)

In [54]:
X_dict = X_cat.to_dict(orient='records')

In [55]:
X = vec.transform(X_dict)

In [56]:
print(f"Feature matrix shape: {X.shape}")

Feature matrix shape: (2855951, 515)


In [57]:
n_columns = len(vec.get_feature_names_out())
print(f"Number of columns: {n_columns}")

Number of columns: 515


In [58]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5711894 stored elements and shape (2855951, 515)>

In [59]:
y = df_outliers_removed['trip_duration_us'].values

In [60]:
y_pred = model_lr.predict(X)

In [61]:
root_mean_squared_error(y, y_pred)

7.811818442402467