# Training a ride duration prediction model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import root_mean_squared_error
import mlflow
import xgboost as xgb
from hyperopt import hp, tpe, Trials, STATUS_OK, fmin
from hyperopt.pyll import scope



In [2]:
# Track runs in a local SQLite file. To browse them, run this in a terminal
# from the notebook's directory:
#   mlflow ui --backend-store-uri sqlite:///mlflow.db
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Every run started below is filed under this experiment.
mlflow.set_experiment("nyc-taxi-experiment")

2025/08/22 23:40:46 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/08/22 23:40:46 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2025/08/22 23:40:46 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/bastienwinant/Desktop/projects/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1755898846791, experiment_id='1', last_update_time=1755898846791, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
def one_hot_encoding(df, categories, enc=None):
	"""One-hot encode the `categories` columns of `df` and append them to the other columns.

	Parameters
	----------
	df : pandas.DataFrame
		Input frame. It is not modified.
	categories : list of str
		Names of the columns to encode.
	enc : fitted encoder, optional
		Object exposing `transform` (returning something with `.toarray()`)
		and `get_feature_names_out`. When None, a new `OneHotEncoder` is
		fitted on `df[categories]`.

	Returns
	-------
	X : pandas.DataFrame
		The non-encoded columns followed by the encoded columns, on a fresh
		0..n-1 index.
	enc : encoder
		The encoder that was used (the one passed in, or the newly fitted one).
	"""
	# Both halves get the same fresh index so the concat below aligns row by row.
	df_categorical = df[categories].reset_index(drop=True)
	X_numerical = df.drop(categories, axis=1).reset_index(drop=True)

	# Test for None explicitly: a supplied encoder must never be replaced just
	# because the object happens to evaluate as falsy.
	if enc is None:
		enc = OneHotEncoder(dtype=np.int32, handle_unknown='ignore')
		enc.fit(df_categorical)

	X_categorical = pd.DataFrame(
		data=enc.transform(df_categorical).toarray(),
		columns=enc.get_feature_names_out()
	)

	X = pd.concat([X_numerical, X_categorical], axis=1)

	return X, enc

In [4]:
def preprocessing(url, categorical, numerical, enc=None):
	"""Load a trip file and turn it into a feature matrix and a duration target.

	Parameters
	----------
	url : str
		Location of the parquet file, passed to `pd.read_parquet`.
	categorical : list of str
		Columns to cast to `str`. The pickup and dropoff ID columns are joined
		with `.str.cat` below, so 'PULocationID' and 'DOLocationID' need to be
		among them.
	numerical : list of str
		Columns kept unchanged as features.
	enc : fitted encoder, optional
		Forwarded to `one_hot_encoding`; pass the encoder returned for the
		training data to encode another file with the same columns.

	Returns
	-------
	X : numpy.ndarray
		Feature values: the `numerical` columns followed by the encoded
		'PU_DO' columns.
	y : numpy.ndarray
		Trip durations in minutes.
	enc : encoder
		The encoder returned by `one_hot_encoding`.
	"""
	# read in the raw data
	df = pd.read_parquet(url)
	df[categorical] = df[categorical].astype(str)

	# compute the duration of each trip in minutes
	df['duration'] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime) / pd.Timedelta(minutes=1)
	# keep trips lasting 1 to 60 minutes; copy so the column added next is
	# written to an independent frame rather than to a slice of the raw data
	df = df.loc[(df.duration >= 1) & (df.duration <= 60)].copy()

	# a single pickup_dropoff pair feature replaces the two location columns
	df['PU_DO'] = df.PULocationID.str.cat(df.DOLocationID, sep='_')
	encoded_columns = ['PU_DO']

	df = df[encoded_columns + numerical + ['duration']]
	X, enc = one_hot_encoding(df, encoded_columns, enc=enc)

	# the target travels through the encoding step and is split off afterwards
	y = X.duration
	X = X.drop('duration', axis=1)

	return X.values, y.values, enc

In [5]:
# Green-taxi trip files: January 2021 for training, February 2021 for validation.
url_train = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet"
url_val = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet"

In [6]:
# Raw columns handed to preprocessing(): the location IDs are cast to str there,
# trip_distance is kept unchanged as a feature.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

In [7]:
# Fit the encoder on the training file (enc=None), then reuse that same encoder
# for the validation file so both matrices are encoded identically.
X_train, y_train, enc = preprocessing(
	url=url_train, categorical=categorical, numerical=numerical, enc=None)
X_val, y_val, _ = preprocessing(
	url=url_val, categorical=categorical, numerical=numerical, enc=enc)

### Logging Basics

In [8]:
# Baseline: Lasso regression fitted on the training data and scored by RMSE
# on the validation data. `alpha` and `rmse` are logged in the next cell.
alpha = .01
lasso = Lasso(alpha=alpha).fit(X_train, y_train)
y_pred = lasso.predict(X_val)
rmse = root_mean_squared_error(y_val, y_pred)

In [9]:
# Record the baseline as one MLflow run: who ran it, which data and
# hyperparameter were used, and the resulting validation RMSE.
with mlflow.start_run():
	mlflow.set_tag("developer", "bastien winant")
	mlflow.log_params({
		"train-data": url_train,
		"valid-data": url_val,
		"alpha": alpha,
	})
	mlflow.log_metric("rmse", rmse)

### Hyperparameter Tuning

In [10]:
# Wrap the arrays in xgboost DMatrix objects; these are what the xgb.train
# calls in the following cells take as `dtrain` and in `evals`.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [11]:
def objective(params):
	"""Train one XGBoost candidate and report its validation RMSE.

	Used as the hyperopt objective. Reads the module-level `train` and
	`valid` DMatrix objects and the `y_val` target array.

	Parameters
	----------
	params : dict
		XGBoost training parameters sampled from the search space.

	Returns
	-------
	dict
		`{'loss': <validation RMSE>, 'status': STATUS_OK}`, the shape hyperopt
		expects from an objective.
	"""
	# Open the run before training so that the run's start/end times cover the
	# training itself and a trial that crashes still leaves a (failed) run
	# with its parameters behind.
	with mlflow.start_run():
		mlflow.set_tag("model", "xgboost")
		mlflow.log_params(params)

		booster = xgb.train(
			params=params,
			dtrain=train,
			num_boost_round=1000,
			evals=[(valid, "validation")],
			# stop once validation RMSE has not improved for 50 rounds
			early_stopping_rounds=50,
			verbose_eval=300
		)

		y_pred = booster.predict(valid)
		rmse = root_mean_squared_error(y_val, y_pred)
		mlflow.log_metric("rmse", rmse)

	return {'loss': rmse, 'status': STATUS_OK}

In [12]:
# Hyperopt search space for the XGBoost parameters.
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the
# bounds below are exponents, not the parameter values themselves.
search_space = {
	# whole numbers from 4 to 100 (quniform yields floats; scope.int casts them)
	'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
	# exp(-3) .. exp(0)  ~ 0.05 .. 1
	'learning_rate': hp.loguniform('learning_rate', -3, 0),
	# exp(-5) .. exp(-1) ~ 0.007 .. 0.37
	'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
	# exp(-6) .. exp(-1) ~ 0.0025 .. 0.37
	'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
	# exp(-1) .. exp(3)  ~ 0.37 .. 20
	'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
	# fixed for every trial
	'objective': 'reg:squarederror',
	'seed': 42
}

In [None]:
# Run the TPE search: 50 evaluations of `objective` over `search_space`.
# The seeded generator makes the sequence of sampled configurations
# repeatable after a kernel restart (hyperopt >= 0.2.7 expects a
# numpy Generator for `rstate`).
best_result = fmin(
	fn=objective,
	space=search_space,
	algo=tpe.suggest,
	max_evals=50,
	trials=Trials(),
	rstate=np.random.default_rng(42)
)

[0]	validation-rmse:7.57164                           
[160]	validation-rmse:6.52561                         
[0]	validation-rmse:9.38049                                                       
  2%|▏         | 1/50 [07:38<5:54:35, 434.20s/trial, best loss: 6.525614356467439]

### Autologging

In [None]:
# Turn on MLflow's XGBoost autologging so the training call in the cells
# below is recorded without explicit log_* calls.
mlflow.xgboost.autolog()

In [None]:
# Hard-coded configuration for the final model; these values are not read
# from `best_result` (presumably copied from an earlier search run - verify).
# 'objective' and 'seed' repeat the fixed entries of the search space so the
# final model is trained under the same settings as the searched candidates.
best_params = {
	'max_depth': 50,
	'learning_rate': 0.6965655202676473,
	'reg_alpha': 0.09384555564756214,
	'reg_lambda': 0.01593638389716341,
	'min_child_weight': 0.9401045048153336,
	'objective': 'reg:squarederror',
	'seed': 42,
}

In [None]:
# Train the final model with the chosen parameters, using the same round
# budget and early-stopping settings as the search objective.
booster = xgb.train(
	params=best_params,
	dtrain=train,
	num_boost_round=1000,
	evals=[(valid, "validation")],
	# stop once validation RMSE has not improved for 50 rounds
	early_stopping_rounds=50,
	verbose_eval=300
)