___


# <font color= #8FC3FA> **NYC Taxi Predictions 2025 - Model Experiments 2** </font>
#### <font color= #2E9AFE> `Data Science Project - Homework 5`</font>
- <Strong> Viviana Toledo </Strong>
- <Strong> Fecha: </Strong> 28/10/2025

___

In [2]:
# General Libraries
import pandas as pd
from datetime import datetime

# Databricks Env
from dotenv import load_dotenv
import pickle
import pathlib

# Feature Engineering
from sklearn.feature_extraction import DictVectorizer

# Optimization
import math
import optuna
from optuna.samplers import TPESampler

# Modeling
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# MLFlow
import mlflow
import mlflow.pyfunc
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from mlflow import MlflowClient

# Evaluation
from sklearn.metrics import root_mean_squared_error

# Autolog function
mlflow.sklearn.autolog()

In [3]:
load_dotenv(override=True)  # Carga las variables del archivo .env
EXPERIMENT_NAME = "/Users/viviana.toledo@iteso.mx/nyc-taxi-experiments"

mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# <font color= #8FC3FA> **1. Data Loading** </font>

First of all, we'll start by loading the data:

In [3]:
def read_dataframe(path):
    df = pd.read_parquet(path)
    df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    df[["PULocationID", "DOLocationID"]] = df[["PULocationID", "DOLocationID"]].astype(str)
    return df

The data is stored in .parquet files. Our function defined above helps us handle these types of data.

In [4]:
df_train = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2025-02.parquet')

For modeling, we create a new variable composed of two variables in the dataset:

In [5]:
df_train["PU_DO"] = df_train["PULocationID"] + "_" + df_train["DOLocationID"]
df_val["PU_DO"] = df_val["PULocationID"] + "_" + df_val["DOLocationID"]

# <font color= #8FC3FA> **2. Feature Engineering** </font>

Afterwards, we will proceed to apply feature engineering, which includes dividing features by categorical and numerical types:

In [6]:
def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

In [7]:
# Dictionaries for preprocessing
dv = DictVectorizer()

# Define categorical and numerical variables
categorical = ['PU_DO']
numerical = ['trip_distance']

# Fit DictVectorizer on training data
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Validation
X_val = preprocess(df_val, dv)

And our target variables:

In [8]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

Upload the datasets to mlflow:

In [9]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

# <font color= #8FC3FA> **3. Modeling** </font>

## <font color= #8FC3FA> &ensp; • **Random Forest** </font>

We're going to perform a hyperparameter search for our Random Forest Regressor Model using Optuna. Firstly, we have to define the target function:

In [26]:
# ------------------------------------------------------------
# Definir la función objetivo para Optuna
#    - Recibe un `trial`, que se usa para proponer hiperparámetros.
#    - Entrena un modelo con esos hiperparámetros.
#    - Calcula la métrica de validación (RMSE) y la retorna (Optuna la minimizará).
#    - Abrimos un run anidado de MLflow para registrar cada trial.
# ------------------------------------------------------------

def objective(trial: optuna.trial.Trial):
    # Hiperparámetros MUESTREADOS por Optuna en CADA trial.
    # Nota: usamos log=True para emular rangos log-uniformes (similar a loguniform).
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 30, 150),
        "max_depth": trial.suggest_int("max_depth", 4, 100),
        "min_samples_split": trial.suggest_int("min_samples_split", 40, 200),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 4, 100),
        "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", math.exp(-3), math.exp(-1), log=True),
        "max_features": trial.suggest_int("max_features", 30, 120),
        "ccp_alpha": trial.suggest_float("ccp_alpha",   math.exp(-4), math.exp(-3), log=True),
        "random_state": 42,                      
    }

    # Run anidado para dejar rastro de cada trial en MLflow
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "randomforest")  # etiqueta informativa
        mlflow.log_params(params)                  # registra hiperparámetros del trial

        # Entrenamiento con el conjunto de validación
        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)

        # Predicción y métrica en validación
        y_pred = rf.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Registrar la métrica principal
        mlflow.log_metric("rmse", rmse)

        # La "signature" describe la estructura esperada de entrada y salida del modelo:
        # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
        # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
        signature = infer_signature(X_val, y_pred)

        # Guardar el modelo del trial como artefacto en MLflow.
        mlflow.sklearn.log_model(
            sk_model = rf,
            name="model",
            input_example=X_val[:5],
            signature=signature,
        )

    # Optuna minimiza el valor retornado
    return rmse

Now, we can execute the search and log the models into MLFlow:

In [27]:
mlflow.sklearn.autolog(log_models=False)

# ------------------------------------------------------------
# Crear el estudio de Optuna
#    - Usamos TPE (Tree-structured Parzen Estimator) como sampler.
#    - direction="minimize" porque queremos minimizar el RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Ejecutar la optimización (n_trials = número de intentos)
#    - Cada trial ejecuta la función objetivo con un set distinto de hiperparámetros.
#    - Abrimos un run "padre" para agrupar toda la búsqueda.
# ------------------------------------------------------------
with mlflow.start_run(run_name="RandomForest Hyperparameter Optimization (Optuna)", nested=True):
    study.optimize(objective, n_trials=10)

    # --------------------------------------------------------
    # Recuperar y registrar los mejores hiperparámetros
    # --------------------------------------------------------
    best_params = study.best_params
    # Asegurar tipos/campos fijos (por claridad y consistencia)
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"

    mlflow.log_params(best_params)

    # Etiquetas del run "padre" (metadatos del experimento)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "randomforest",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # 7) Entrenar un modelo FINAL con los mejores hiperparámetros
    #    (normalmente se haría sobre train+val o con CV; aquí mantenemos el patrón original)
    # --------------------------------------------------------

    # Select parameters
    rf = RandomForestRegressor(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        min_samples_split=best_params["min_samples_split"],
        min_samples_leaf=best_params["min_samples_leaf"],
        min_weight_fraction_leaf=best_params["min_weight_fraction_leaf"],
        max_features=best_params["max_features"],
        ccp_alpha=best_params["ccp_alpha"],
        random_state=42
    )
    # Fit the model
    rf.fit(X_train, y_train)

    # Evaluar y registrar la métrica final en validación
    y_pred = rf.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # --------------------------------------------------------
    # 8) Guardar artefactos adicionales (p. ej. el preprocesador)
    # --------------------------------------------------------
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # La "signature" describe la estructura esperada de entrada y salida del modelo:
    # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
    # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
    # Si X_val es la matriz dispersa (scipy.sparse) salida de DictVectorizer:
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)

    # Para que las longitudes coincidan, usa el mismo slice en y_pred
    signature = infer_signature(input_example, y_val[:5])

    # Guardar el modelo del trial como artefacto en MLflow.
    mlflow.sklearn.log_model(
    sk_model=rf,                    # Trained RandomForestRegressor
    name="model",                   # Folder inside MLflow run to store the model
    input_example= input_example,   # First few rows of validation data
    signature=signature,            
)

[I 2025-10-27 14:22:02,396] A new study created in memory with name: no-name-e379721e-8167-441e-8ed8-5b3092b1840c


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:22:14 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:22:16,596] Trial 0 finished with value: 9.064385451624798 and parameters: {'n_estimators': 75, 'max_depth': 96, 'min_samples_split': 157, 'min_samples_leaf': 62, 'min_weight_fraction_leaf': 0.06801937287807802, 'max_features': 44, 'ccp_alpha': 0.01941098011269108}. Best is trial 0 with value: 9.064385451624798.


🏃 View run gregarious-hare-167 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/7ac811e95d7b43f1864c880c0f2bd0bd
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:22:30 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run vaunted-lark-735 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/e3fd60c06f064eec9f1bc92d415bb65c
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792


[I 2025-10-27 14:22:34,169] Trial 1 finished with value: 9.06029456412943 and parameters: {'n_estimators': 134, 'max_depth': 62, 'min_samples_split': 153, 'min_samples_leaf': 5, 'min_weight_fraction_leaf': 0.34639335147622574, 'max_features': 105, 'ccp_alpha': 0.0226485173281456}. Best is trial 1 with value: 9.06029456412943.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:22:47 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:22:49,863] Trial 2 finished with value: 9.045930904269012 and parameters: {'n_estimators': 52, 'max_depth': 21, 'min_samples_split': 88, 'min_samples_leaf': 54, 'min_weight_fraction_leaf': 0.11811341610272008, 'max_features': 56, 'ccp_alpha': 0.03377119342948611}. Best is trial 2 with value: 9.045930904269012.


🏃 View run chill-cod-478 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/de3f430ee62e4eb98950b7a152b3f187
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:23:04 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:23:06,932] Trial 3 finished with value: 9.038064743212965 and parameters: {'n_estimators': 46, 'max_depth': 32, 'min_samples_split': 98, 'min_samples_leaf': 48, 'min_weight_fraction_leaf': 0.23939315532729308, 'max_features': 48, 'ccp_alpha': 0.03063030006982894}. Best is trial 3 with value: 9.038064743212965.


🏃 View run placid-eel-0 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/4340d3f3f4e343e0b99020f1d6201ee1
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:23:21 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:23:23,832] Trial 4 finished with value: 9.044298582129173 and parameters: {'n_estimators': 101, 'max_depth': 8, 'min_samples_split': 137, 'min_samples_leaf': 20, 'min_weight_fraction_leaf': 0.05670477741547259, 'max_features': 116, 'ccp_alpha': 0.04810505725669613}. Best is trial 3 with value: 9.038064743212965.


🏃 View run casual-duck-562 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/8945da67ec164ef6a0741022ed60c8e1
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:23:38 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:23:40,628] Trial 5 finished with value: 9.081505354360715 and parameters: {'n_estimators': 127, 'max_depth': 33, 'min_samples_split': 55, 'min_samples_leaf': 70, 'min_weight_fraction_leaf': 0.12006824223834135, 'max_features': 41, 'ccp_alpha': 0.030052089392406243}. Best is trial 3 with value: 9.038064743212965.


🏃 View run grandiose-carp-187 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/f94d19cee9034a4698a115d68df8f1a8
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:23:53 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:23:56,218] Trial 6 finished with value: 8.924256539537533 and parameters: {'n_estimators': 34, 'max_depth': 92, 'min_samples_split': 81, 'min_samples_leaf': 68, 'min_weight_fraction_leaf': 0.09286784222526208, 'max_features': 77, 'ccp_alpha': 0.031641373693897024}. Best is trial 6 with value: 8.924256539537533.


🏃 View run useful-roo-268 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/8a3a893bf8e94550bded0ad33cda18de
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:24:09 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run trusting-cat-710 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/66e12afee5474db694626bdc34dd9f80
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792


[I 2025-10-27 14:24:13,188] Trial 7 finished with value: 8.986968973033756 and parameters: {'n_estimators': 52, 'max_depth': 98, 'min_samples_split': 164, 'min_samples_leaf': 95, 'min_weight_fraction_leaf': 0.29809432993955604, 'max_features': 84, 'ccp_alpha': 0.04604547586979493}. Best is trial 6 with value: 8.924256539537533.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:24:26 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:24:28,908] Trial 8 finished with value: 9.012455083384186 and parameters: {'n_estimators': 40, 'max_depth': 23, 'min_samples_split': 47, 'min_samples_leaf': 35, 'min_weight_fraction_leaf': 0.10832217175080468, 'max_features': 54, 'ccp_alpha': 0.041950602205837094}. Best is trial 6 with value: 8.924256539537533.


🏃 View run inquisitive-lynx-719 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/3544efdb99c14e3a98a7aa06410e14f8
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:24:41 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:24:44,318] Trial 9 finished with value: 9.063312967428525 and parameters: {'n_estimators': 73, 'max_depth': 31, 'min_samples_split': 127, 'min_samples_leaf': 17, 'min_weight_fraction_leaf': 0.24768288551081558, 'max_features': 36, 'ccp_alpha': 0.049138469238622876}. Best is trial 6 with value: 8.924256539537533.


🏃 View run nebulous-koi-698 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/9eccf9aad50849fb8f51ccc0c2320e8a
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:25:00 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run RandomForest Hyperparameter Optimization (Optuna) at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/ba151bbef11e4745be891e8ace7eec39
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792


## <font color= #8FC3FA> &ensp; • **Gradient Boosting** </font>

Same procedure, we're going to define a hyperparameter search using Optuna for Gradient Boosting's hyperparameters:

In [None]:
# ------------------------------------------------------------
# Definir la función objetivo para Optuna
#    - Recibe un `trial`, que se usa para proponer hiperparámetros.
#    - Entrena un modelo con esos hiperparámetros.
#    - Calcula la métrica de validación (RMSE) y la retorna (Optuna la minimizará).
#    - Abrimos un run anidado de MLflow para registrar cada trial.
# ------------------------------------------------------------

def objective(trial: optuna.trial.Trial):
    # Hiperparámetros MUESTREADOS por Optuna en CADA trial.
    # Nota: usamos log=True para emular rangos log-uniformes (similar a loguniform).
    params = {
        "learning_rate": trial.suggest_float("learning_rate", math.exp(-2), math.exp(2), log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 250),
        "max_depth": trial.suggest_int("max_depth", 4, 100),
        "min_samples_split": trial.suggest_int("min_samples_split", 30, 150),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 4, 100),
        "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", math.exp(-3), math.exp(-1), log=True),
        "max_features": trial.suggest_int("max_features", 20, 150),
        "alpha": trial.suggest_float("alpha",   math.exp(-4), math.exp(-3), log=True),
        "random_state": 42,                      
    }

    # Run anidado para dejar rastro de cada trial en MLflow
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "randomforest")  # etiqueta informativa
        mlflow.log_params(params)                  # registra hiperparámetros del trial

        # Entrenamiento con el conjunto de validación
        rf = GradientBoostingRegressor(**params)
        rf.fit(X_train, y_train)

        # Predicción y métrica en validación
        y_pred = rf.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Registrar la métrica principal
        mlflow.log_metric("rmse", rmse)

        # La "signature" describe la estructura esperada de entrada y salida del modelo:
        # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
        # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
        signature = infer_signature(X_val, y_pred)

        # Guardar el modelo del trial como artefacto en MLflow.
        mlflow.sklearn.log_model(
            sk_model = rf,
            name="model",
            input_example=X_val[:5],
            signature=signature,
        )

    # Optuna minimiza el valor retornado
    return rmse

In [23]:
mlflow.sklearn.autolog(log_models=False)

# ------------------------------------------------------------
# Crear el estudio de Optuna
#    - Usamos TPE (Tree-structured Parzen Estimator) como sampler.
#    - direction="minimize" porque queremos minimizar el RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Ejecutar la optimización (n_trials = número de intentos)
#    - Cada trial ejecuta la función objetivo con un set distinto de hiperparámetros.
#    - Abrimos un run "padre" para agrupar toda la búsqueda.
# ------------------------------------------------------------
with mlflow.start_run(run_name="GradientBoosting Hyperparameter Optimization (Optuna)", nested=True):
    study.optimize(objective, n_trials=10)

    # --------------------------------------------------------
    # Recuperar y registrar los mejores hiperparámetros
    # --------------------------------------------------------
    best_params = study.best_params
    # Asegurar tipos/campos fijos (por claridad y consistencia)
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"

    mlflow.log_params(best_params)

    # Etiquetas del run "padre" (metadatos del experimento)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "gradientboosting",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # 7) Entrenar un modelo FINAL con los mejores hiperparámetros
    #    (normalmente se haría sobre train+val o con CV; aquí mantenemos el patrón original)
    # --------------------------------------------------------

    # Select parameters
    gb = GradientBoostingRegressor(
        learning_rate=best_params["learning_rate"],
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        min_samples_split=best_params["min_samples_split"],
        min_samples_leaf=best_params["min_samples_leaf"],
        min_weight_fraction_leaf=best_params["min_weight_fraction_leaf"],
        max_features=best_params["max_features"],
        alpha=best_params["alpha"],
        random_state=42
    )

    # Fit the model
    gb.fit(X_train, y_train)

    # Evaluar y registrar la métrica final en validación
    y_pred = gb.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # --------------------------------------------------------
    # 8) Guardar artefactos adicionales (p. ej. el preprocesador)
    # --------------------------------------------------------
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # La "signature" describe la estructura esperada de entrada y salida del modelo:
    # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
    # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
    # Si X_val es la matriz dispersa (scipy.sparse) salida de DictVectorizer:
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)

    # Para que las longitudes coincidan, usa el mismo slice en y_pred
    signature = infer_signature(input_example, y_val[:5])

    # Guardar el modelo del trial como artefacto en MLflow.
    mlflow.sklearn.log_model(
    sk_model=gb,                    # Trained GradientBoostingRegressor
    name="model",                   # Folder inside MLflow run to store the model
    input_example= input_example,   # First few rows of validation data
    signature=signature,            
)

[I 2025-10-27 14:11:36,758] A new study created in memory with name: no-name-a251830a-8e32-47ef-9525-4b724ce0d6dc


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:11:51 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run judicious-steed-9 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/2963be72564b49118cce5377a098fdea
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792


[I 2025-10-27 14:11:54,159] Trial 0 finished with value: 6.888353263797699 and parameters: {'learning_rate': 0.88047001533197, 'n_estimators': 241, 'max_depth': 75, 'min_samples_split': 102, 'min_samples_leaf': 19, 'min_weight_fraction_leaf': 0.06801609168822177, 'max_features': 27, 'alpha': 0.043550945979794295}. Best is trial 0 with value: 6.888353263797699.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:12:07 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:12:09,639] Trial 1 finished with value: 2.299739626174814e+16 and parameters: {'learning_rate': 2.7334787317118483, 'n_estimators': 192, 'max_depth': 5, 'min_samples_split': 147, 'min_samples_leaf': 84, 'min_weight_fraction_leaf': 0.07612932206693435, 'max_features': 43, 'alpha': 0.022002581531949287}. Best is trial 0 with value: 6.888353263797699.


🏃 View run unleashed-lamb-685 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/df7528e8bcba4c92aeac5c7b12b02c5d
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:12:21 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:12:24,062] Trial 2 finished with value: 6.769214040737485 and parameters: {'learning_rate': 0.6195333254805603, 'n_estimators': 155, 'max_depth': 45, 'min_samples_split': 65, 'min_samples_leaf': 63, 'min_weight_fraction_leaf': 0.06580810455571189, 'max_features': 58, 'alpha': 0.02641988964868964}. Best is trial 2 with value: 6.769214040737485.


🏃 View run intrigued-boar-845 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/76aad694056343149eccc6f3d5f68c83
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:12:50 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:12:52,270] Trial 3 finished with value: 6.555500839641166 and parameters: {'learning_rate': 1.3235928843718132, 'n_estimators': 207, 'max_depth': 23, 'min_samples_split': 92, 'min_samples_leaf': 61, 'min_weight_fraction_leaf': 0.05463398387490681, 'max_features': 99, 'alpha': 0.021720997136040116}. Best is trial 3 with value: 6.555500839641166.


🏃 View run nebulous-ram-486 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/a7d64bad3b764d60aca14ef4c3673550
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:13:05 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:13:07,807] Trial 4 finished with value: 7.113998564404653 and parameters: {'learning_rate': 0.18735650465552267, 'n_estimators': 240, 'max_depth': 97, 'min_samples_split': 127, 'min_samples_leaf': 33, 'min_weight_fraction_leaf': 0.060527602883403635, 'max_features': 109, 'alpha': 0.02844316178759491}. Best is trial 3 with value: 6.555500839641166.


🏃 View run invincible-slug-118 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/547883e7b2f349178081f2125c7a509e
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:13:20 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:13:22,659] Trial 5 finished with value: 7.8904944717239625 and parameters: {'learning_rate': 0.24912292596156269, 'n_estimators': 149, 'max_depth': 7, 'min_samples_split': 140, 'min_samples_leaf': 29, 'min_weight_fraction_leaf': 0.18731652775154203, 'max_features': 60, 'alpha': 0.03080950666040755}. Best is trial 3 with value: 6.555500839641166.


🏃 View run blushing-dove-921 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/0e66de584c58408f90a2d5213424a5b5
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:13:35 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run exultant-worm-631 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/3e75320f4dc644f7b3f514d827486cb1
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792


[I 2025-10-27 14:13:38,196] Trial 6 finished with value: 9.048138062076204 and parameters: {'learning_rate': 2.082463143527972, 'n_estimators': 87, 'max_depth': 98, 'min_samples_split': 123, 'min_samples_leaf': 95, 'min_weight_fraction_leaf': 0.29809432993955604, 'max_features': 98, 'alpha': 0.04604547586979493}. Best is trial 3 with value: 6.555500839641166.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:13:50 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:13:53,147] Trial 7 finished with value: 8.484712081400259 and parameters: {'learning_rate': 0.21065417108333506, 'n_estimators': 89, 'max_depth': 8, 'min_samples_split': 69, 'min_samples_leaf': 41, 'min_weight_fraction_leaf': 0.0856657711379556, 'max_features': 128, 'alpha': 0.02616724939319006}. Best is trial 3 with value: 6.555500839641166.


🏃 View run glamorous-snail-444 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/4b6d4197105b4f81b6ac719a65f8e87a
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:14:06 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:14:08,637] Trial 8 finished with value: 7.145556647867208 and parameters: {'learning_rate': 0.551381985410781, 'n_estimators': 159, 'max_depth': 17, 'min_samples_split': 127, 'min_samples_leaf': 11, 'min_weight_fraction_leaf': 0.35835680503910405, 'max_features': 121, 'alpha': 0.02234205910288227}. Best is trial 3 with value: 6.555500839641166.


🏃 View run polite-calf-868 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/136a78a84eb14185ac510e57e0b91d77
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:14:21 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-27 14:14:24,172] Trial 9 finished with value: 8.04890174188964 and parameters: {'learning_rate': 0.13912403378084792, 'n_estimators': 213, 'max_depth': 72, 'min_samples_split': 118, 'min_samples_leaf': 78, 'min_weight_fraction_leaf': 0.05773390345224298, 'max_features': 66, 'alpha': 0.020565693809507044}. Best is trial 3 with value: 6.555500839641166.


🏃 View run fearless-loon-617 at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/9b7efb419e6e4b6bb728727ce5a2c534
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/27 14:14:40 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run GradientBoosting Hyperparameter Optimization (Optuna) at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792/runs/7bf182f0bf014c6499f50d5787c0570d
🧪 View experiment at: https://dbc-ce601eea-1bde.cloud.databricks.com/ml/experiments/2243674360324792


# <font color= #8FC3FA> **4. Model Evaluation** </font>

For the experiments we previously ran, our objective is to determine a new challenger model to run against our champion model, which was an XGBoost. For this, we're going to use the metric validation-rmse.

In [5]:
runs = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
    order_by=["metrics.rmse ASC"],
    output_format="list"
)

# Filter for only RandomForest or GradientBoosting models 
challenger_runs = []
for run in runs:
    run_name = run.info.run_name
    if run_name and any(model_type in run_name for model_type in ['RandomForest', 'GradientBoosting']):
        challenger_runs.append(run)

# Get the best run 
if len(challenger_runs) > 0:
    best_run = challenger_runs[0]
    print("Found Challenger Run:")
    print(f"Run ID: {best_run.info.run_id}")
    print(f"Model Type: {best_run.data.params.get('model_type')}")
    print(f"RMSE: {best_run.data.metrics['rmse']}")
    print(f"Params: {best_run.data.params}")
else:
    print("⚠️ No RandomForest or GradientBoosting runs found.")

Found Challenger Run:
Run ID: 8f3b81a764e740ed888c21f24dfaa625
Model Type: None
RMSE: 6.416337650057929
Params: {'alpha': '0.021720997136040116', 'ccp_alpha': '0.0', 'criterion': 'friedman_mse', 'init': 'None', 'learning_rate': '0.8388527762909672', 'loss': 'squared_error', 'max_depth': '23', 'max_features': '99', 'max_leaf_nodes': 'None', 'min_impurity_decrease': '0.0', 'min_samples_leaf': '61', 'min_samples_split': '92', 'min_weight_fraction_leaf': '0.05463398387490681', 'n_estimators': '207', 'n_iter_no_change': 'None', 'objective': 'reg:squarederror', 'random_state': '42', 'seed': '42', 'subsample': '1.0', 'tol': '0.0001', 'validation_fraction': '0.1', 'verbose': '0', 'warm_start': 'False'}


The best model was a GradientBoosting.

# <font color= #8FC3FA> **5. MLFlow Registering** </font>

Now, we have to register the best model to MLFlow:

In [None]:
model_name = "workspace.default.nyc-taxi-model"

run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=f"runs:/{best_run.info.run_id}/model",
    name=model_name
)

Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Created version '5' of model 'workspace.default.nyc-taxi-model'.


In [8]:
client = MlflowClient()

model_version = result.version
new_alias = "Challenger"

client.set_registered_model_alias(
    name=model_name,
    alias=new_alias,
    version=result.version
)

In [9]:
date = datetime.today()

client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=[], creation_timestamp=1761604475852, current_stage=None, deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description=('The model version 5 was transitioned to Challenger on 2025-10-27 '
 '16:35:51.787841'), last_updated_timestamp=1761604552513, metrics=[<Metric: dataset_digest='', dataset_name='', key='rmse', model_id='m-d073db763a764b59b1c93c337bd31e5a', run_id='8f3b81a764e740ed888c21f24dfaa625', step=0, timestamp=1761595560358, value=6.416337650057929>,
 <Metric: dataset_digest='', dataset_name='', key='training_mean_absolute_error', model_id='m-d073db763a764b59b1c93c337bd31e5a', run_id='8f3b81a764e740ed888c21f24dfaa625', step=0, timestamp=1761595556742, value=4.091565974350377>,
 <Metric: dataset_digest='', dataset_name='', key='training_mean_squared_error', model_id='m-d073db763a764b59b1c93c337bd31e5a',