___


# <font color= #8FC3FA> **NYC Taxi Predictions 2025 - Model Experiments 2** </font>
#### <font color= #2E9AFE> `Data Science Project - Homework 5`</font>
- <Strong> Viviana Toledo </Strong>
- <Strong> Fecha: </Strong> 28/10/2025

___

In [14]:
# General Libraries
import pandas as pd
from datetime import datetime

# Databricks Env
from dotenv import load_dotenv
import pickle
import pathlib

# Feature Engineering
from sklearn.feature_extraction import DictVectorizer

# Optimization
import math
import optuna
from optuna.samplers import TPESampler

# Modeling
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# MLFlow
import mlflow
import mlflow.pyfunc
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from mlflow import MlflowClient

# Evaluation
from sklearn.metrics import root_mean_squared_error

# Autolog function
mlflow.sklearn.autolog()

In [None]:
load_dotenv(override=True)  # Carga las variables del archivo .env
EXPERIMENT_NAME = "/Users/viviana.toledo@iteso.mx/nyc-taxi-experiments"

mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# <font color= #8FC3FA> **1. Data Loading** </font>

First of all, we'll start by loading the data:

In [2]:
def read_dataframe(path):
    df = pd.read_parquet(path)
    df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    df[["PULocationID", "DOLocationID"]] = df[["PULocationID", "DOLocationID"]].astype(str)
    return df

df_train = read_dataframe("../data/green_tripdata_2025-01.parquet")
df_val = read_dataframe("../data/green_tripdata_2025-02.parquet")

df_train["PU_DO"] = df_train["PULocationID"] + "_" + df_train["DOLocationID"]
df_val["PU_DO"] = df_val["PULocationID"] + "_" + df_val["DOLocationID"]

categorical = ["PU_DO"]
numerical = ["trip_distance"]

dv = DictVectorizer()
X_train = dv.fit_transform(df_train[categorical + numerical].to_dict(orient="records"))
X_val = dv.transform(df_val[categorical + numerical].to_dict(orient="records"))

y_train = df_train["duration"].values
y_val = df_val["duration"].values

The data is stored in .parquet files. Our function defined above helps us handle these types of data.

In [3]:
df_train = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2025-02.parquet')

# <font color= #8FC3FA> **2. Feature Engineering** </font>

Afterwards, we will proceed to apply feature engineering, which includes dividing features by categorical and numerical types:

In [7]:
def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

In [None]:
# Define categorical and numerical values in our data
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Dictionaries for preprocessing
dv = DictVectorizer()
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

# Train and Evaluation
X_train = dv.fit_transform(train_dicts)
X_val = preprocess(df_val, dv)

# Define targets
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

# <font color= #8FC3FA> **3. Modeling** </font>

## <font color= #8FC3FA> &ensp; • **Random Forest** </font>

We're going to perform a hyperparameter search for our Random Forest Regressor Model using Optuna. Firstly, we have to define the target function:

In [17]:
# ------------------------------------------------------------
# Definir la función objetivo para Optuna
#    - Recibe un `trial`, que se usa para proponer hiperparámetros.
#    - Entrena un modelo con esos hiperparámetros.
#    - Calcula la métrica de validación (RMSE) y la retorna (Optuna la minimizará).
#    - Abrimos un run anidado de MLflow para registrar cada trial.
# ------------------------------------------------------------

def objective(trial: optuna.trial.Trial):
    # Hiperparámetros MUESTREADOS por Optuna en CADA trial.
    # Nota: usamos log=True para emular rangos log-uniformes (similar a loguniform).
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 4, 100),
        "max_depth": trial.suggest_int("max_depth", 4, 100),
        "min_samples_split": trial.suggest_int("min_samples_split", 4, 100),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 4, 100),
        "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", math.exp(-3), math.exp(-1), log=True),
        "max_features": trial.suggest_int("max_features", 4, 100),
        "ccp_alpha": trial.suggest_float("ccp_alpha",   math.exp(-3), math.exp(-3), log=True),
        "random_state": 42,                      
    }

    # Run anidado para dejar rastro de cada trial en MLflow
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "randomforest")  # etiqueta informativa
        mlflow.log_params(params)                  # registra hiperparámetros del trial

        # Entrenamiento con el conjunto de validación
        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)

        # Predicción y métrica en validación
        y_pred = rf.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Registrar la métrica principal
        mlflow.log_metric("rmse", rmse)

        # La "signature" describe la estructura esperada de entrada y salida del modelo:
        # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
        # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
        signature = infer_signature(X_val, y_pred)

        # Guardar el modelo del trial como artefacto en MLflow.
        mlflow.sklearn.log_model(
            sk_model = rf,
            name="model",
            input_example=X_val[:5],
            signature=signature,
            registered_model_name='RandomForestRegressor'
        )

    # Optuna minimiza el valor retornado
    return rmse

Now, we can execute the search and log the models into MLFlow:

In [18]:
mlflow.sklearn.autolog(log_models=False)

# ------------------------------------------------------------
# Crear el estudio de Optuna
#    - Usamos TPE (Tree-structured Parzen Estimator) como sampler.
#    - direction="minimize" porque queremos minimizar el RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Ejecutar la optimización (n_trials = número de intentos)
#    - Cada trial ejecuta la función objetivo con un set distinto de hiperparámetros.
#    - Abrimos un run "padre" para agrupar toda la búsqueda.
# ------------------------------------------------------------
with mlflow.start_run(run_name="RandomForest Hyperparameter Optimization (Optuna)", nested=True):
    study.optimize(objective, n_trials=10)

    # --------------------------------------------------------
    # Recuperar y registrar los mejores hiperparámetros
    # --------------------------------------------------------
    best_params = study.best_params
    # Asegurar tipos/campos fijos (por claridad y consistencia)
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"

    mlflow.log_params(best_params)

    # Etiquetas del run "padre" (metadatos del experimento)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "randomforest",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # 7) Entrenar un modelo FINAL con los mejores hiperparámetros
    #    (normalmente se haría sobre train+val o con CV; aquí mantenemos el patrón original)
    # --------------------------------------------------------

    # Select parameters
    rf = RandomForestRegressor(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        min_samples_split=best_params["min_samples_split"],
        min_samples_leaf=best_params["min_samples_leaf"],
        min_weight_fraction_leaf=best_params["min_weight_fraction_leaf"],
        max_features=best_params["max_features"],
        ccp_alpha=best_params["ccp_alpha"],
        random_state=42
    )
    # Fit the model
    rf.fit(X_train, y_train)

    # Evaluar y registrar la métrica final en validación
    y_pred = rf.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # --------------------------------------------------------
    # 8) Guardar artefactos adicionales (p. ej. el preprocesador)
    # --------------------------------------------------------
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # La "signature" describe la estructura esperada de entrada y salida del modelo:
    # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
    # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
    # Si X_val es la matriz dispersa (scipy.sparse) salida de DictVectorizer:
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)

    # Para que las longitudes coincidan, usa el mismo slice en y_pred
    signature = infer_signature(input_example, y_val[:5])

    # Guardar el modelo del trial como artefacto en MLflow.
    mlflow.sklearn.log_model(
    sk_model=rf,                    # Trained RandomForestRegressor
    name="model",          # Folder inside MLflow run to store the model
    input_example= input_example,   # First few rows of validation data
    signature=signature,            
)

[I 2025-10-23 21:12:02,006] A new study created in memory with name: no-name-de9bd26a-fa80-4d80-9aff-f1733544a63f


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'RandomForestRegressor' already exists. Creating a new version of this model...
Created version '5' of model 'RandomForestRegressor'.
[I 2025-10-23 21:12:07,639] Trial 0 finished with value: 8.949023598631308 and parameters: {'n_estimators': 40, 'max_depth': 96, 'min_samples_split': 75, 'min_samples_leaf': 62, 'min_weight_fraction_leaf': 0.06801937287807802, 'max_features': 19, 'ccp_alpha': 0.049787068367863944}. Best is trial 0 with value: 8.949023598631308.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'RandomForestRegressor' already exists. Creating a new version of this model...
Created version '6' of model 'RandomForestRegressor'.
[I 2025-10-23 21:12:11,698] Trial 1 finished with value: 7.747331770221519 and parameters: {'n_estimators': 9, 'max_depth': 88, 'min_samples_split': 62, 'min_samples_leaf': 72, 'min_weight_fraction_leaf': 0.05187952831569513, 'max_features': 98, 'ccp_alpha': 0.049787068367863944}. Best is trial 1 with value: 7.747331770221519.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'RandomForestRegressor' already exists. Creating a new version of this model...
Created version '7' of model 'RandomForestRegressor'.
[I 2025-10-23 21:12:16,208] Trial 2 finished with value: 8.581682967664413 and parameters: {'n_estimators': 84, 'max_depth': 24, 'min_samples_split': 21, 'min_samples_leaf': 21, 'min_weight_fraction_leaf': 0.0914909229749673, 'max_features': 54, 'ccp_alpha': 0.049787068367863944}. Best is trial 1 with value: 7.747331770221519.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'RandomForestRegressor' already exists. Creating a new version of this model...
Created version '8' of model 'RandomForestRegressor'.
[I 2025-10-23 21:12:20,519] Trial 3 finished with value: 8.834659524430556 and parameters: {'n_estimators': 45, 'max_depth': 32, 'min_samples_split': 63, 'min_samples_leaf': 17, 'min_weight_fraction_leaf': 0.08930384785648966, 'max_features': 39, 'ccp_alpha': 0.049787068367863944}. Best is trial 1 with value: 7.747331770221519.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'RandomForestRegressor' already exists. Creating a new version of this model...
Created version '9' of model 'RandomForestRegressor'.
[I 2025-10-23 21:12:24,815] Trial 4 finished with value: 9.105995136049174 and parameters: {'n_estimators': 48, 'max_depth': 80, 'min_samples_split': 23, 'min_samples_leaf': 53, 'min_weight_fraction_leaf': 0.162810087911367, 'max_features': 8, 'ccp_alpha': 0.049787068367863944}. Best is trial 1 with value: 7.747331770221519.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'RandomForestRegressor' already exists. Creating a new version of this model...
Created version '10' of model 'RandomForestRegressor'.
[I 2025-10-23 21:12:29,839] Trial 5 finished with value: 8.3719030013579 and parameters: {'n_estimators': 62, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 96, 'min_weight_fraction_leaf': 0.34344237703028857, 'max_features': 82, 'ccp_alpha': 0.049787068367863944}. Best is trial 1 with value: 7.747331770221519.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'RandomForestRegressor' already exists. Creating a new version of this model...
Created version '11' of model 'RandomForestRegressor'.
[I 2025-10-23 21:12:34,727] Trial 6 finished with value: 8.332003742664186 and parameters: {'n_estimators': 33, 'max_depth': 13, 'min_samples_split': 70, 'min_samples_leaf': 46, 'min_weight_fraction_leaf': 0.06355030192906976, 'max_features': 52, 'ccp_alpha': 0.049787068367863944}. Best is trial 1 with value: 7.747331770221519.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'RandomForestRegressor' already exists. Creating a new version of this model...
Created version '12' of model 'RandomForestRegressor'.
[I 2025-10-23 21:12:39,110] Trial 7 finished with value: 8.16332419093898 and parameters: {'n_estimators': 7, 'max_depth': 92, 'min_samples_split': 29, 'min_samples_leaf': 68, 'min_weight_fraction_leaf': 0.09286784222526208, 'max_features': 54, 'ccp_alpha': 0.049787068367863944}. Best is trial 1 with value: 7.747331770221519.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'RandomForestRegressor' already exists. Creating a new version of this model...
Created version '13' of model 'RandomForestRegressor'.
[I 2025-10-23 21:12:43,666] Trial 8 finished with value: 8.248072363504829 and parameters: {'n_estimators': 57, 'max_depth': 21, 'min_samples_split': 98, 'min_samples_leaf': 79, 'min_weight_fraction_leaf': 0.3259529879125869, 'max_features': 90, 'ccp_alpha': 0.049787068367863944}. Best is trial 1 with value: 7.747331770221519.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'RandomForestRegressor' already exists. Creating a new version of this model...
Created version '14' of model 'RandomForestRegressor'.
[I 2025-10-23 21:12:48,263] Trial 9 finished with value: 8.778813684686636 and parameters: {'n_estimators': 61, 'max_depth': 93, 'min_samples_split': 12, 'min_samples_leaf': 23, 'min_weight_fraction_leaf': 0.05450049895708781, 'max_features': 35, 'ccp_alpha': 0.049787068367863944}. Best is trial 1 with value: 7.747331770221519.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

