# 1. Importación de librerias

In [1]:
from mapie.metrics import regression_coverage_score, regression_coverage_score_v2, regression_mean_width_score
from mapie.regression import MapieQuantileRegressor

from utils.transformations import ExtendedTransformation, SimpleTransformation
from utils.filters import SimpleFilter
import pandas as pd
import numpy as np

# 2. Preprocesamiento de los datos

In [2]:
df_train = pd.read_csv("data/preprocessed/train_data.csv")
X_train, y_train = df_train.drop(columns=['Price']), df_train[['Price']]
preprocessor = ExtendedTransformation()
filter = SimpleFilter()
preprocessor.fit(X_train, y_train)
X_processed, y_processed = preprocessor.transform(X_train, y_train)
filter.fit(X_processed, y_processed)
X_filtered, y_filtered = filter.transform(X_processed, y_processed)

X shape:  (20974, 40)
bin_vars_columns shape:  (36,)
low_card_columns shape:  37
X shape:  (20974, 40)
X_low_card   shape:  (20974, 113)
X_high_card shape:  (20974, 50)
X_crossed_features shape:  (20974, 6670)
X_EXPANDED shape:  (20974, 6835)
(20974, 6835)
(20974, 4173)
(20974, 3193)
(20974, 1635)
(20974, 4173)
(20974, 3193)
(20974, 1635)


In [3]:
df_test = pd.read_csv("data/preprocessed/test_data.csv")
X_test, y_test = df_test.drop(columns=['Price']), df_test[['Price']]
X_test_proccesed, y_test_proccessed = preprocessor.transform(X_test, y_test)
X_test_filtered, y_test_filtered = filter.transform(X_test_proccesed, y_test_proccessed)

X shape:  (8989, 40)
X_low_card   shape:  (8989, 113)
X_high_card shape:  (8989, 50)
X_crossed_features shape:  (8989, 6670)
X_EXPANDED shape:  (8989, 6835)
(8989, 4173)
(8989, 3193)
(8989, 1635)


# 3. Optimización de hiperparámetros

Optmizamos con el quantile 0.5, Aunque posteriormente apliquemos los quantiles que consideremos para nuestro intervalo

In [4]:
# optimizamos para calcular el quantil medio con mejor precision.

import optuna
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor
import sklearn.model_selection
from sklearn.metrics import mean_pinball_loss, make_scorer
def objective(trial):
    x, y = X_filtered, y_filtered.flatten()

    n_estimators = trial.suggest_int("n_estimators", 10, 500, log=True)
    max_depth = trial.suggest_int("max_depth", 5, 32, log=True)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1, log=True)
    estimator = HistGradientBoostingRegressor(max_iter=n_estimators, 
                                              max_depth=max_depth, 
                                              learning_rate=learning_rate,
                                              loss = "quantile",
                                              quantile = 0.5)

    
    score = make_scorer(mean_pinball_loss, alpha=0.5)
    scoring = sklearn.model_selection.cross_val_score(estimator, x, y, n_jobs=-1, cv=3, scoring=score)

    return scoring.mean()

In [5]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)
print(study.best_trial)

[I 2025-05-06 14:45:26,473] A new study created in memory with name: no-name-5149af11-f526-464a-99d2-e55f10f239db
[I 2025-05-06 14:46:56,312] Trial 0 finished with value: 0.3245116894359663 and parameters: {'n_estimators': 419, 'max_depth': 6, 'learning_rate': 0.0023129811373935314}. Best is trial 0 with value: 0.3245116894359663.
[I 2025-05-06 14:47:21,370] Trial 1 finished with value: 0.34253444172440856 and parameters: {'n_estimators': 95, 'max_depth': 22, 'learning_rate': 0.007154349477115201}. Best is trial 0 with value: 0.3245116894359663.
[I 2025-05-06 14:47:39,652] Trial 2 finished with value: 0.30387445519675976 and parameters: {'n_estimators': 70, 'max_depth': 12, 'learning_rate': 0.01859927609869874}. Best is trial 2 with value: 0.30387445519675976.
[I 2025-05-06 14:47:45,305] Trial 3 finished with value: 0.428915028588845 and parameters: {'n_estimators': 15, 'max_depth': 23, 'learning_rate': 0.001146667896676238}. Best is trial 2 with value: 0.30387445519675976.
[I 2025-05-

FrozenTrial(number=6, state=1, values=[0.26514676173260826], datetime_start=datetime.datetime(2025, 5, 6, 14, 48, 55, 888736), datetime_complete=datetime.datetime(2025, 5, 6, 14, 49, 8, 343209), params={'n_estimators': 48, 'max_depth': 13, 'learning_rate': 0.09550623879579195}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=500, log=True, low=10, step=1), 'max_depth': IntDistribution(high=32, log=True, low=5, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=0.001, step=None)}, trial_id=6, value=None)


In [6]:
study.best_trial.params

{'n_estimators': 48, 'max_depth': 13, 'learning_rate': 0.09550623879579195}

In [7]:
estimator_params = study.best_trial.params
estimator_params['loss'] = "quantile"
estimator_params['quantile'] = 0.5
estimator_params['max_iter'] = estimator_params['n_estimators']
del estimator_params['n_estimators']
estimator_params

{'max_depth': 13,
 'learning_rate': 0.09550623879579195,
 'loss': 'quantile',
 'quantile': 0.5,
 'max_iter': 48}

# 4. Configuración del estimador base

In [8]:
estimator = HistGradientBoostingRegressor(**estimator_params)

# 5. Configuración del modelo mappie basado en quantile regressor

In [9]:
alpha = 0.2 # 80% de confianza
quantile_params = {"method": "quantile", "cv": "split", "alpha": alpha}

In [10]:
mapie = MapieQuantileRegressor(estimator, **quantile_params)
mapie.fit(
            X_filtered, 
            y_filtered,
            calib_size=0.3,
            random_state=0
        )


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# 6. Predicción de los datos de test

In [11]:
y_pred, y_pis = mapie.predict(X_test_filtered)

INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.


In [12]:
y_pis[:,0]
preprocessor.inverse_transform(y_pis[:,0])



array([[2616183.78196681],
       [3427044.25554149],
       [2469686.86245631],
       ...,
       [2000000.        ],
       [9500000.        ],
       [2800175.99738977]])

In [13]:
# convert to original scale
y_mediam = preprocessor.inverse_transform(y_pred.reshape(-1,1))
y_low = preprocessor.inverse_transform(y_pis[:,0])
y_high = preprocessor.inverse_transform(y_pis[:,1])




# 7. Evaluación de cobertura y longitud media

In [15]:
# evaluamos su cobertura, para ver si realmente en el 80% de los casos el valor real está dentro del intervalo mostrado.
coverage = regression_coverage_score(y_test, y_low, y_high)
mean_width = regression_mean_width_score(y_low, y_high)

print(f"regresion coverage: {coverage}")
print(f"interval mean width: {mean_width}")

regresion coverage: 0.7970853265101792
interval mean width: 15865322.680892356


# 8. Guardar los modelos para su exportación a un entorno de serving/inferencia

In [16]:
import os
import pickle

CHECKPOINTS_DIR = "checkpoints"
# Save the objects
with open(os.path.join(CHECKPOINTS_DIR, "preprocessor.pkl"), "wb") as f:
    pickle.dump(preprocessor, f)

with open(os.path.join(CHECKPOINTS_DIR, "filter.pkl"), "wb") as f:
    pickle.dump(filter, f)

with open(os.path.join(CHECKPOINTS_DIR, "model_with_intervals.pkl"), "wb") as f:
    pickle.dump(mapie, f)

# 9. comprobar que se pueden recuperar los modelos y ejecutar correctamente

In [20]:
# Load the objects
with open(os.path.join(CHECKPOINTS_DIR, "preprocessor.pkl"), "rb") as f:
    my_preprocessor = pickle.load(f)

with open(os.path.join(CHECKPOINTS_DIR, "filter.pkl"), "rb") as f:
    my_filter = pickle.load(f)

with open(os.path.join(CHECKPOINTS_DIR, "model_with_intervals.pkl"), "rb") as f:
    model_w_intervals = pickle.load(f)

In [None]:
# preprocesamos
X_processed, y_processed = my_preprocessor.transform(X_test, y_test)


X shape:  (8989, 40)
X_low_card   shape:  (8989, 113)
X_high_card shape:  (8989, 50)
X_crossed_features shape:  (8989, 6670)
X_EXPANDED shape:  (8989, 6835)


In [None]:
# filtramos
X_filtered, y_filtered = my_filter.transform(X_processed, y_processed)

(8989, 4173)
(8989, 3193)
(8989, 1635)


In [None]:
# predecimos
pred, intervals = model_w_intervals.predict(X_filtered)

INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.


In [None]:
# transformamos a la escala adecuada.
y_mediam = my_preprocessor.inverse_transform(y_pred.reshape(-1,1))
y_low = my_preprocessor.inverse_transform(y_pis[:,0])
y_high = my_preprocessor.inverse_transform(y_pis[:,1])



In [27]:
# evaluamos su cobertura, para ver si realmente en el 80% de los casos el valor real está dentro del intervalo mostrado.
coverage = regression_coverage_score(y_test, y_low, y_high)
mean_width = regression_mean_width_score(y_low, y_high)

print(f"regresion coverage: {coverage}")
print(f"interval mean width: {mean_width}")

regresion coverage: 0.7970853265101792
interval mean width: 15865322.680892356
