# 1. Importación de librerias

In [2]:
from mapie.metrics import regression_coverage_score, regression_coverage_score_v2, regression_mean_width_score
from mapie.regression import MapieQuantileRegressor

from utils.transformations import ExtendedTransformation, SimpleTransformation
from utils.filters import SimpleFilter
import pandas as pd
import numpy as np

# 2. Preprocesamiento de los datos

In [None]:
df_train = pd.read_csv("data/preprocessed/train_data.csv")
X_train, y_train = df_train.drop(columns=['Price']), df_train[['Price']]
preprocessor = ExtendedTransformation()
filter = SimpleFilter()
preprocessor.fit(X_train, y_train)
X_processed, y_processed = preprocessor.transform(X_train, y_train)
filter.fit(X_processed, y_processed)
X_filtered, y_filtered = filter.transform(X_processed, y_processed)

X shape:  (20974, 40)
bin_vars_columns shape:  (36,)
low_card_columns shape:  37
X shape:  (20974, 40)
X_low_card   shape:  (20974, 113)
X_high_card shape:  (20974, 50)
X_crossed_features shape:  (20974, 6670)
X_EXPANDED shape:  (20974, 6835)
(20974, 6835)
(20974, 4173)
(20974, 3193)


In [52]:
df_test = pd.read_csv("data/preprocessed/test_data.csv")
X_test, y_test = df_test.drop(columns=['Price']), df_test[['Price']]
X_test_proccesed, y_test_proccessed = preprocessor.transform(X_test, y_test)
X_test_filtered, y_test_filtered = filter.transform(X_test_proccesed, y_test_proccessed)

X shape:  (8989, 40)
X_low_card   shape:  (8989, 113)
X_high_card shape:  (8989, 50)
X_crossed_features shape:  (8989, 6670)
X_EXPANDED shape:  (8989, 6835)
(8989, 4173)
(8989, 3193)
(8989, 1635)


# 3. Optimización de hiperparámetros

Optmizamos con el quantile 0.5, Aunque posteriormente apliquemos los quantiles que consideremos para nuestro intervalo

In [53]:
# optimizamos para calcular el quantil medio con mejor precision.

import optuna
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor
import sklearn.model_selection
from sklearn.metrics import mean_pinball_loss, make_scorer
def objective(trial):
    x, y = X_filtered, y_filtered.flatten()

    n_estimators = trial.suggest_int("n_estimators", 10, 500, log=True)
    max_depth = trial.suggest_int("max_depth", 5, 32, log=True)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1, log=True)
    estimator = HistGradientBoostingRegressor(max_iter=n_estimators, 
                                              max_depth=max_depth, 
                                              learning_rate=learning_rate,
                                              loss = "quantile",
                                              quantile = 0.5)

    
    score = make_scorer(mean_pinball_loss, alpha=0.5)
    scoring = sklearn.model_selection.cross_val_score(estimator, x, y, n_jobs=-1, cv=3, scoring=score)

    return scoring.mean()

In [54]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)
print(study.best_trial)

[I 2025-05-06 11:58:58,628] A new study created in memory with name: no-name-345364c9-57bc-448f-ab5b-32a641028910
[I 2025-05-06 11:59:11,258] Trial 0 finished with value: 0.3949820423921732 and parameters: {'n_estimators': 47, 'max_depth': 6, 'learning_rate': 0.0049573256032664735}. Best is trial 0 with value: 0.3949820423921732.
[I 2025-05-06 11:59:21,016] Trial 1 finished with value: 0.4251438011918993 and parameters: {'n_estimators': 30, 'max_depth': 31, 'learning_rate': 0.0014770208427203274}. Best is trial 0 with value: 0.3949820423921732.
[I 2025-05-06 12:00:57,486] Trial 2 finished with value: 0.32247568073489463 and parameters: {'n_estimators': 409, 'max_depth': 8, 'learning_rate': 0.002376759916119798}. Best is trial 2 with value: 0.32247568073489463.
[I 2025-05-06 12:01:12,641] Trial 3 finished with value: 0.4213918840574153 and parameters: {'n_estimators': 68, 'max_depth': 5, 'learning_rate': 0.0010445072241068907}. Best is trial 2 with value: 0.32247568073489463.
[I 2025-05

FrozenTrial(number=6, state=1, values=[0.2542439119432756], datetime_start=datetime.datetime(2025, 5, 6, 12, 1, 57, 41068), datetime_complete=datetime.datetime(2025, 5, 6, 12, 3, 18, 225908), params={'n_estimators': 449, 'max_depth': 13, 'learning_rate': 0.043673940940126735}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=500, log=True, low=10, step=1), 'max_depth': IntDistribution(high=32, log=True, low=5, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=0.001, step=None)}, trial_id=6, value=None)


In [55]:
study.best_trial.params

{'n_estimators': 449, 'max_depth': 13, 'learning_rate': 0.043673940940126735}

In [56]:
estimator_params = study.best_trial.params
estimator_params['loss'] = "quantile"
estimator_params['quantile'] = 0.5
estimator_params['max_iter'] = estimator_params['n_estimators']
del estimator_params['n_estimators']
estimator_params

{'max_depth': 13,
 'learning_rate': 0.043673940940126735,
 'loss': 'quantile',
 'quantile': 0.5,
 'max_iter': 449}

# 4. Configuración del estimador base

In [57]:
estimator = HistGradientBoostingRegressor(**estimator_params)

# 5. Configuración del modelo mappie basado en quantile regressor

In [58]:
alpha = 0.2 # 80% de confianza
quantile_params = {"method": "quantile", "cv": "split", "alpha": alpha}

In [59]:
mapie = MapieQuantileRegressor(estimator, **quantile_params)
mapie.fit(
            X_filtered, 
            y_filtered,
            calib_size=0.3,
            random_state=0
        )


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# 6. Predicción de los datos de test

In [60]:
y_pred, y_pis = mapie.predict(X_test_filtered)

INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.


In [61]:
y_pis[:,0]
preprocessor.inverse_transform(y_pis[:,0])



array([[ 2500000.        ],
       [ 3147154.24520275],
       [ 2600000.        ],
       ...,
       [ 2000000.        ],
       [10500000.        ],
       [ 2727422.17511592]])

In [62]:
# convert to original scale
y_mediam = preprocessor.inverse_transform(y_pred.reshape(-1,1))
y_low = preprocessor.inverse_transform(y_pis[:,0])
y_high = preprocessor.inverse_transform(y_pis[:,1])




# 7. Evaluación de cobertura y longitud media

In [63]:
# evaluamos su cobertura, para ver si realmente en el 80% de los casos el valor real está dentro del intervalo mostrado.
coverage = regression_coverage_score(y_test, y_low, y_high)
mean_width = regression_mean_width_score(y_low, y_high)

print(f"regresion coverage: {coverage}")
print(f"interval mean width: {mean_width}")

regresion coverage: 0.7915229725219713
interval mean width: 18163237.131236825
