In [2]:
pip install mapie


Collecting mapie
  Downloading MAPIE-0.9.2-py3-none-any.whl.metadata (12 kB)
Downloading MAPIE-0.9.2-py3-none-any.whl (178 kB)
Installing collected packages: mapie
Successfully installed mapie-0.9.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
from mapie.metrics import regression_coverage_score, regression_coverage_score_v2, regression_mean_width_score
from mapie.regression import MapieQuantileRegressor

from utils.transformations import ExtendedTransformation, SimpleTransformation
from utils.filters import SimpleFilter
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv("data/preprocessed/train_data.csv")
X_train, y_train = df_train.drop(columns=['Price']), df_train[['Price']]
preprocessor = ExtendedTransformation()
filter = SimpleFilter()
preprocessor.fit(X_train, y_train)
X_processed, y_processed = preprocessor.transform(X_train, y_train)
filter.fit(X_processed, y_processed)
X_filtered, y_filtered = filter.transform(X_processed, y_processed)

X shape:  (20974, 40)
bin_vars_columns shape:  (36,)
low_card_columns shape:  37
X shape:  (20974, 40)
X_low_card   shape:  (20974, 113)
X_high_card shape:  (20974, 50)
X_crossed_features shape:  (20974, 6670)
X_EXPANDED shape:  (20974, 6835)
(20974, 6835)
(20974, 4173)
(20974, 3193)
(20974, 1635)
(20974, 4173)
(20974, 3193)
(20974, 1635)


In [3]:
df_test = pd.read_csv("data/preprocessed/test_data.csv")
X_test, y_test = df_test.drop(columns=['Price']), df_test[['Price']]
X_test_proccesed, y_test_proccessed = preprocessor.transform(X_test, y_test)
X_test_filtered, y_test_filtered = filter.transform(X_test_proccesed, y_test_proccessed)

X shape:  (8989, 40)
X_low_card   shape:  (8989, 113)
X_high_card shape:  (8989, 50)
X_crossed_features shape:  (8989, 6670)
X_EXPANDED shape:  (8989, 6835)
(8989, 4173)
(8989, 3193)
(8989, 1635)


In [4]:
# optimizamos para calcular el quantil medio con mejor precision.

import optuna
from sklearn.ensemble import GradientBoostingRegressor
import sklearn.model_selection
from sklearn.metrics import mean_pinball_loss, make_scorer

def objective(trial):
    x, y = X_filtered, y_filtered.flatten()

    # Hiperparámetros
    n_estimators = trial.suggest_int("n_estimators", 10, 500, log=True)
    max_depth = trial.suggest_int("max_depth", 5, 32, log=True)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1, log=True)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)

    # Modelo cuantílico correcto
    estimator = GradientBoostingRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample,
        loss="quantile",
        alpha=0.5
    )

    # Métrica y evaluación
    score = make_scorer(mean_pinball_loss, alpha=0.5)
    scoring = sklearn.model_selection.cross_val_score(
        estimator, x, y, n_jobs=-1, cv=3, scoring=score
    )

    return scoring.mean()

In [5]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)
print(study.best_trial)

[I 2025-05-19 19:02:35,090] A new study created in memory with name: no-name-6a826833-3cc3-4311-9cee-e27517358604
[I 2025-05-19 19:03:35,433] Trial 0 finished with value: 0.3217257827074849 and parameters: {'n_estimators': 44, 'max_depth': 8, 'learning_rate': 0.02050862592909947, 'min_samples_leaf': 2, 'subsample': 0.5499869642921731}. Best is trial 0 with value: 0.3217257827074849.
[I 2025-05-19 19:17:05,539] Trial 1 finished with value: 0.27175920010922844 and parameters: {'n_estimators': 438, 'max_depth': 8, 'learning_rate': 0.006427590805873383, 'min_samples_leaf': 16, 'subsample': 0.9946658742292165}. Best is trial 1 with value: 0.27175920010922844.
[I 2025-05-19 19:18:41,404] Trial 2 finished with value: 0.4191017826304632 and parameters: {'n_estimators': 53, 'max_depth': 12, 'learning_rate': 0.001285324299468155, 'min_samples_leaf': 3, 'subsample': 0.5342712682282242}. Best is trial 1 with value: 0.27175920010922844.
[I 2025-05-19 19:27:32,969] Trial 3 finished with value: 0.250

FrozenTrial(number=3, state=1, values=[0.2508656570412351], datetime_start=datetime.datetime(2025, 5, 19, 19, 18, 41, 404051), datetime_complete=datetime.datetime(2025, 5, 19, 19, 27, 32, 968945), params={'n_estimators': 206, 'max_depth': 19, 'learning_rate': 0.07188197889165099, 'min_samples_leaf': 10, 'subsample': 0.52401130893367}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=500, log=True, low=10, step=1), 'max_depth': IntDistribution(high=32, log=True, low=5, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=0.001, step=None), 'min_samples_leaf': IntDistribution(high=20, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None)}, trial_id=3, value=None)


In [6]:
study.best_trial.params

{'n_estimators': 206,
 'max_depth': 19,
 'learning_rate': 0.07188197889165099,
 'min_samples_leaf': 10,
 'subsample': 0.52401130893367}

In [11]:
# -------- Modelo 80% --------
alpha_80 = 0.2  # 1 - 0.8 = 0.2
estimator_params_80 = study.best_trial.params.copy()
estimator_params_80["loss"] = "quantile"
estimator_params_80["alpha"] = alpha_80
estimator_params_80["n_estimators"] = estimator_params_80["n_estimators"]
del estimator_params_80["n_estimators"]

In [13]:
# -------- Modelo 90% --------
alpha_90 = 0.1  # 1 - 0.9 = 0.1
estimator_params_90 = study.best_trial.params.copy()
estimator_params_90["loss"] = "quantile"
estimator_params_90["alpha"] = alpha_90
estimator_params_90["n_estimators"] = estimator_params_90["n_estimators"]
del estimator_params_90["n_estimators"]

In [15]:
# -------- Modelo 99% --------
alpha_99 = 0.01  # 1 - 0.99 = 0.01
estimator_params_99 = study.best_trial.params.copy()
estimator_params_99["loss"] = "quantile"
estimator_params_99["alpha"] = alpha_99
estimator_params_99["n_estimators"] = estimator_params_99["n_estimators"]
del estimator_params_99["n_estimators"]

In [12]:
# Creamos los modelos
estimator_80 = GradientBoostingRegressor(**estimator_params_80)

In [14]:
estimator_90 = GradientBoostingRegressor(**estimator_params_90)

In [16]:
estimator_99 = GradientBoostingRegressor(**estimator_params_99)

In [17]:
mapie_80 = MapieQuantileRegressor(estimator_80, method="quantile", cv="split", alpha=alpha_80)

In [18]:
mapie_80.fit(X_filtered, y_filtered, calib_size=0.3, random_state=0)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [19]:
mapie_90 = MapieQuantileRegressor(estimator_90, method="quantile", cv="split", alpha=alpha_90)

In [21]:
mapie_90.fit(X_filtered, y_filtered, calib_size=0.2, random_state=0)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [22]:
mapie_99 = MapieQuantileRegressor(estimator_99, method="quantile", cv="split", alpha=alpha_99)

In [23]:
mapie_99.fit(X_filtered, y_filtered, calib_size=0.2, random_state=0)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [25]:
# 6. Predicción de los datos de test
# Predecimos con el modelo del 80%
y_pred_80, y_pis_80 = mapie_80.predict(X_test_filtered)

# Volvemos a la escala original
y_median_80 = preprocessor.inverse_transform(y_pred_80.reshape(-1, 1))
y_low_80 = preprocessor.inverse_transform(y_pis_80[:, 0])
y_high_80 = preprocessor.inverse_transform(y_pis_80[:, 1])

INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.


In [26]:
# Predecimos con el modelo del 90%
y_pred_90, y_pis_90 = mapie_90.predict(X_test_filtered)

# Volvemos a la escala original
y_median_90 = preprocessor.inverse_transform(y_pred_90.reshape(-1, 1))
y_low_90 = preprocessor.inverse_transform(y_pis_90[:, 0])
y_high_90 = preprocessor.inverse_transform(y_pis_90[:, 1])

INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.


In [27]:
# Predecimos con el modelo del 99%
y_pred_99, y_pis_99 = mapie_99.predict(X_test_filtered)

# Volvemos a la escala original
y_median_99 = preprocessor.inverse_transform(y_pred_99.reshape(-1, 1))
y_low_99 = preprocessor.inverse_transform(y_pis_99[:, 0])
y_high_99 = preprocessor.inverse_transform(y_pis_99[:, 1])

INFO:root:The predictions are ill-sorted.


In [28]:
# 7. Evaluación de cobertura y longitud media
coverage_80 = regression_coverage_score(y_test, y_low_80, y_high_80)
mean_width_80 = regression_mean_width_score(y_low_80, y_high_80)

print("Evaluación para el modelo del 80%:")
print(f"Regresión coverage: {coverage_80:.2%}")
print(f"Interval mean width: {mean_width_80:.2f}")


Evaluación para el modelo del 80%:
Regresión coverage: 79.37%
Interval mean width: 26666149.70


In [29]:
coverage_90 = regression_coverage_score(y_test, y_low_90, y_high_90)
mean_width_90 = regression_mean_width_score(y_low_90, y_high_90)

print("Evaluación para el modelo del 90%:")
print(f"Regresión coverage: {coverage_90:.2%}")
print(f"Interval mean width: {mean_width_90:.2f}")


Evaluación para el modelo del 90%:
Regresión coverage: 89.61%
Interval mean width: 50252879.10


In [31]:
coverage_99 = regression_coverage_score(y_test, y_low_99, y_high_99)
mean_width_99 = regression_mean_width_score(y_low_99, y_high_99)

print("Evaluación para el modelo del 99%:")
print(f"Regresión coverage: {coverage_99:.2%}")
print(f"Interval mean width: {mean_width_99:.2f}")


Evaluación para el modelo del 99%:
Regresión coverage: 99.00%
Interval mean width: 431913794.02


In [33]:
# 8. Guardar los modelos para su exportación a un entorno de serving/inferencia
import os
import pickle

CHECKPOINTS_DIR = "checkpoints"
os.makedirs(CHECKPOINTS_DIR, exist_ok=True)

# Guardamos el preprocesador y el filtro (compartidos)
with open(os.path.join(CHECKPOINTS_DIR, "preprocessor.pkl"), "wb") as f:
    pickle.dump(preprocessor, f)

with open(os.path.join(CHECKPOINTS_DIR, "filter.pkl"), "wb") as f:
    pickle.dump(filter, f)

# Guardamos cada modelo MAPIE por separado
with open(os.path.join(CHECKPOINTS_DIR, "model_mapie_80.pkl"), "wb") as f:
    pickle.dump(mapie_80, f)

with open(os.path.join(CHECKPOINTS_DIR, "model_mapie_90.pkl"), "wb") as f:
    pickle.dump(mapie_90, f)

with open(os.path.join(CHECKPOINTS_DIR, "model_mapie_99.pkl"), "wb") as f:
    pickle.dump(mapie_99, f)
