In [2]:
pip install skrub


Collecting skrub
  Downloading skrub-0.5.3-py3-none-any.whl.metadata (5.7 kB)
Downloading skrub-0.5.3-py3-none-any.whl (339 kB)
Installing collected packages: skrub
Successfully installed skrub-0.5.3
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install feature-engine


Collecting feature-engine
  Using cached feature_engine-1.8.3-py2.py3-none-any.whl.metadata (9.9 kB)
Using cached feature_engine-1.8.3-py2.py3-none-any.whl (378 kB)
Installing collected packages: feature-engine
Successfully installed feature-engine-1.8.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
from utils.transformations import ExtendedTransformation, SimpleTransformation
from utils.filters import SimpleFilter
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv("data/preprocessed/train_data.csv")
X_train, y_train = df_train.drop(columns=['Price']), df_train[['Price']]
preprocessor = ExtendedTransformation()
filter = SimpleFilter()
preprocessor.fit(X_train, y_train)
X_processed, y_processed = preprocessor.transform(X_train, y_train)
filter.fit(X_processed, y_processed)
X_filtered, y_filtered = filter.transform(X_processed, y_processed)

X shape:  (20974, 40)
bin_vars_columns shape:  (36,)
low_card_columns shape:  37
X shape:  (20974, 40)
X_low_card   shape:  (20974, 113)
X_high_card shape:  (20974, 50)
X_crossed_features shape:  (20974, 6670)
X_EXPANDED shape:  (20974, 6835)
(20974, 6835)
(20974, 4173)
(20974, 3193)
(20974, 1635)
(20974, 4173)
(20974, 3193)
(20974, 1635)


In [3]:
df_test = pd.read_csv("data/preprocessed/test_data.csv")
X_test, y_test = df_test.drop(columns=['Price']), df_test[['Price']]
X_test_proccesed, y_test_proccessed = preprocessor.transform(X_test, y_test)
X_test_filtered, y_test_filtered = filter.transform(X_test_proccesed, y_test_proccessed)

X shape:  (8989, 40)
X_low_card   shape:  (8989, 113)
X_high_card shape:  (8989, 50)
X_crossed_features shape:  (8989, 6670)
X_EXPANDED shape:  (8989, 6835)
(8989, 4173)
(8989, 3193)
(8989, 1635)


In [12]:
pip install optuna


Note: you may need to restart the kernel to use updated packages.


In [None]:
import optuna
from sklearn.ensemble import GradientBoostingRegressor
import sklearn.model_selection
from sklearn.metrics import mean_pinball_loss, make_scorer

def objective(trial):
    x, y = X_filtered, y_filtered.flatten()

    # Hiperparámetros
    n_estimators = trial.suggest_int("n_estimators", 10, 500, log=True)
    max_depth = trial.suggest_int("max_depth", 5, 32, log=True)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1, log=True)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)

    estimator = GradientBoostingRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample,
        loss="quantile",
        alpha=0.5
    )

    # Métrica y evaluación
    score = make_scorer(mean_pinball_loss, alpha=0.5)
    scoring = sklearn.model_selection.cross_val_score(
        estimator, x, y, n_jobs=-1, cv=3, scoring=score
    )

    return scoring.mean()


In [9]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

print("Mejores hiperparámetros encontrados:")
print(study.best_trial.params)


[I 2025-05-18 19:16:46,611] A new study created in memory with name: no-name-81a2769c-da48-4845-9428-9fa60547ef79
[I 2025-05-18 19:20:08,397] Trial 0 finished with value: 0.273765176607571 and parameters: {'n_estimators': 142, 'max_depth': 7, 'learning_rate': 0.019889634195779663, 'min_samples_leaf': 6, 'subsample': 0.9650245722935478}. Best is trial 0 with value: 0.273765176607571.
[I 2025-05-18 19:20:45,748] Trial 1 finished with value: 0.2876741463752647 and parameters: {'n_estimators': 36, 'max_depth': 8, 'learning_rate': 0.04466793269354801, 'min_samples_leaf': 1, 'subsample': 0.6604823495147313}. Best is trial 0 with value: 0.273765176607571.
[I 2025-05-18 19:21:23,839] Trial 2 finished with value: 0.33147267567427946 and parameters: {'n_estimators': 16, 'max_depth': 30, 'learning_rate': 0.04176035951902224, 'min_samples_leaf': 4, 'subsample': 0.6199841675632588}. Best is trial 0 with value: 0.273765176607571.
[I 2025-05-18 19:22:54,247] Trial 3 finished with value: 0.41933803685

Mejores hiperparámetros encontrados:
{'n_estimators': 142, 'max_depth': 7, 'learning_rate': 0.019889634195779663, 'min_samples_leaf': 6, 'subsample': 0.9650245722935478}


In [10]:
final_model = GradientBoostingRegressor(n_estimators=142, max_depth=7, learning_rate=0.019889634195779663, min_samples_leaf=6, subsample=0.9650245722935478, loss="quantile", alpha=0.5)
final_model.fit(X_filtered, y_filtered.flatten())

In [11]:
y_hat_scaled = final_model.predict(X_test_filtered)

In [12]:
y_hat = preprocessor.inverse_transform(y_hat_scaled.reshape(-1,1))



In [None]:
from sklearn.metrics import mean_pinball_loss

# Predecimos en test
y_test_pred = final_model.predict(X_test_filtered)

# Calculamos la pérdida de Pinball
test_loss = mean_pinball_loss(y_test_filtered, y_test_pred, alpha=0.5)

# Formateamos y mostramos resultados con interpretación
print("\nModel Performance Metric:\n")
print(f"{'Metric':<20} {'Value':>10}")
print("-" * 30)
print(f"{'Pinball Loss':<20} {test_loss:>10.4f}")

print("\nInterpretation:")
print(f"- Pinball Loss (α = {0.5}): measures the error specific to quantile regression.")
print(f"- A lower value indicates better accuracy at estimating the {int(0.5 * 100)}th percentile.")




Model Performance Metric:

Metric                    Value
------------------------------
Pinball Loss             0.2781

Interpretation:
- Pinball Loss (α = 0.5): measures the error specific to quantile regression.
- A lower value indicates better accuracy at estimating the 50th percentile.
