In [1]:
import os
import time
import numpy as np
import pandas as pd
import joblib
import optuna
import xgboost as xgb
import lightgbm as lgb
from tqdm import tqdm
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

# Configuration
DEBUG = False    # set to True for quick smoke-test runs

# Input / output locations
DATA_PATH = 'train.csv'
MODEL_DIR = 'models'
XGB_MODEL_FILE = 'xgb_large_optuna.json'
STACK_MODEL_FILE = 'stacked_model.pkl'
ENCODERS_FILE = 'label_encoders.pkl'
SUBMIT_FILE = 'submission_stack.csv'

# Evaluation setup
TEST_SIZE = 0.1
RANDOM_STATE = 42
EVAL_METRIC = 'rmse'

# Search budget: (folds, boosting rounds, early-stopping patience, Optuna trials).
# The full-size budget is the default; DEBUG swaps in a much smaller one.
CV_FOLDS, MAX_ROUNDS, EARLY_STOPPING_ROUNDS, N_TRIALS = 5, 400, 100, 100
if DEBUG:
    CV_FOLDS, MAX_ROUNDS, EARLY_STOPPING_ROUNDS, N_TRIALS = 2, 50, 10, 3
    print(f"DEBUG MODE: folds={CV_FOLDS}, rounds={MAX_ROUNDS}, stop_rounds={EARLY_STOPPING_ROUNDS}, trials={N_TRIALS}")

# Every artefact written by this notebook goes under MODEL_DIR.
os.makedirs(MODEL_DIR, exist_ok=True)

# 1. Load data
df = pd.read_csv(DATA_PATH)
# Every column except the target 'prezo_euros' is used as a feature.
FEATURES = [name for name in df.columns if name != 'prezo_euros']
y = df['prezo_euros']
X = df.loc[:, FEATURES].copy()

# Split the feature names by dtype: float64/int64 columns are imputed as
# numeric, object columns are label-encoded; other dtypes are left alone.
dtypes = X.dtypes
is_numeric = dtypes.isin([np.float64, np.int64])
is_object = dtypes == object
num_cols = list(dtypes.index[is_numeric])
cat_cols = list(dtypes.index[is_object])

# 2. Imputation and encoding
# Numeric columns: fill gaps with the column median.
# The result is assigned back explicitly: `X[col].fillna(..., inplace=True)`
# is chained assignment on a temporary Series, which emits a FutureWarning in
# pandas 2.2 and silently leaves X unchanged under Copy-on-Write.
for col in num_cols:
    X[col] = X[col].fillna(X[col].median())

# Categorical columns: missing values become the literal label 'Missing',
# everything is cast to str, then mapped to integer codes. One fitted encoder
# per column is kept so the same mapping can be applied at prediction time.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = X[col].fillna('Missing').astype(str)
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Persist the encoders under MODEL_DIR (the message prints only the file name).
joblib.dump(label_encoders, os.path.join(MODEL_DIR, ENCODERS_FILE))
print(f"Codificadores guardados en {ENCODERS_FILE}")

  from .autonotebook import tqdm as notebook_tqdm


Codificadores guardados en label_encoders.pkl


In [2]:

# 3. XGBoost hyper-parameter optimisation with Optuna
print("Iniciando optimización de XGBoost...")
dtrain_full = xgb.DMatrix(X, label=y)
start_time = time.time()
def optuna_callback(study, trial):
    """Print a progress line after each trial.

    Reports the wall-clock time elapsed since `start_time`, the average time
    per completed trial, and an estimate of the time remaining until
    `N_TRIALS` trials have completed.
    """
    done = sum(1 for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE)
    elapsed = time.time() - start_time
    # Guard against division by zero when no trial has completed yet.
    if done:
        mean_secs = elapsed / done
    else:
        mean_secs = 0
    eta = mean_secs * (N_TRIALS - done)
    print(f"Trial {trial.number} en {elapsed:.1f}s (avg {mean_secs:.1f}s), rem {eta:.1f}s")


    

def objective(trial):
    """Optuna objective: cross-validated error of one sampled XGBoost config.

    Samples a hyper-parameter set from `trial`, runs `xgb.cv` on
    `dtrain_full` with `CV_FOLDS` folds, at most `MAX_ROUNDS` boosting rounds
    and early stopping after `EARLY_STOPPING_ROUNDS` rounds, and returns the
    lowest mean test value of `EVAL_METRIC` seen across rounds.

    Args:
        trial: the `optuna.trial.Trial` used to sample hyper-parameters.

    Returns:
        The minimum of the `test-<EVAL_METRIC>-mean` column of the CV table.
    """
    # NOTE(review): 'gpu_hist' / 'predictor' are the pre-2.0 XGBoost GPU
    # switches; newer releases prefer tree_method='hist' with device='cuda'.
    # Left unchanged here so the installed version keeps working; confirm.
    params = {
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'objective': 'reg:squarederror',
        'eval_metric': EVAL_METRIC,
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; plain suggest_float replaces suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 100.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 100.0, log=True),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 40)
    }
    cv = xgb.cv(params, dtrain_full, num_boost_round=MAX_ROUNDS,
                nfold=CV_FOLDS, early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                seed=RANDOM_STATE, verbose_eval=False)
    # Derive the column name from EVAL_METRIC so changing the metric in the
    # config cell does not leave a stale hard-coded 'rmse' lookup here.
    return cv[f'test-{EVAL_METRIC}-mean'].min()

# Seeded TPE sampler so the sequence of sampled configurations is repeatable.
tpe_sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(direction='minimize', sampler=tpe_sampler)
study.optimize(objective, n_trials=N_TRIALS, callbacks=[optuna_callback])

# Summary of the search: total wall-clock time, winning parameters and score.
total_elapsed = time.time() - start_time
print(f"XGB optimize completo en {total_elapsed:.1f}s")
print("Mejores parámetros:", study.best_params)
print(f"Mejor RMSE CV: {study.best_value:.2f}")

[I 2025-04-29 17:50:53,682] A new study created in memory with name: no-name-99316236-58d7-42c3-b97e-e35f56583522


Iniciando optimización de XGBoost...


[I 2025-04-29 17:52:38,160] Trial 0 finished with value: 169731.83239288843 and parameters: {'learning_rate': 0.0013292918943162175, 'max_depth': 12, 'subsample': 0.8659969709057025, 'colsample_bytree': 0.7993292420985183, 'lambda': 0.004207988669606638, 'alpha': 0.004207053950287938, 'gamma': 0.2904180608409973, 'min_child_weight': 9}. Best is trial 0 with value: 169731.83239288843.


Trial 0 en 104.5s (avg 104.5s), rem 10343.5s


[I 2025-04-29 17:53:19,523] Trial 1 finished with value: 51029.332600376096 and parameters: {'learning_rate': 0.006358358856676255, 'max_depth': 10, 'subsample': 0.5102922471479012, 'colsample_bytree': 0.9849549260809971, 'lambda': 2.1368329072358767, 'alpha': 0.0070689749506246055, 'gamma': 0.9091248360355031, 'min_child_weight': 2}. Best is trial 1 with value: 51029.332600376096.


Trial 1 en 145.8s (avg 72.9s), rem 7146.3s


[I 2025-04-29 17:53:39,099] Trial 2 finished with value: 206887.3447078906 and parameters: {'learning_rate': 0.0008179499475211679, 'max_depth': 8, 'subsample': 0.7159725093210578, 'colsample_bytree': 0.645614570099021, 'lambda': 0.2801635158716261, 'alpha': 0.003613894271216527, 'gamma': 1.4607232426760908, 'min_child_weight': 4}. Best is trial 1 with value: 51029.332600376096.


Trial 2 en 165.4s (avg 55.1s), rem 5348.5s


[I 2025-04-29 17:54:55,010] Trial 3 finished with value: 122451.14245605687 and parameters: {'learning_rate': 0.0023345864076016252, 'max_depth': 11, 'subsample': 0.5998368910791798, 'colsample_bytree': 0.7571172192068059, 'lambda': 0.23423849847112907, 'alpha': 0.0015339162591163618, 'gamma': 3.0377242595071916, 'min_child_weight': 2}. Best is trial 1 with value: 51029.332600376096.


Trial 3 en 241.3s (avg 60.3s), rem 5791.9s


In [None]:

# 4. Out-of-fold (OOF) predictions for stacking
print("Generando OOF predictions para stacking...")
# LightGBM GPU support is not auto-detected; set this flag by hand when a
# GPU-enabled LightGBM build is installed.
gpu_lgb = False

# One OOF prediction per training row and per base learner; each row is
# predicted by the model that did NOT see it during training.
xgb_oof = np.zeros(len(X))
lgb_oof = np.zeros(len(X))
kfold = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
for fold, (tr_idx, va_idx) in enumerate(tqdm(kfold.split(X), total=CV_FOLDS)):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    # XGBoost: best Optuna parameters plus the fixed GPU/objective settings.
    dtr = xgb.DMatrix(X_tr, label=y_tr)
    dva = xgb.DMatrix(X_va, label=y_va)
    xgb_params = study.best_params.copy()
    xgb_params.update({'tree_method':'gpu_hist','predictor':'gpu_predictor','objective':'reg:squarederror','eval_metric':EVAL_METRIC})
    # Early stopping monitors the last entry of `evals`, i.e. the 'val' fold.
    bst = xgb.train(xgb_params, dtr, num_boost_round=MAX_ROUNDS,
                    evals=[(dtr,'train'),(dva,'val')], early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                    verbose_eval=False)
    # best_iteration is a 0-based index, so the best model has one more tree
    # than its value.
    n_best_rounds = int(bst.best_iteration) + 1
    # The native Booster.predict uses every trained tree by default, including
    # those added after the best round; restrict it to the best model.
    xgb_oof[va_idx] = bst.predict(dva, iteration_range=(0, n_best_rounds))

    # LightGBM: default parameters, sized to the round count XGBoost settled on.
    lgb_params = {'objective':'regression','metric':'rmse','n_estimators':n_best_rounds}
    if gpu_lgb:
        lgb_params['device'] = 'gpu'
    lgbm = lgb.LGBMRegressor(**lgb_params)
    lgbm.fit(X_tr, y_tr, eval_set=[(X_va,y_va)])
    lgb_oof[va_idx] = lgbm.predict(X_va)

Generando OOF predictions para stacking...


  0%|          | 0/2 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046009 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2224
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 19
[LightGBM] [Info] Start training from score 224830.855200


 50%|█████     | 1/2 [01:18<01:18, 78.78s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2227
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 19
[LightGBM] [Info] Start training from score 221974.773800


100%|██████████| 2/2 [02:41<00:00, 80.90s/it]


In [None]:

# 5. Stacking meta-model
# Design matrix with one column per base learner (XGBoost, LightGBM), built
# from their out-of-fold predictions.
stack_input = np.array([xgb_oof, lgb_oof]).T
meta = Ridge().fit(stack_input, y)

# Save the tuned XGBoost parameters together with the fitted meta-model.
stack_path = os.path.join(MODEL_DIR, STACK_MODEL_FILE)
joblib.dump((study.best_params, meta), stack_path)
print(f"Stacking model guardado en {STACK_MODEL_FILE}")


Stacking model guardado en stacked_model.pkl


In [None]:
# 6. Final XGBoost training on the full dataset
dtr_full = xgb.DMatrix(X, label=y)
final_params = study.best_params.copy()
final_params.update({'tree_method':'gpu_hist','predictor':'gpu_predictor','objective':'reg:squarederror','eval_metric':EVAL_METRIC})
# `bst` is the booster left over from the last fold of the OOF loop.
# Its best_iteration is a 0-based index, so the matching number of boosting
# rounds is one more; using the raw index would train one tree too few, and
# zero trees when the best round was the first.
final_rounds = int(bst.best_iteration) + 1
final_bst = xgb.train(final_params, dtr_full, num_boost_round=final_rounds)
final_bst.save_model(os.path.join(MODEL_DIR, XGB_MODEL_FILE))
print(f"Modelo XGB final guardado en {XGB_MODEL_FILE}")

Modelo XGB final guardado en xgb_large_optuna.json


In [None]:
# 7. Submission (only when test.csv exists)
def encode_with_fallback(values, classes, unseen_code=-1):
    """Encode string labels with a fitted label set, tolerating unseen labels.

    Args:
        values: pandas Series of string labels to encode.
        classes: ordered sequence of known labels (e.g. `LabelEncoder.classes_`);
            a label's code is its position in this sequence.
        unseen_code: integer assigned to labels not present in `classes`.

    Returns:
        int64 Series, aligned with `values`, holding the codes.
    """
    lookup = {label: code for code, label in enumerate(classes)}
    return values.map(lookup).fillna(unseen_code).astype('int64')


if os.path.exists('test.csv'):
    df_test = pd.read_csv('test.csv', index_col=0)
    # Align test columns to the training feature order; absent columns are
    # created as all-NaN and then imputed below.
    X_test = df_test.reindex(columns=FEATURES).copy()
    # Impute with the training medians. Assign back explicitly: chained
    # `fillna(..., inplace=True)` does not update X_test under Copy-on-Write.
    for c in num_cols:
        X_test[c] = X_test[c].fillna(X[c].median())
    for c, le in label_encoders.items():
        raw_labels = X_test[c].fillna('Missing').astype(str)
        # le.transform raises ValueError on categories never seen in training;
        # encode those as -1 instead and report how many there were.
        X_test[c] = encode_with_fallback(raw_labels, le.classes_)
        n_unseen = int((X_test[c] == -1).sum())
        if n_unseen:
            print(f"Aviso: {n_unseen} valores no vistos en '{c}' codificados como -1")
    dtest = xgb.DMatrix(X_test)
    xgb_pred = final_bst.predict(dtest)
    # LightGBM final fit. `lgb_params` is whatever the last fold of the OOF
    # loop left behind (including its n_estimators).
    lgb_final = lgb.LGBMRegressor(**lgb_params)
    lgb_final.fit(X, y)
    lgb_pred = lgb_final.predict(X_test)
    # Same column order as the meta-model was trained on: XGBoost, LightGBM.
    stacked = np.vstack([xgb_pred, lgb_pred]).T
    final_pred = meta.predict(stacked)
    # test.csv is read with index_col=0, so 'id' may be a column or the index
    # depending on the file layout; accept either.
    ids = df_test['id'] if 'id' in df_test.columns else df_test.index
    submission = pd.DataFrame({'id': np.asarray(ids), 'prezo_euros': final_pred})
    submission.to_csv(os.path.join(MODEL_DIR, SUBMIT_FILE), index=False)
    print(f"Submission guardada en {SUBMIT_FILE}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041626 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2230
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 19
[LightGBM] [Info] Start training from score 223402.814500
Submission guardada en submission.csv
