In [1]:


import os
import numpy as np
import pandas as pd
import joblib
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Configuration: paths, split/seed settings and boosting limits used by later cells
DATA_PATH = 'train.csv'
MODEL_DIR = 'models'
MODEL_FILE = 'xgb_large_optuna.json'
ENCODERS_FILE = 'label_encoders.pkl'
TEST_SIZE = 0.2
RANDOM_STATE = 42
EVAL_METRIC = 'rmse'
CV_FOLDS = 5
MAX_ROUNDS = 1000
EARLY_STOPPING_ROUNDS = 100

os.makedirs(MODEL_DIR, exist_ok=True)

# 1. Load the data
df = pd.read_csv(DATA_PATH)

# 2. Split target and features
y = df['prezo_euros']
X = df.drop(columns=['prezo_euros'])

# 3. Imputation and encoding
# 3.1 Impute numeric columns (float64 / int64) with their median
dtypes = X.dtypes
num_cols = dtypes[dtypes.isin([np.float64, np.int64])].index.tolist()
for col in num_cols:
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form is not guaranteed to write
    # through to X (it is a no-op under pandas Copy-on-Write).
    X[col] = X[col].fillna(X[col].median())
# 3.2 Encode categorical (object) columns with one LabelEncoder per column
cat_cols = dtypes[dtypes == object].index.tolist()
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # Missing values become their own 'Missing' category before encoding
    X[col] = X[col].fillna('Missing').astype(str)
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le
# Persist the fitted encoders (written inside MODEL_DIR) for reuse at inference time
joblib.dump(label_encoders, os.path.join(MODEL_DIR, ENCODERS_FILE))
print(f"Codificadores guardados en {ENCODERS_FILE}")


Codificadores guardados en label_encoders.pkl


In [3]:

# 4. Build a single DMatrix holding every preprocessed training row and its target
dtrain_full = xgb.DMatrix(data=X, label=y)

# 5. Función objetivo para Optuna usando CV
def objective(trial):
    """Optuna objective: cross-validated score of one sampled XGBoost configuration.

    Samples the hyperparameters listed below from ``trial``, runs ``xgb.cv`` on
    ``dtrain_full`` with ``CV_FOLDS`` folds, at most ``MAX_ROUNDS`` boosting
    rounds and ``EARLY_STOPPING_ROUNDS`` as the early-stopping patience, and
    returns the lowest mean test score over the evaluated rounds.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The minimum of the ``test-<EVAL_METRIC>-mean`` column of the CV results.
    """
    # NOTE(review): 'gpu_hist' and 'predictor' are deprecated in XGBoost >= 2.0
    # (replaced by tree_method='hist' plus device='cuda') — confirm the
    # installed version before migrating.
    params = {
        'tree_method':        'gpu_hist',
        'predictor':          'gpu_predictor',
        'objective':          'reg:squarederror',
        'eval_metric':        EVAL_METRIC,
        # LR optimum ~0.037: keep the original range
        'learning_rate':      trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        # max_depth optimum = 5: search between 3 and 8
        'max_depth':          trial.suggest_int('max_depth', 3, 8),
        # subsample optimum ~0.85: search in [0.6, 0.95]
        'subsample':          trial.suggest_float('subsample', 0.6, 0.95),
        # colsample_bytree optimum ~0.77: search in [0.6, 0.9]
        'colsample_bytree':   trial.suggest_float('colsample_bytree', 0.6, 0.9),
        # lambda (L2) optimum ~0.58: bounded to [1e-3, 5]
        'lambda':             trial.suggest_float('lambda', 1e-3, 5.0, log=True),
        # alpha (L1) optimum ~2.43: bounded to [1e-3, 5]
        'alpha':              trial.suggest_float('alpha', 1e-3, 5.0, log=True),
        # gamma optimum ~2.6: keep [0, 5]
        'gamma':              trial.suggest_float('gamma', 0.0, 5.0),
        # min_child_weight optimum = 33: range widened to [1, 50]
        'min_child_weight':   trial.suggest_int('min_child_weight', 1, 50),
    }

    # Cross-validation
    cv_results = xgb.cv(
        params,
        dtrain_full,
        num_boost_round=MAX_ROUNDS,
        nfold=CV_FOLDS,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose_eval=False,
        seed=RANDOM_STATE
    )
    # Best (lowest) mean validation score; the column name follows EVAL_METRIC
    # so the lookup keeps working if the configured metric changes.
    best_rmse = cv_results[f'test-{EVAL_METRIC}-mean'].min()
    return best_rmse


In [4]:

# 6. Run the hyperparameter search (seeded TPE sampler, minimising the CV score)
tpe_sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(direction='minimize', sampler=tpe_sampler)
study.optimize(objective, n_trials=20)
print("Mejores parámetros:", study.best_params)
print(f"Mejor RMSE CV: {study.best_value:.4f}")

# 7. Final training with the best parameters, early-stopped on a hold-out split
# Hold out TEST_SIZE of the rows to drive early stopping
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Tuned values first, then the fixed settings that were not part of the search
final_params = {
    **study.best_params,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'objective': 'reg:squarederror',
    'eval_metric': EVAL_METRIC,
}
watchlist = [(dtrain, 'train'), (dval, 'validation')]
final_model = xgb.train(
    final_params,
    dtrain,
    num_boost_round=int(MAX_ROUNDS),
    evals=watchlist,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    verbose_eval=100,
)

[I 2025-05-06 10:57:00,517] A new study created in memory with name: no-name-7829d62e-71d9-4002-8fe7-69797af893b2
[I 2025-05-06 11:00:26,519] Trial 0 finished with value: 89482.05171703149 and parameters: {'learning_rate': 0.0013292918943162175, 'max_depth': 8, 'subsample': 0.8561978796339917, 'colsample_bytree': 0.779597545259111, 'lambda': 0.003776663327107336, 'alpha': 0.003775887545682684, 'gamma': 0.2904180608409973, 'min_child_weight': 44}. Best is trial 0 with value: 89482.05171703149.


In [None]:

# 8. Save the model (the file holds the full booster as trained)
model_path = os.path.join(MODEL_DIR, MODEL_FILE)
final_model.save_model(model_path)
print(f"Modelo final guardado en {model_path}")

# 9. Evaluate on the validation split
# The native Booster.predict uses every trained round by default, including the
# rounds added after the best one while early stopping was waiting; restrict
# prediction to the best iteration found during training.
best_rounds = final_model.best_iteration + 1
y_pred = final_model.predict(dval, iteration_range=(0, best_rounds))
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
val_rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
print(f"RMSE validación final: {val_rmse:.2f}")


Error: Session cannot generate requests

In [None]:
# Feature list in a fixed order: every training column except the target
FEATURES = [c for c in df.columns if c != 'prezo_euros']
SUBMIT_FILE = 'submission_velev.csv'

if os.path.exists('test.csv'):
    # Read test.csv using the first column as the index to avoid an 'Unnamed' column
    # NOTE(review): train.csv is read without index_col; if it carries the same
    # leading index column, that column is a training feature that will be
    # all-NaN here after reindex — verify both files are read consistently.
    df_test = pd.read_csv('test.csv', index_col=0)
    # Keep exactly the training columns, in training order
    X_test = df_test.reindex(columns=FEATURES).copy()
    # Same imputation as training: numeric NaNs take the training-column median.
    # Assigned back explicitly; chained fillna(inplace=True) on a column
    # selection is not guaranteed to modify X_test.
    for col in num_cols:
        X_test[col] = X_test[col].fillna(X[col].median())
    # Same encoding as training, using the fitted encoders
    for col, le in label_encoders.items():
        raw_values = X_test[col].fillna('Missing').astype(str)
        code_by_label = {label: code for code, label in enumerate(le.classes_)}
        # le.transform would raise on categories absent from training; map
        # them to NaN instead so XGBoost handles them as missing values.
        unseen_mask = ~raw_values.isin(le.classes_)
        if unseen_mask.any():
            print(f"Aviso: {int(unseen_mask.sum())} valores no vistos en '{col}' se tratan como faltantes")
        X_test[col] = raw_values.map(code_by_label)
    # Predict with the rounds up to the early-stopped best iteration only
    dtest = xgb.DMatrix(X_test)
    best_rounds = final_model.best_iteration + 1
    preds_test = final_model.predict(dtest, iteration_range=(0, best_rounds))
    # 'id' may have been consumed as the index by index_col=0; fall back to it
    id_source = df_test['id'] if 'id' in df_test.columns else df_test.index
    submission = pd.DataFrame({'id': id_source.to_numpy(), 'prezo_euros': preds_test})
    submit_path = os.path.join(MODEL_DIR, SUBMIT_FILE)
    submission.to_csv(submit_path, index=False)
    # Report the real output location (the file is written inside MODEL_DIR)
    print(f"Submission guardada en {submit_path}")

Error: Session cannot generate requests