In [2]:


import os
import numpy as np
import pandas as pd
import joblib
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error


  from .autonotebook import tqdm as notebook_tqdm


In [37]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler

# ————————————————————————————
# 0. Configuration
DATA_PATH             = 'train.csv'             # training data (CSV)
MODEL_DIR             = 'models'                # directory that receives every saved artifact
ENCODERS_FILE         = 'label_encoders.pkl'    # fitted LabelEncoders, one per categorical column
SCALER_FILE           = 'scaler.pkl'            # fitted StandardScaler for the numeric columns
# File name for the saved booster. The model-saving cell references MODEL_FILE,
# but no visible cell defined it, so a fresh "Restart & Run All" stopped with a
# NameError. The value matches the path shown in that cell's stored output.
MODEL_FILE            = 'xgb_large_optuna_features.json'
TEST_SIZE             = 0.1                     # hold-out fraction for the final train/validation split
RANDOM_STATE          = 42                      # seed shared by the sampler, the CV and the split
EVAL_METRIC           = 'rmse'                  # XGBoost evaluation metric
CV_FOLDS              = 5                       # folds used by xgb.cv inside the Optuna objective
MAX_ROUNDS            = 400                     # boosting-round cap during cross-validation
EARLY_STOPPING_ROUNDS = 100                     # patience (in rounds) for early stopping

# Make sure the artifact directory exists before anything is written to it.
os.makedirs(MODEL_DIR, exist_ok=True)

# ————————————————————————————
# 1. Load the data
df = pd.read_csv(DATA_PATH)

# 2. Split into target and features (both are copies, so `df` stays untouched)
target_name = 'prezo_euros'
y = df[target_name].copy()
X = df.drop(columns=[target_name]).copy()

# ————————————————————————————
# 3. Drop rows whose target is an outlier (1.5 * IQR fences computed on y)
Q1, Q3 = y.quantile(0.25), y.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is an outlier only when its target lies strictly outside the fences;
# `mask` keeps everything else and is indexed like the original `df`.
is_outlier = y.lt(lower_fence) | y.gt(upper_fence)
mask = ~is_outlier
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)

# ————————————————————————————
# 4. Identify numeric and categorical columns by dtype
numeric_dtypes = ['int64', 'float64']
num_cols = list(X.select_dtypes(include=numeric_dtypes).columns)
cat_cols = list(X.select_dtypes(include=['object']).columns)

# ————————————————————————————
# 5. Imputation and encoding
# 5.1 Fill missing numeric values with the column median
for name in num_cols:
    X[name] = X[name].fillna(X[name].median())

# 5.2 Integer-encode categoricals with one LabelEncoder per column.
#     Missing values become the explicit category 'Missing' before encoding.
label_encoders = {}
for name in cat_cols:
    encoder = LabelEncoder()
    as_text = X[name].fillna('Missing').astype(str)
    X[name] = encoder.fit_transform(as_text)
    label_encoders[name] = encoder

# Persist the fitted encoders so they can be reused at inference time
enc_path = os.path.join(MODEL_DIR, ENCODERS_FILE)
joblib.dump(label_encoders, enc_path)
print(f"Codificadores guardados en {enc_path}")

# ————————————————————————————
# 6. Standardize the numeric features
# Fit on the numeric columns first, then overwrite them with the scaled values.
scaler = StandardScaler()
scaler.fit(X[num_cols])
X[num_cols] = scaler.transform(X[num_cols])

# Persist the fitted scaler
scaler_path = os.path.join(MODEL_DIR, SCALER_FILE)
joblib.dump(scaler, scaler_path)
print(f"Scaler guardado en {scaler_path}")

# ————————————————————————————
# X and y are now ready for the train/test split and modelling.


Codificadores guardados en models/label_encoders.pkl
Scaler guardado en models/scaler.pkl


In [33]:

# 4. Build a single DMatrix holding every preprocessed row (features X, labels y)
dtrain_full = xgb.DMatrix(data=X, label=y)

# 5. Función objetivo para Optuna usando CV
def objective(trial):
    """Optuna objective: cross-validated XGBoost error for one trial.

    Samples one hyperparameter set from ``trial``, runs ``xgb.cv`` on
    ``dtrain_full`` (``CV_FOLDS`` folds, at most ``MAX_ROUNDS`` boosting rounds,
    early stopping after ``EARLY_STOPPING_ROUNDS`` rounds, seeded with
    ``RANDOM_STATE``) and returns the best mean validation score.

    Args:
        trial: the ``optuna`` trial object used to sample hyperparameters.

    Returns:
        float: the minimum of the ``test-<EVAL_METRIC>-mean`` column of the
        cross-validation results (lower is better).
    """
    # Search space. Earlier variants (GPU `gpu_hist` with wider ranges) used to
    # sit here as dead triple-quoted strings, the first of which was being
    # picked up as the function's docstring; they have been removed.
    params = {
        'tree_method':        'hist',
        'objective':          'reg:squarederror',
        'eval_metric':        EVAL_METRIC,
        # optimal LR ~0.037 -> keep the original range
        'learning_rate':      trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        # optimal max_depth = 5 -> search between 3 and 8
        'max_depth':          trial.suggest_int('max_depth', 3, 8),
        # optimal subsample ~0.85 -> search in [0.6, 0.95]
        'subsample':          trial.suggest_float('subsample', 0.6, 0.95),
        # optimal colsample_bytree ~0.77 -> search in [0.6, 0.9]
        'colsample_bytree':   trial.suggest_float('colsample_bytree', 0.6, 0.9),
        # optimal lambda (L2) ~0.58 -> bounded to [1e-3, 5]
        'lambda':             trial.suggest_float('lambda', 1e-3, 5.0, log=True),
        # optimal alpha (L1) ~2.43 -> bounded to [1e-3, 5]
        'alpha':              trial.suggest_float('alpha', 1e-3, 5.0, log=True),
        # optimal gamma ~2.6 -> keep [0, 5]
        'gamma':              trial.suggest_float('gamma', 0.0, 5.0),
        # optimal min_child_weight = 33 -> range widened to [1, 50]
        'min_child_weight':   trial.suggest_int('min_child_weight', 1, 50),
    }

    # Cross-validation
    cv_results = xgb.cv(
        params,
        dtrain_full,
        num_boost_round=MAX_ROUNDS,
        nfold=CV_FOLDS,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose_eval=False,
        seed=RANDOM_STATE
    )

    # Derive the result column from EVAL_METRIC instead of hard-coding 'rmse',
    # so changing the configured metric does not raise a KeyError here.
    metric_column = f'test-{EVAL_METRIC}-mean'
    # Best (lowest) mean validation score over all boosting rounds.
    best_score = cv_results[metric_column].min()
    return float(best_score)


In [None]:

# 6. Run the hyperparameter search
# Seeded TPE sampler; the study minimizes the value returned by `objective`.
tpe_sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(sampler=tpe_sampler, direction='minimize')
study.optimize(objective, n_trials=150)
print("Mejores parámetros:", study.best_params)
print(f"Mejor RMSE CV: {study.best_value:.4f}")

# 7. Final training with the best parameters, early-stopped on a hold-out split
# Hold out TEST_SIZE of the rows to drive early stopping.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Tuned hyperparameters plus the fixed settings that Optuna does not sample.
final_params = {
    **study.best_params,
    'tree_method': 'hist',
    'objective': 'reg:squarederror',
    'eval_metric': EVAL_METRIC,
}
# The validation set is listed last among the evaluation sets.
watchlist = [(dtrain, 'train'), (dval, 'validation')]
final_model = xgb.train(
    params=final_params,
    dtrain=dtrain,
    num_boost_round=4000,
    evals=watchlist,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    verbose_eval=100,
)

[I 2025-05-06 11:28:26,450] A new study created in memory with name: no-name-598c6b45-1ea0-409d-8698-3d9789036507
[I 2025-05-06 11:28:33,397] Trial 0 finished with value: 127098.7466627638 and parameters: {'learning_rate': 0.0013292918943162175, 'max_depth': 8, 'subsample': 0.8561978796339917, 'colsample_bytree': 0.779597545259111, 'lambda': 0.003776663327107336, 'alpha': 0.003775887545682684, 'gamma': 0.2904180608409973, 'min_child_weight': 44}. Best is trial 0 with value: 127098.7466627638.
[I 2025-05-06 11:28:42,371] Trial 1 finished with value: 65882.57982121776 and parameters: {'learning_rate': 0.006358358856676255, 'max_depth': 7, 'subsample': 0.6072045730035308, 'colsample_bytree': 0.8909729556485984, 'lambda': 1.1999975480350795, 'alpha': 0.00610149136730271, 'gamma': 0.9091248360355031, 'min_child_weight': 10}. Best is trial 1 with value: 65882.57982121776.
[I 2025-05-06 11:28:45,559] Trial 2 finished with value: 138411.18306387163 and parameters: {'learning_rate': 0.000817949

Mejores parámetros: {'learning_rate': 0.09767912623790692, 'max_depth': 5, 'subsample': 0.943828431394383, 'colsample_bytree': 0.7405201554209732, 'lambda': 0.0020100988356750284, 'alpha': 0.056877810794258424, 'gamma': 0.866309636474001, 'min_child_weight': 3}
Mejor RMSE CV: 40529.2934
[0]	train-rmse:145109.54518	validation-rmse:145684.83792
[100]	train-rmse:35972.90009	validation-rmse:42260.89100
[199]	train-rmse:32579.50853	validation-rmse:41555.01745


In [None]:
# Retrain with the same parameters and data but WITHOUT early stopping:
# this runs the full 4000-round budget and overwrites `final_model`.
watchlist = [(dtrain, 'train'), (dval, 'validation')]
final_model = xgb.train(
    params=final_params,
    dtrain=dtrain,
    num_boost_round=4000,
    evals=watchlist,
    verbose_eval=100,
)

[0]	train-rmse:145109.54518	validation-rmse:145684.83792
[100]	train-rmse:35972.90009	validation-rmse:42260.89100
[200]	train-rmse:32564.34226	validation-rmse:41541.47281
[300]	train-rmse:30001.07358	validation-rmse:41419.86761
[400]	train-rmse:27751.69625	validation-rmse:41397.19108
[444]	train-rmse:26930.40264	validation-rmse:41407.48496


In [35]:

# 8. Save the model
model_path = os.path.join(MODEL_DIR, MODEL_FILE)
final_model.save_model(model_path)
print(f"Modelo final guardado en {model_path}")

# 9. Evaluate on the validation split
# (numpy is already imported as `np` by the import cells at the top of the
# notebook, so the stray mid-cell re-import was dropped.)
y_pred = final_model.predict(dval)
val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE validación final: {val_rmse:.2f}")


Modelo final guardado en models/xgb_large_optuna_features.json
RMSE validación final: 41555.02


In [None]:
# Feature columns, in the same fixed order as the training frame X
# (every column of the raw training data except the target).
FEATURES = [c for c in df.columns if c != 'prezo_euros']
SUBMIT_FILE = 'submission_xg_normalizado.csv'

if os.path.exists('test.csv'):
    # Read test.csv using its first column as the index to avoid an 'Unnamed' column.
    # NOTE(review): train.csv is read WITHOUT index_col, so if it also carries
    # an unnamed first column, that column is a training feature that will be
    # all-NaN here after the reindex below — confirm both files are read alike.
    df_test = pd.read_csv('test.csv', index_col=0)

    # Keep exactly the training columns, in training order
    # (columns absent from the test file come back as NaN).
    X_test = df_test.reindex(columns=FEATURES).copy()

    # --- Same preprocessing as in training -------------------------------
    # Numeric imputation must use the medians of the RAW training data
    # (outlier-filtered rows of `df`). `X` can no longer be used for this:
    # its numeric columns were already imputed and standardized, so its
    # medians live in scaled space. A single frame-level fillna also avoids
    # chained `inplace=True` calls, which may not write back under pandas
    # copy-on-write.
    train_medians = df.loc[mask, num_cols].median()
    X_test[num_cols] = X_test[num_cols].fillna(train_medians)

    # Categorical encoding with the encoders fitted on the training data.
    # NOTE(review): LabelEncoder.transform is expected to raise on categories
    # it did not see during fitting — confirm the test set has none.
    for col, le in label_encoders.items():
        X_test[col] = le.transform(X_test[col].fillna('Missing').astype(str))

    # The model was trained on standardized numeric features, so the fitted
    # scaler has to be applied to the test set as well (previously skipped).
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # --- Predict and write the submission --------------------------------
    dtest = xgb.DMatrix(X_test)
    preds_test = final_model.predict(dtest)
    submission = pd.DataFrame({'id': df_test['id'], 'prezo_euros': preds_test})

    # Report the path the file is really written to (inside MODEL_DIR),
    # not just the bare file name.
    submit_path = os.path.join(MODEL_DIR, SUBMIT_FILE)
    submission.to_csv(submit_path, index=False)
    print(f"Submission guardada en {submit_path}")

Submission guardada en submission_velev.csv
