In [4]:
'''
Python script to train multiple regression models and ensemble them, avoiding joblib parallel issues.

Usage: python ensemble_script.py
'''

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


def rmse_cv(model, X, y, n_splits=5):
    """
    Return the mean RMSE of ``model`` over a shuffled K-fold cross-validation.

    The folds are iterated in a plain Python loop instead of going through
    ``cross_val_score``, so no joblib worker processes are spawned.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refit in place
        on every fold, so after the call it holds the fit from the last fold.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Single-column target aligned row-by-row with ``X``.
    n_splits : int, default 5
        Number of folds. The split is shuffled with a fixed seed (42), so
        repeated calls on the same data use the same folds.

    Returns
    -------
    float
        Arithmetic mean of the per-fold RMSE values.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmses = []
    for train_idx, test_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        # Take sqrt(MSE) explicitly: the ``squared=False`` flag was deprecated
        # in scikit-learn 1.4 and removed in 1.6, whereas this form works on
        # every release and is identical for a single-column target.
        rmses.append(np.sqrt(mean_squared_error(y_val, preds)))
    return np.mean(rmses)


def main():
    """
    Evaluate four regressors and a stacking ensemble, then write predictions.

    Steps:
      1. Read ``train_preprocesado.csv``, ``val_preprocesado.csv`` and
         ``test_preprocesado.csv`` from the current working directory.
      2. Concatenate train and validation rows into one training set.
      3. Print the 5-fold CV RMSE of each base model and of the stacked
         ensemble (linear-regression meta-learner).
      4. Fit the ensemble on all training rows and save the test predictions
         to ``submission.csv`` with columns ``id`` and ``prezo_euros``.

    Raises
    ------
    ValueError
        If the test file lacks a feature column that the training data has.
    """
    # Load data
    train = pd.read_csv('train_preprocesado.csv')
    val = pd.read_csv('val_preprocesado.csv')
    test = pd.read_csv('test_preprocesado.csv')

    # Merge train + val into a single training set
    df = pd.concat([train, val], ignore_index=True)
    X = df.drop(['id', 'prezo_euros'], axis=1)
    y = df['prezo_euros']

    # Test data. Validate and align the feature columns now, before any
    # training happens: a missing or reordered column would otherwise only
    # surface at predict time, after all models have already been fitted.
    missing = [col for col in X.columns if col not in test.columns]
    if missing:
        raise ValueError(
            f"test_preprocesado.csv is missing feature columns: {missing}"
        )
    X_test = test[X.columns]
    ids = test['id']

    # Define the base models
    # NOTE(review): XGBoost >= 2.0 deprecates tree_method='gpu_hist' and the
    # 'predictor' parameter in favour of device='cuda' - confirm against the
    # installed version before upgrading.
    models = {
        'XGBoost': XGBRegressor(
            tree_method='gpu_hist', predictor='gpu_predictor',
            random_state=42
        ),
        'CatBoost': CatBoostRegressor(
            task_type='GPU', verbose=0, random_state=42
        ),
        'RandomForest': RandomForestRegressor(
            n_jobs=1, random_state=42
        ),
        'MLP': MLPRegressor(
            hidden_layer_sizes=(100, 100), max_iter=200,
            early_stopping=True, random_state=42
        )
    }

    # Evaluate each model on its own
    print("Evaluating individual models (5-fold CV RMSE):")
    for name, model in models.items():
        rmse = rmse_cv(model, X, y)
        print(f"{name}: {rmse:.2f}")

    # Stacking ensemble: base-model predictions feed a linear meta-learner
    estimators = list(models.items())
    stacking = StackingRegressor(
        estimators=estimators,
        final_estimator=LinearRegression(),
        cv=KFold(n_splits=5, shuffle=True, random_state=42)
    )
    rmse_stack = rmse_cv(stacking, X, y)
    print(f"Stacking Ensemble: {rmse_stack:.2f}")

    # Train the ensemble on all the data and generate the submission
    print("Training ensemble on full data and generating submission...")
    stacking.fit(X, y)
    preds = stacking.predict(X_test)

    submission = pd.DataFrame({'id': ids, 'prezo_euros': preds})
    submission.to_csv('submission.csv', index=False)
    print("Submission saved to submission.csv")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


Evaluating individual models (5-fold CV RMSE):
XGBoost: 53507.30
CatBoost: 50322.77
RandomForest: 55049.79


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 

MLP: 67221.81


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does 