In [1]:
# Modelado Avanzado para Competición Kaggle - Precios de Viviendas en Galicia
# =============================================================================
# Importación de bibliotecas necesarias
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import os
import joblib
import time
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: a single seed, also applied to NumPy's global generator
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Display settings: show every DataFrame column and use the viridis palette
pd.set_option('display.max_columns', None)
sns.set_palette("viridis")

# %%
# =============================================================================
# 1. LOAD PROCESSED DATA
# =============================================================================
print("Cargando datos procesados...")

# Both processed files must be present before either one is read
TRAIN_CSV, TEST_CSV = 'train_processed.csv', 'test_processed.csv'
if not all(os.path.exists(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV)):
    raise FileNotFoundError("No se encontraron los archivos procesados. Ejecute primero el script de preprocesamiento.")

train_data, test_data = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

print(f"Dimensiones del conjunto de entrenamiento: {train_data.shape}")
print(f"Dimensiones del conjunto de prueba: {test_data.shape}")

# %%
# =============================================================================
# 2. PREPARACIÓN PARA EL MODELADO
# =============================================================================
print("\nPreparando datos para el modelado...")

# Model the log-price column when it is available; otherwise fall back to the raw price
use_log = 'log_prezo' in train_data.columns
if use_log:
    print("Usando transformación logarítmica de precios para el modelado...")
    target_col = 'log_prezo'
else:
    print("Usando precios originales para el modelado...")
    target_col = 'prezo_euros'
y = train_data[target_col]

# Columns kept out of the feature matrix: ID, targets, outlier flag and the
# original (raw) categorical variables
exclude = ['id', 'prezo_euros', 'log_prezo', 'is_outlier',
           'tipo_edificacion', 'calidade_materiais', 'cor_favorita_propietario',
           'acceso_transporte_publico', 'orientacion', 'eficiencia_enerxetica']

feature_cols = [col for col in train_data.columns if col not in exclude]
X = train_data[feature_cols]

# Fill any remaining missing values with the per-column median
if X.isnull().values.any():
    print("ADVERTENCIA: Hay valores faltantes en las características. Imputando con la mediana...")
    X = X.fillna(X.median())

# Hold out 20% of the rows as a fixed validation set for the final evaluation
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_SEED
)
print(f"Tamaño del train: {X_train.shape}, validación: {X_val.shape}")


Cargando datos procesados...
Dimensiones del conjunto de entrenamiento: (18299, 73)
Dimensiones del conjunto de prueba: (10000, 71)

Preparando datos para el modelado...
Usando transformación logarítmica de precios para el modelado...
Tamaño del train: (14639, 63), validación: (3660, 63)


In [2]:

# =============================================================================
# 3. ANÁLISIS DE CARACTERÍSTICAS
# =============================================================================
print("\nAnalizando importancia de características...")

# A random forest fitted on the training split provides a first importance estimate
feature_analyzer = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1)
feature_analyzer.fit(X_train, y_train)

# One row per feature, most important first
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': feature_analyzer.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nTop 15 características más importantes:")
print(feature_importance.head(15))

# Bar chart of the 20 highest-ranked features, written to disk and then closed
top20_importance = feature_importance.head(20)
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=top20_importance)
plt.title('Importancia de Características (Random Forest)')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Names of the 20 most important features, kept for optional filtering later
top_features = top20_importance['feature'].tolist()

# Add degree-2 polynomial features (squares and pairwise products) for the top 5 features
print("\nCreando características polinómicas para las top 5 características...")
poly_features = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
top5_features = feature_importance.head(5)['feature'].tolist()

# The expanded frames must carry the index of the frame they were derived from.
# X_train / X_val keep the shuffled row labels produced by train_test_split, and
# pd.concat(axis=1) aligns on the index: with a default RangeIndex the result is
# the union of both indexes (extra NaN-padded rows, features on the wrong rows).
poly_features_df = pd.DataFrame(
    poly_features.fit_transform(X_train[top5_features]),
    columns=poly_features.get_feature_names_out(top5_features),
    index=X_train.index
)

poly_features_df_val = pd.DataFrame(
    poly_features.transform(X_val[top5_features]),
    columns=poly_features.get_feature_names_out(top5_features),
    index=X_val.index
)

# Drop the original (degree-1) columns to avoid duplicating what X_train / X_val already hold
poly_features_df = poly_features_df.drop(columns=top5_features)
poly_features_df_val = poly_features_df_val.drop(columns=top5_features)

# Append the new columns to the original feature matrices (row-aligned via the shared index)
X_train_poly = pd.concat([X_train, poly_features_df], axis=1)
X_val_poly = pd.concat([X_val, poly_features_df_val], axis=1)

# Concatenating columns must never add rows; fail loudly if alignment ever breaks again
assert len(X_train_poly) == len(X_train) and len(X_val_poly) == len(X_val)

print(f"Tamaño del conjunto de datos con características polinómicas: {X_train_poly.shape}")

# %%


Analizando importancia de características...

Top 15 características más importantes:
                               feature  importance
14                    superficie_total    0.751029
15             ratio_interior_exterior    0.073838
25                      calidade_valor    0.018210
11               numero_arboles_xardin    0.012165
18                         dist_coruna    0.010506
5                            lonxitude    0.009314
10                indice_criminalidade    0.009043
13           superficie_por_habitacion    0.007520
6                             latitude    0.007069
16                      densidad_banos    0.007037
7   temperatura_media_mes_construccion    0.006974
8                  distancia_centro_km    0.006619
1               superficie_exterior_m2    0.006360
20                       dist_santiago    0.006247
9                  distancia_escola_km    0.005888

Creando características polinómicas para las top 5 características...
Tamaño del conjunto de dat

In [3]:
# =============================================================================
# 4. FUNCIONES AUXILIARES PARA EVALUACIÓN
# =============================================================================
def evaluate_model(model, X_val, y_val, use_log=False):
    """Score a fitted model on a validation set.

    Predictions are taken from ``model.predict(X_val)``. When ``use_log`` is
    true, both the predictions and ``y_val`` are passed through ``np.expm1``
    before the metrics are computed.

    Returns a dict with the keys 'MAE', 'RMSE' and 'R2'.
    """
    predicted = model.predict(X_val)
    actual = y_val
    if use_log:
        predicted, actual = np.expm1(predicted), np.expm1(y_val)

    squared_error = mean_squared_error(actual, predicted)
    return {
        'MAE': mean_absolute_error(actual, predicted),
        'RMSE': np.sqrt(squared_error),
        'R2': r2_score(actual, predicted)
    }

def cross_validate_model(model, X, y, cv=5, use_log=False):
    """Run shuffled K-fold cross-validation and return averaged metrics.

    Each fold is fitted on a fresh ``clone`` of ``model``, so the estimator
    passed in is never refitted. Callers fit a model on the training split,
    store it, and then call this function with the same object; refitting it
    here would silently replace that fit with one from the last CV fold.

    Parameters
    ----------
    model : estimator supporting ``get_params`` (required by ``clone``),
        ``fit`` and ``predict``.
    X : DataFrame of features (indexed positionally with ``.iloc``).
    y : Series of targets (indexed positionally with ``.iloc``).
    cv : number of folds.
    use_log : when True, predictions and fold targets are mapped through
        ``np.expm1`` before scoring.

    Returns
    -------
    dict with the mean and standard deviation of MAE, RMSE and R2 across
    folds, under the keys 'MAE_CV', 'MAE_STD', 'RMSE_CV', 'RMSE_STD',
    'R2_CV' and 'R2_STD'.
    """
    # Local import keeps this helper self-contained; sklearn is already a
    # dependency of this notebook.
    from sklearn.base import clone

    kf = KFold(n_splits=cv, shuffle=True, random_state=RANDOM_SEED)
    maes, rmses, r2s = [], [], []
    is_xgb = isinstance(model, xgb.XGBRegressor)

    for train_idx, val_idx in kf.split(X):
        X_tr = X.iloc[train_idx]
        X_va = X.iloc[val_idx]
        y_tr = y.iloc[train_idx]
        y_va = y.iloc[val_idx]

        # XGBoost models are fitted and queried with plain NumPy arrays
        if is_xgb:
            X_tr = X_tr.values
            X_va = X_va.values
            y_tr = y_tr.values

        # Unfitted copy with identical hyper-parameters; leaves `model` untouched
        fold_model = clone(model)
        fold_model.fit(X_tr, y_tr)
        y_pred = fold_model.predict(X_va)

        if use_log:
            y_pred = np.expm1(y_pred)
            y_va_true = np.expm1(y_va)
        else:
            y_va_true = y_va

        maes.append(mean_absolute_error(y_va_true, y_pred))
        rmses.append(np.sqrt(mean_squared_error(y_va_true, y_pred)))
        r2s.append(r2_score(y_va_true, y_pred))

    return {
        'MAE_CV': np.mean(maes),
        'MAE_STD': np.std(maes),
        'RMSE_CV': np.mean(rmses),
        'RMSE_STD': np.std(rmses),
        'R2_CV': np.mean(r2s),
        'R2_STD': np.std(r2s)
    }

def make_submission(model, test_data, feature_cols, filename='submission.csv', use_log=False,
                    poly_features=None, top5_features=None):
    """Predict on the test set, write a Kaggle submission CSV and plot the predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    test_data : DataFrame holding an 'id' column and every column in ``feature_cols``.
    feature_cols : list of column names used as model inputs.
    filename : path of the CSV file to write.
    use_log : when True, predictions are mapped back with ``np.expm1``.
    poly_features : optional fitted transformer exposing ``transform`` and
        ``get_feature_names_out``; only used together with ``top5_features``.
    top5_features : optional list of the columns ``poly_features`` is applied to.

    Returns
    -------
    DataFrame with the columns 'id' and 'prezo_euros'.

    Side effects: writes ``filename`` and 'predicciones_distribucion.png'.
    """
    X_test = test_data[feature_cols]

    # Impute missing values, if any, with the per-column median of the test features
    if X_test.isnull().sum().sum() > 0:
        X_test = X_test.fillna(X_test.median())

    # Apply the polynomial expansion when a transformer and its columns are supplied
    if poly_features is not None and top5_features is not None:
        poly_features_test = pd.DataFrame(
            poly_features.transform(X_test[top5_features]),
            columns=poly_features.get_feature_names_out(top5_features),
            # Reuse the test index: pd.concat(axis=1) aligns on index labels, so a
            # default RangeIndex would misalign rows whenever test_data is not
            # indexed 0..n-1.
            index=X_test.index
        )
        # Drop the original columns, which X_test already contains
        poly_features_test = poly_features_test.drop(columns=top5_features)
        # Append the new columns to the original features
        X_test = pd.concat([X_test, poly_features_test], axis=1)

    # Predict
    preds = model.predict(X_test)

    # Undo the log transform when the model was trained on log prices
    if use_log:
        preds = np.expm1(preds)

    # Build and save the submission
    submission = pd.DataFrame({'id': test_data['id'], 'prezo_euros': preds})
    submission.to_csv(filename, index=False)
    print(f"Submission guardado como {filename}")

    # Plot the distribution of predictions and save it to disk
    plt.figure(figsize=(10, 6))
    sns.histplot(preds, bins=50, kde=True)
    plt.title('Distribución de Predicciones')
    plt.xlabel('Precio (euros)')
    plt.ylabel('Frecuencia')
    plt.savefig('predicciones_distribucion.png')
    plt.close()

    return submission

In [4]:
# %%
# =============================================================================
# 5. ENTRENAMIENTO DE MODELOS BÁSICOS MEJORADOS
# =============================================================================
print("\nEntrenando modelos básicos...")

# Registry of results, keyed by model name
models = {}

def train_and_record(name, model, X_tr=X_train, X_va=X_val):
    """Fit ``model``, score it and store the outcome in ``models[name]``.

    The model is fitted on ``X_tr`` with the global ``y_train`` and scored on
    ``X_va`` against the global ``y_val``. Cross-validation is always run on
    the global ``X`` and ``y``, whatever ``X_tr`` / ``X_va`` were passed.
    The stored record merges both metric dicts and adds 'train_time'
    (seconds spent in ``fit``) and 'model'.
    """
    # XGBoost is given plain NumPy arrays; every other estimator gets the inputs as-is
    is_xgb = isinstance(model, xgb.XGBRegressor)
    fit_features = X_tr.values if is_xgb else X_tr
    fit_target = y_train.values if is_xgb else y_train

    started = time.time()
    model.fit(fit_features, fit_target)
    t = time.time() - started

    holdout_features = X_va.values if is_xgb else X_va
    ev = evaluate_model(model, holdout_features, y_val, use_log)
    cv = cross_validate_model(model, X, y, cv=5, use_log=use_log)

    models[name] = {**ev, **cv, 'train_time': t, 'model': model}
    print(f"{name}: MAE={ev['MAE']:.2f}, RMSE={ev['RMSE']:.2f}, "
          f"R2={ev['R2']:.4f}, time={t:.1f}s")

# Constructor settings shared by several of the baseline estimators
_booster_kwargs = dict(n_estimators=200, learning_rate=0.05, n_jobs=-1, random_state=RANDOM_SEED)
_catboost_kwargs = dict(iterations=200, learning_rate=0.05, verbose=0, random_state=RANDOM_SEED)
_forest_kwargs = dict(n_estimators=200, n_jobs=-1, random_state=RANDOM_SEED)

# Baseline algorithms to try, as (name, estimator) pairs
algorithms = [
    ('XGBoost', xgb.XGBRegressor(**_booster_kwargs)),
    ('LightGBM', lgb.LGBMRegressor(**_booster_kwargs)),
    ('CatBoost', cb.CatBoostRegressor(**_catboost_kwargs)),
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge(alpha=1.0, random_state=RANDOM_SEED)),
    ('Lasso', Lasso(alpha=0.001, max_iter=10000, random_state=RANDOM_SEED)),
    ('ElasticNet', ElasticNet(alpha=0.001, l1_ratio=0.5, max_iter=10000, random_state=RANDOM_SEED)),
    ('HuberRegressor', HuberRegressor(epsilon=1.35, max_iter=1000)),
    ('RandomForest', RandomForestRegressor(**_forest_kwargs)),
    ('ExtraTrees', ExtraTreesRegressor(**_forest_kwargs)),
    ('GradientBoosting', GradientBoostingRegressor(n_estimators=200, random_state=RANDOM_SEED)),
]

# Train every baseline on the original features
print("\nEntrenando con características originales:")
for model_name, estimator in algorithms:
    train_and_record(model_name, estimator)

# Boosting models intended for the polynomial feature matrices
print("\nEntrenando modelos con características polinómicas:")
boosting_algorithms = [
    ('XGB_Poly', xgb.XGBRegressor(**_booster_kwargs)),
    ('LGBM_Poly', lgb.LGBMRegressor(**_booster_kwargs)),
    ('CatBoost_Poly', cb.CatBoostRegressor(**_catboost_kwargs)),
]

# NOTE(review): the polynomial training loop is disabled, so none of the
# '*_Poly' models above is fitted or recorded. Left commented out as found:
#for n, m in boosting_algorithms:
    #train_and_record(n, m, X_train_poly, X_val_poly)

print("\nResumen inicial:")

# One row per trained model, restricted to the headline metrics, best hold-out MAE first
_summary_metrics = ('MAE', 'RMSE', 'R2', 'MAE_CV', 'R2_CV')
init_results = pd.DataFrame({
    model_name: {metric: record[metric] for metric in _summary_metrics}
    for model_name, record in models.items()
}).T.sort_values('MAE')

print(init_results)

# Bar chart comparing hold-out MAE across models, written to disk and then closed
_mae_plot_data = init_results.reset_index().rename(columns={'index': 'Modelo'})
plt.figure(figsize=(12, 6))
sns.barplot(x=init_results.index, y='MAE', data=_mae_plot_data)
plt.title('Comparación de MAE por Modelo')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.close()

# %%


Entrenando modelos básicos...

Entrenando con características originales:
XGBoost: MAE=31315.25, RMSE=44825.64, R2=0.9246, time=12.6s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050808 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4477
[LightGBM] [Info] Number of data points in the train set: 14639, number of used features: 63
[LightGBM] [Info] Start training from score 12.146740
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046284 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4477
[LightGBM] [Info] Number of data points in the train set: 14639, number of used features: 63
[LightGBM] [Info] Start training from score 12.146740
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info]

XGBoostError: [17:32:11] ../src/data/data.cc:452: Check failed: this->labels.Size() % this->num_row_ == 0 (14639 vs. 0) : Incorrect size for labels.
Stack trace:
  [bt] (0) /mnt/netapp2/Store_uni/home/usc/ci/avs/personal/aprendizaje/p4/venvP4/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x16b9c9) [0x7fb768f1b9c9]
  [bt] (1) /mnt/netapp2/Store_uni/home/usc/ci/avs/personal/aprendizaje/p4/venvP4/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x177f8a) [0x7fb768f27f8a]
  [bt] (2) /mnt/netapp2/Store_uni/home/usc/ci/avs/personal/aprendizaje/p4/venvP4/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x179d10) [0x7fb768f29d10]
  [bt] (3) /mnt/netapp2/Store_uni/home/usc/ci/avs/personal/aprendizaje/p4/venvP4/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xa4) [0x7fb768e6c754]
  [bt] (4) /mnt/netapp1/Optcesga_FT2_RHEL7/2020/gentoo/22072020/usr/lib64/libffi.so.7(+0x6bdd) [0x7fb7b9f71bdd]
  [bt] (5) /mnt/netapp1/Optcesga_FT2_RHEL7/2020/gentoo/22072020/usr/lib64/libffi.so.7(+0x6149) [0x7fb7b9f71149]
  [bt] (6) /mnt/netapp1/Optcesga_FT2_RHEL7/2020/gentoo/22072020/usr/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2f6) [0x7fb7b8852176]
  [bt] (7) /mnt/netapp1/Optcesga_FT2_RHEL7/2020/gentoo/22072020/usr/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0xa341) [0x7fb7b884b341]
  [bt] (8) /mnt/netapp1/Optcesga_FT2_RHEL7/2020/gentoo/22072020/usr/lib64/libpython3.7m.so.1.0(_PyObject_FastCallKeywords+0xd7) [0x7fb7bc31c307]



In [5]:
print("\nResumen inicial:")

# One row per trained model, restricted to the headline metrics, best hold-out MAE first
_summary_metrics = ('MAE', 'RMSE', 'R2', 'MAE_CV', 'R2_CV')
init_results = pd.DataFrame({
    model_name: {metric: record[metric] for metric in _summary_metrics}
    for model_name, record in models.items()
}).T.sort_values('MAE')

print(init_results)

# Bar chart comparing hold-out MAE across models, written to disk and then closed
_mae_plot_data = init_results.reset_index().rename(columns={'index': 'Modelo'})
plt.figure(figsize=(12, 6))
sns.barplot(x=init_results.index, y='MAE', data=_mae_plot_data)
plt.title('Comparación de MAE por Modelo')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.close()


Resumen inicial:
                           MAE          RMSE        R2        MAE_CV     R2_CV
CatBoost          31024.128159  44614.429490  0.925288  31198.527833  0.926501
LightGBM          31063.139412  44594.832447  0.925353  31177.872447  0.926937
XGBoost           31315.254844  44825.635264  0.924578  31322.882352  0.926336
GradientBoosting  31607.477038  45191.647352  0.923342  31459.520185  0.925520
ExtraTrees        33340.314376  47441.480077  0.915519  33199.166818  0.917942
RandomForest      33548.969020  47597.978540  0.914961  33638.606645  0.915576
HuberRegressor    35916.036264  65209.613107  0.840388  36575.046703  0.779370
ElasticNet        36243.124905  55386.424869  0.884854  36740.238211  0.861187
Lasso             36272.524831  55495.746006  0.884399  36782.050382  0.860836
Ridge             36276.807190  55237.845212  0.885471  36752.244745  0.862013
LinearRegression  36439.646462  55665.831086  0.883689  36857.916454  0.860847


In [6]:
# =============================================================================
# 6. BÚSQUEDA EXHAUSTIVA DE HIPERPARÁMETROS PARA TOP MODELOS
# =============================================================================
# Keep the five models with the lowest hold-out MAE (init_results is sorted by MAE)
top5 = list(init_results.head(5).index)
print(f"\nSeleccionados para optimización: {top5}")

# Candidate values that recur across several search spaces
_tree_counts = [100, 200, 300]
_learning_rates = [0.01, 0.05, 0.1]
_fractions = [0.8, 0.9, 1.0]
_small_alphas = [0.0001, 0.0005, 0.001, 0.005, 0.01]
_forest_depths = [None, 10, 20, 30]
_min_split_sizes = [2, 5, 10]
_min_leaf_sizes = [1, 2, 4]
_leaf_counts = [31, 50, 70]
_catboost_depths = [6, 8, 10]
_catboost_l2 = [1, 3, 5, 7]
_regularisation = [0, 0.1, 0.5]

# Hyper-parameter search space per model name (looked up by name in the tuning loop)
param_grids = {
    'Ridge': {
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    },
    'Lasso': {
        'alpha': _small_alphas,
    },
    'ElasticNet': {
        'alpha': _small_alphas,
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
    },
    'RandomForest': {
        'n_estimators': _tree_counts,
        'max_depth': _forest_depths,
        'min_samples_split': _min_split_sizes,
        'min_samples_leaf': _min_leaf_sizes,
        'max_features': ['sqrt', 'log2', None],
    },
    'ExtraTrees': {
        'n_estimators': _tree_counts,
        'max_depth': _forest_depths,
        'min_samples_split': _min_split_sizes,
        'min_samples_leaf': _min_leaf_sizes,
    },
    'GradientBoosting': {
        'n_estimators': _tree_counts,
        'learning_rate': _learning_rates,
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5],
        'subsample': _fractions,
    },
    'XGBoost': {
        'n_estimators': _tree_counts,
        'learning_rate': _learning_rates,
        'max_depth': [3, 5, 7, 9],
        'subsample': _fractions,
        'colsample_bytree': _fractions,
        'gamma': [0, 0.1, 0.2],
    },
    'XGB_Poly': {
        'n_estimators': _tree_counts,
        'learning_rate': _learning_rates,
        'max_depth': [3, 5, 7],
        'subsample': _fractions,
        'colsample_bytree': _fractions,
    },
    'LightGBM': {
        'n_estimators': _tree_counts,
        'learning_rate': _learning_rates,
        'num_leaves': _leaf_counts,
        'subsample': _fractions,
        'colsample_bytree': _fractions,
        'reg_alpha': _regularisation,
        'reg_lambda': _regularisation,
    },
    'LGBM_Poly': {
        'n_estimators': _tree_counts,
        'learning_rate': _learning_rates,
        'num_leaves': _leaf_counts,
        'subsample': _fractions,
        'colsample_bytree': _fractions,
    },
    'CatBoost': {
        'iterations': _tree_counts,
        'learning_rate': _learning_rates,
        'depth': _catboost_depths,
        'l2_leaf_reg': _catboost_l2,
        'border_count': [32, 64, 128],
    },
    'CatBoost_Poly': {
        'iterations': _tree_counts,
        'learning_rate': _learning_rates,
        'depth': _catboost_depths,
        'l2_leaf_reg': _catboost_l2,
    },
}

tuned_models = {}

# Only these constructor settings are carried over from each baseline estimator
_carried_params = ('random_state', 'n_jobs', 'verbose')

for name in top5:
    print(f"\nOptimizando {name}...")
    base = models[name]['model']

    # Models whose name ends in '_Poly' are tuned on the polynomial feature matrices
    use_poly = name.endswith('_Poly')
    X_tune, X_tune_val = (X_train_poly, X_val_poly) if use_poly else (X_train, X_val)

    # Fresh estimator of the same class, keeping only the carried-over settings
    _fresh_estimator = type(base)(
        **{k: v for k, v in base.get_params().items() if k in _carried_params}
    )

    # Randomized search (25 sampled combinations, 5-fold CV) instead of a full grid
    grid = RandomizedSearchCV(
        _fresh_estimator,
        param_grids.get(name, {}),
        n_iter=25,
        cv=5,
        scoring='neg_mean_absolute_error',
        verbose=1,
        n_jobs=-1,
        random_state=RANDOM_SEED
    )

    grid.fit(X_tune, y_train)
    best = grid.best_estimator_

    # Hold-out metrics on the matching validation matrix, then CV on the full X, y
    ev = evaluate_model(best, X_tune_val, y_val, use_log)
    cv = cross_validate_model(best, X, y, cv=5, use_log=use_log)

    tuned_models[name + '_tuned'] = {**ev, **cv, 'model': best, 'best_params': grid.best_params_}

    print(f"{name} mejores parámetros: {grid.best_params_}")
    print(f"{name} MAE: {ev['MAE']:.4f}, RMSE: {ev['RMSE']:.4f}, R2: {ev['R2']:.4f}")



Seleccionados para optimización: ['CatBoost', 'LightGBM', 'XGBoost', 'GradientBoosting', 'ExtraTrees']

Optimizando CatBoost...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
CatBoost mejores parámetros: {'learning_rate': 0.05, 'l2_leaf_reg': 1, 'iterations': 300, 'depth': 6, 'border_count': 32}
CatBoost MAE: 30719.4807, RMSE: 44111.9044, R2: 0.9270

Optimizando LightGBM...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080929 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4469
[LightGBM] [Info] Number of data points in the train set: 11711, number of used features: 63
[LightGBM] [Info] Start training from score 12.144955
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.950732 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4469
[Light

In [None]:
# %%
# =============================================================================
# 7. ENSAMBLADO DE MODELOS
# =============================================================================
print("\nCreando modelos de ensamblado...")

# Summarise the tuned models, best hold-out MAE first
_tuned_metric_keys = ('MAE', 'RMSE', 'R2', 'MAE_CV', 'R2_CV')
tuned_results = pd.DataFrame({
    tuned_name: {key: record[key] for key in _tuned_metric_keys}
    for tuned_name, record in tuned_models.items()
}).T.sort_values('MAE')

print("\nResultados de modelos tunados:")
print(tuned_results)

# The three best tuned models become the ensemble members
top3_names = tuned_results.index[:3].tolist()
top3_models = [tuned_models[tuned_name]['model'] for tuned_name in top3_names]


def _named_members():
    """Return (name, estimator) pairs for the top-3 models, with '_tuned' stripped from names."""
    return [(label.replace('_tuned', ''), member) for label, member in zip(top3_names, top3_models)]


# Voting model: weighted average of member predictions, largest weight on the best model
voting_regressor = VotingRegressor(
    estimators=_named_members(),
    weights=[0.4, 0.3, 0.3]
)

# If any selected model is a '*_Poly' variant, the ensembles use the polynomial matrices
use_poly_ensemble = any(name.startswith(('XGB_Poly', 'LGBM_Poly', 'CatBoost_Poly')) for name in top3_names)
X_ensemble, X_val_ensemble = (X_train_poly, X_val_poly) if use_poly_ensemble else (X_train, X_val)


def _fit_and_score(ensemble):
    """Fit on the ensemble training matrix; return (hold-out metrics, CV metrics)."""
    ensemble.fit(X_ensemble, y_train)
    holdout_metrics = evaluate_model(ensemble, X_val_ensemble, y_val, use_log)
    fold_metrics = cross_validate_model(ensemble, X, y, cv=5, use_log=use_log)
    return holdout_metrics, fold_metrics


print("\nEntrenando modelo de ensamblado (Voting)...")
ev_voting, cv_voting = _fit_and_score(voting_regressor)

models['VotingRegressor'] = {**ev_voting, **cv_voting, 'model': voting_regressor}
print(f"VotingRegressor: MAE={ev_voting['MAE']:.4f}, RMSE={ev_voting['RMSE']:.4f}, R2={ev_voting['R2']:.4f}")

# Stacking model: the same members, combined by a Ridge meta-learner
base_models = _named_members()
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(alpha=1.0, random_state=RANDOM_SEED)
)

print("\nEntrenando modelo de ensamblado (Stacking)...")
ev_stacking, cv_stacking = _fit_and_score(stacking_regressor)

models['StackingRegressor'] = {**ev_stacking, **cv_stacking, 'model': stacking_regressor}
print(f"StackingRegressor: MAE={ev_stacking['MAE']:.4f}, RMSE={ev_stacking['RMSE']:.4f}, R2={ev_stacking['R2']:.4f}")


In [None]:
# %%
# =============================================================================
# 8. RESUMEN FINAL Y SELECCIÓN DE MEJOR MODELO
# =============================================================================
# Combinar todos los resultados
_report_keys = ('MAE', 'RMSE', 'R2', 'MAE_CV', 'R2_CV')
_ensemble_names = ['VotingRegressor', 'StackingRegressor']


def _pick_metrics(model_info):
    """Return only the reported metrics from a stored model record."""
    return {key: model_info[key] for key in _report_keys}


# Baseline entries are limited to the ensembles and the five models selected for tuning
all_results = {
    name: _pick_metrics(model_info)
    for name, model_info in models.items()
    if name in _ensemble_names or name in top5
}

# Every tuned model is reported
all_results.update({
    name: _pick_metrics(model_info)
    for name, model_info in tuned_models.items()
})

final_results = pd.DataFrame(all_results).T.sort_values('MAE')

print("\nResumen final de todos los modelos:")
print(final_results)

# Persist the comparison table
final_results.to_csv('model_results.csv')

# The first row has the lowest hold-out MAE; fetch its estimator from the matching registry
best_name = final_results.index[0]
_best_registry = tuned_models if best_name in tuned_models else models
best_model = _best_registry[best_name]['model']

print(f"\nMejor modelo final: {best_name} con MAE {final_results.loc[best_name, 'MAE']:.4f}")

# Persist the winning estimator
joblib.dump(best_model, f"best_model_{best_name}.pkl")
print(f"Mejor modelo guardado como best_model_{best_name}.pkl")

# %%

In [None]:
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
# Modelado Avanzado para Competición Kaggle - Precios de Viviendas en Galicia
# =============================================================================
# Importación de bibliotecas necesarias
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import os
import joblib
import time
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: a single seed, also applied to NumPy's global generator
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Display settings: show every DataFrame column and use the viridis palette
pd.set_option('display.max_columns', None)
sns.set_palette("viridis")

# %%
# =============================================================================
# 1. LOAD PROCESSED DATA
# =============================================================================
print("Cargando datos procesados...")

# Both processed files must be present before either one is read
TRAIN_CSV, TEST_CSV = 'train_processed.csv', 'test_processed.csv'
if not all(os.path.exists(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV)):
    raise FileNotFoundError("No se encontraron los archivos procesados. Ejecute primero el script de preprocesamiento.")

train_data, test_data = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

print(f"Dimensiones del conjunto de entrenamiento: {train_data.shape}")
print(f"Dimensiones del conjunto de prueba: {test_data.shape}")

# %%
# =============================================================================
# 2. PREPARACIÓN PARA EL MODELADO
# =============================================================================
print("\nPreparando datos para el modelado...")

# Model the log-price column when it is available; otherwise fall back to the raw price
use_log = 'log_prezo' in train_data.columns
if use_log:
    print("Usando transformación logarítmica de precios para el modelado...")
    target_col = 'log_prezo'
else:
    print("Usando precios originales para el modelado...")
    target_col = 'prezo_euros'
y = train_data[target_col]

# Columns kept out of the feature matrix: ID, targets, outlier flag and the
# original (raw) categorical variables
exclude = ['id', 'prezo_euros', 'log_prezo', 'is_outlier',
           'tipo_edificacion', 'calidade_materiais', 'cor_favorita_propietario',
           'acceso_transporte_publico', 'orientacion', 'eficiencia_enerxetica']

feature_cols = [col for col in train_data.columns if col not in exclude]
X = train_data[feature_cols]

# Fill any remaining missing values with the per-column median
if X.isnull().values.any():
    print("ADVERTENCIA: Hay valores faltantes en las características. Imputando con la mediana...")
    X = X.fillna(X.median())

# Hold out 20% of the rows as a fixed validation set for the final evaluation
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_SEED
)
print(f"Tamaño del train: {X_train.shape}, validación: {X_val.shape}")

# %%
# =============================================================================
# 3. ANÁLISIS DE CARACTERÍSTICAS
# =============================================================================
print("\nAnalizando importancia de características...")

# A random forest fitted on the training split provides a first importance estimate
feature_analyzer = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1)
feature_analyzer.fit(X_train, y_train)

# One row per feature, most important first
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': feature_analyzer.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nTop 15 características más importantes:")
print(feature_importance.head(15))

# Bar chart of the 20 highest-ranked features, written to disk and then closed
top20_importance = feature_importance.head(20)
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=top20_importance)
plt.title('Importancia de Características (Random Forest)')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Names of the 20 most important features, kept for optional filtering later
top_features = top20_importance['feature'].tolist()

# Add degree-2 polynomial features (squares and pairwise products) for the top 5 features
print("\nCreando características polinómicas para las top 5 características...")
poly_features = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
top5_features = feature_importance.head(5)['feature'].tolist()

# The expanded frames must carry the index of the frame they were derived from.
# X_train / X_val keep the shuffled row labels produced by train_test_split, and
# pd.concat(axis=1) aligns on the index: with a default RangeIndex the result is
# the union of both indexes (extra NaN-padded rows, features on the wrong rows).
poly_features_df = pd.DataFrame(
    poly_features.fit_transform(X_train[top5_features]),
    columns=poly_features.get_feature_names_out(top5_features),
    index=X_train.index
)

poly_features_df_val = pd.DataFrame(
    poly_features.transform(X_val[top5_features]),
    columns=poly_features.get_feature_names_out(top5_features),
    index=X_val.index
)

# Drop the original (degree-1) columns to avoid duplicating what X_train / X_val already hold
poly_features_df = poly_features_df.drop(columns=top5_features)
poly_features_df_val = poly_features_df_val.drop(columns=top5_features)

# Append the new columns to the original feature matrices (row-aligned via the shared index)
X_train_poly = pd.concat([X_train, poly_features_df], axis=1)
X_val_poly = pd.concat([X_val, poly_features_df_val], axis=1)

# Concatenating columns must never add rows; fail loudly if alignment ever breaks again
assert len(X_train_poly) == len(X_train) and len(X_val_poly) == len(X_val)

print(f"Tamaño del conjunto de datos con características polinómicas: {X_train_poly.shape}")

# %%
# =============================================================================
# 4. FUNCIONES AUXILIARES PARA EVALUACIÓN
# =============================================================================
def evaluate_model(model, X_val, y_val, use_log=False):
    """Score a fitted model on a hold-out set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_val: Validation features passed to ``model.predict``.
        y_val: Validation targets.
        use_log: When True, predictions and targets are both mapped through
            ``np.expm1`` before the metrics are computed.

    Returns:
        dict with the keys 'MAE', 'RMSE' and 'R2'.
    """
    predicted = model.predict(X_val)
    actual = y_val

    # Apply the same inverse transform to both sides so they stay comparable
    if use_log:
        predicted, actual = np.expm1(predicted), np.expm1(y_val)

    metrics = {}
    metrics['MAE'] = mean_absolute_error(actual, predicted)
    metrics['RMSE'] = np.sqrt(mean_squared_error(actual, predicted))
    metrics['R2'] = r2_score(actual, predicted)
    return metrics

def cross_validate_model(model, X, y, cv=5, use_log=False):
    """K-fold cross-validate an estimator and summarise the per-fold metrics.

    Every fold trains a fresh, unfitted clone of ``model``. The estimator that
    is passed in is never refitted, so a caller that fitted it beforehand (on
    a different matrix, possibly with different columns) keeps that fit.

    Args:
        model: scikit-learn compatible estimator (must support ``clone``).
        X: Feature DataFrame; rows are selected positionally with ``.iloc``.
        y: Target Series; rows are selected positionally with ``.iloc``.
        cv: Number of folds for the shuffled ``KFold`` (seeded with RANDOM_SEED).
        use_log: When True, predictions and targets are both mapped through
            ``np.expm1`` before the metrics are computed.

    Returns:
        dict with mean and standard deviation across folds:
        'MAE_CV', 'MAE_STD', 'RMSE_CV', 'RMSE_STD', 'R2_CV', 'R2_STD'.
    """
    # Local import: ``clone`` is not among the names imported at the top of the file.
    from sklearn.base import clone

    kf = KFold(n_splits=cv, shuffle=True, random_state=RANDOM_SEED)
    maes, rmses, r2s = [], [], []

    for train_idx, val_idx in kf.split(X):
        X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

        # Fit a copy so the caller's (already fitted) estimator is left untouched
        fold_model = clone(model)
        fold_model.fit(X_tr, y_tr)
        y_pred = fold_model.predict(X_va)

        if use_log:
            y_pred = np.expm1(y_pred)
            y_va_true = np.expm1(y_va)
        else:
            y_va_true = y_va

        maes.append(mean_absolute_error(y_va_true, y_pred))
        rmses.append(np.sqrt(mean_squared_error(y_va_true, y_pred)))
        r2s.append(r2_score(y_va_true, y_pred))

    return {
        'MAE_CV': np.mean(maes),
        'MAE_STD': np.std(maes),
        'RMSE_CV': np.mean(rmses),
        'RMSE_STD': np.std(rmses),
        'R2_CV': np.mean(r2s),
        'R2_STD': np.std(r2s)
    }

def make_submission(model, test_data, feature_cols, filename='submission.csv', use_log=False,
                    poly_features=None, top5_features=None):
    """Predict on the test set, write a Kaggle submission CSV and plot the predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_data: DataFrame holding the columns in ``feature_cols`` and an 'id' column.
        feature_cols: Columns of ``test_data`` used as model inputs.
        filename: Path of the CSV file to write.
        use_log: When True, predictions are mapped through ``np.expm1``.
        poly_features: Optional fitted transformer exposing ``transform`` and
            ``get_feature_names_out``. Only applied when ``top5_features`` is
            given as well.
        top5_features: Columns fed to ``poly_features``.

    Returns:
        DataFrame with the columns 'id' and 'prezo_euros'.

    Side effects:
        Writes ``filename`` and 'predicciones_distribucion.png'.
    """
    X_test = test_data[feature_cols]

    # Fill missing values with each column's median over the test set
    if X_test.isnull().sum().sum() > 0:
        X_test = X_test.fillna(X_test.median())

    # Append the polynomial columns when a transformer was supplied
    if poly_features is not None and top5_features is not None:
        # Reuse X_test's row labels: the axis=1 concat below aligns on the
        # index, so a default 0..n-1 index would misalign rows (and pad with
        # NaN) whenever test_data does not carry a default index.
        poly_features_test = pd.DataFrame(
            poly_features.transform(X_test[top5_features]),
            columns=poly_features.get_feature_names_out(top5_features),
            index=X_test.index
        )
        # Drop the degree-1 terms, which duplicate columns already in X_test
        poly_features_test = poly_features_test.drop(columns=top5_features)
        X_test = pd.concat([X_test, poly_features_test], axis=1)

    preds = model.predict(X_test)

    # Map predictions back when the model was trained on a transformed target
    if use_log:
        preds = np.expm1(preds)

    # Build and save the submission file
    submission = pd.DataFrame({'id': test_data['id'], 'prezo_euros': preds})
    submission.to_csv(filename, index=False)
    print(f"Submission guardado como {filename}")

    # Save a histogram of the predicted values as a quick sanity check
    plt.figure(figsize=(10, 6))
    sns.histplot(preds, bins=50, kde=True)
    plt.title('Distribución de Predicciones')
    plt.xlabel('Precio (euros)')
    plt.ylabel('Frecuencia')
    plt.savefig('predicciones_distribucion.png')
    plt.close()

    return submission

# %%
# =============================================================================
# 5. ENTRENAMIENTO DE MODELOS BÁSICOS MEJORADOS
# =============================================================================
print("\nEntrenando modelos básicos...")

# Registry of every trained model: name -> metrics, training time and estimator
models = {}

def train_and_record(name, model, X_tr=X_train, X_va=X_val):
    """Fit ``model``, score it and store the outcome in ``models[name]``.

    The model is fitted on ``X_tr`` against the global ``y_train`` and scored
    on ``X_va`` against the global ``y_val``. Cross-validation always runs on
    the global ``X`` / ``y``, whatever ``X_tr`` and ``X_va`` are.

    Args:
        name: Key under which the results are stored in ``models``.
        model: Unfitted estimator.
        X_tr: Training features (default bound to ``X_train`` at definition time).
        X_va: Validation features (default bound to ``X_val`` at definition time).
    """
    started = time.time()
    model.fit(X_tr, y_train)
    elapsed = time.time() - started

    holdout = evaluate_model(model, X_va, y_val, use_log)
    crossval = cross_validate_model(model, X, y, cv=5, use_log=use_log)

    # Hold-out metrics first, then CV metrics, then bookkeeping entries
    record = dict(holdout)
    record.update(crossval)
    record['train_time'] = elapsed
    record['model'] = model
    models[name] = record

    print(f"{name}: MAE={holdout['MAE']:.2f}, RMSE={holdout['RMSE']:.2f}, R2={holdout['R2']:.4f}, time={elapsed:.1f}s")

# Keyword sets shared by several of the estimators below
_SEEDED = {'random_state': RANDOM_SEED}
_FOREST_KW = {'n_estimators': 200, 'n_jobs': -1, 'random_state': RANDOM_SEED}
_BOOST_KW = {'n_estimators': 200, 'learning_rate': 0.05, 'n_jobs': -1, 'random_state': RANDOM_SEED}
_CATBOOST_KW = {'iterations': 200, 'learning_rate': 0.05, 'verbose': 0, 'random_state': RANDOM_SEED}

# Baseline algorithms to try
algorithms = [
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge(alpha=1.0, **_SEEDED)),
    ('Lasso', Lasso(alpha=0.001, max_iter=10000, **_SEEDED)),
    ('ElasticNet', ElasticNet(alpha=0.001, l1_ratio=0.5, max_iter=10000, **_SEEDED)),
    ('HuberRegressor', HuberRegressor(epsilon=1.35, max_iter=1000)),
    ('RandomForest', RandomForestRegressor(**_FOREST_KW)),
    ('ExtraTrees', ExtraTreesRegressor(**_FOREST_KW)),
    ('GradientBoosting', GradientBoostingRegressor(n_estimators=200, **_SEEDED)),
    ('XGBoost', xgb.XGBRegressor(**_BOOST_KW)),
    ('LightGBM', lgb.LGBMRegressor(**_BOOST_KW)),
    ('CatBoost', cb.CatBoostRegressor(**_CATBOOST_KW)),
]

# Train every baseline on the original features
print("\nEntrenando con características originales:")
for algo_name, estimator in algorithms:
    train_and_record(algo_name, estimator)

# Train the boosting libraries again on the polynomial feature set
print("\nEntrenando modelos con características polinómicas:")
boosting_algorithms = [
    ('XGB_Poly', xgb.XGBRegressor(**_BOOST_KW)),
    ('LGBM_Poly', lgb.LGBMRegressor(**_BOOST_KW)),
    ('CatBoost_Poly', cb.CatBoostRegressor(**_CATBOOST_KW)),
]

for algo_name, estimator in boosting_algorithms:
    train_and_record(algo_name, estimator, X_train_poly, X_val_poly)

# One row per model, ordered by validation MAE (best first)
print("\nResumen inicial:")
_SUMMARY_COLUMNS = ('MAE', 'RMSE', 'R2', 'MAE_CV', 'R2_CV')
_summary_rows = {}
for _model_name, _record in models.items():
    _summary_rows[_model_name] = {col: _record[col] for col in _SUMMARY_COLUMNS}
init_results = pd.DataFrame(_summary_rows).T.sort_values('MAE')

print(init_results)

# Bar chart comparing validation MAE across models
plt.figure(figsize=(12, 6))
_plot_frame = init_results.reset_index().rename(columns={'index': 'Modelo'})
sns.barplot(x=init_results.index, y='MAE', data=_plot_frame)
plt.title('Comparación de MAE por Modelo')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.close()

# %%
# =============================================================================
# 6. BÚSQUEDA EXHAUSTIVA DE HIPERPARÁMETROS PARA TOP MODELOS
# =============================================================================
# init_results is sorted by validation MAE, so its first five rows are the best five models
_best_five = init_results.index[:5]
top5 = _best_five.tolist()
print(f"\nSeleccionados para optimización: {top5}")

# Candidate values shared by several search spaces. They are kept as tuples
# and copied with list() at every use so no two grids share a mutable object.
_ROUNDS = (100, 200, 300)
_LEARNING_RATES = (0.01, 0.05, 0.1)
_FRACTIONS = (0.8, 0.9, 1.0)
_SMALL_ALPHAS = (0.0001, 0.0005, 0.001, 0.005, 0.01)
_FOREST_DEPTHS = (None, 10, 20, 30)
_SPLIT_SIZES = (2, 5, 10)
_LEAF_SIZES = (1, 2, 4)
_SHALLOW_DEPTHS = (3, 5, 7)
_LEAF_COUNTS = (31, 50, 70)
_CAT_DEPTHS = (6, 8, 10)
_CAT_L2 = (1, 3, 5, 7)
_PENALTIES = (0, 0.1, 0.5)

# Hyperparameter search space for every model name that can reach the tuning stage
param_grids = {}

# --- Linear models ---
param_grids['Ridge'] = dict(alpha=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0])
param_grids['Lasso'] = dict(alpha=list(_SMALL_ALPHAS))
param_grids['ElasticNet'] = dict(
    alpha=list(_SMALL_ALPHAS),
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
)

# --- Bagged tree ensembles ---
param_grids['RandomForest'] = dict(
    n_estimators=list(_ROUNDS),
    max_depth=list(_FOREST_DEPTHS),
    min_samples_split=list(_SPLIT_SIZES),
    min_samples_leaf=list(_LEAF_SIZES),
    max_features=['sqrt', 'log2', None],
)
param_grids['ExtraTrees'] = dict(
    n_estimators=list(_ROUNDS),
    max_depth=list(_FOREST_DEPTHS),
    min_samples_split=list(_SPLIT_SIZES),
    min_samples_leaf=list(_LEAF_SIZES),
)

# --- Gradient boosting (scikit-learn and XGBoost) ---
param_grids['GradientBoosting'] = dict(
    n_estimators=list(_ROUNDS),
    learning_rate=list(_LEARNING_RATES),
    max_depth=list(_SHALLOW_DEPTHS),
    min_samples_split=[2, 5],
    subsample=list(_FRACTIONS),
)
param_grids['XGBoost'] = dict(
    n_estimators=list(_ROUNDS),
    learning_rate=list(_LEARNING_RATES),
    max_depth=[3, 5, 7, 9],
    subsample=list(_FRACTIONS),
    colsample_bytree=list(_FRACTIONS),
    gamma=[0, 0.1, 0.2],
)
param_grids['XGB_Poly'] = dict(
    n_estimators=list(_ROUNDS),
    learning_rate=list(_LEARNING_RATES),
    max_depth=list(_SHALLOW_DEPTHS),
    subsample=list(_FRACTIONS),
    colsample_bytree=list(_FRACTIONS),
)

# --- LightGBM ---
param_grids['LightGBM'] = dict(
    n_estimators=list(_ROUNDS),
    learning_rate=list(_LEARNING_RATES),
    num_leaves=list(_LEAF_COUNTS),
    subsample=list(_FRACTIONS),
    colsample_bytree=list(_FRACTIONS),
    reg_alpha=list(_PENALTIES),
    reg_lambda=list(_PENALTIES),
)
param_grids['LGBM_Poly'] = dict(
    n_estimators=list(_ROUNDS),
    learning_rate=list(_LEARNING_RATES),
    num_leaves=list(_LEAF_COUNTS),
    subsample=list(_FRACTIONS),
    colsample_bytree=list(_FRACTIONS),
)

# --- CatBoost ---
param_grids['CatBoost'] = dict(
    iterations=list(_ROUNDS),
    learning_rate=list(_LEARNING_RATES),
    depth=list(_CAT_DEPTHS),
    l2_leaf_reg=list(_CAT_L2),
    border_count=[32, 64, 128],
)
param_grids['CatBoost_Poly'] = dict(
    iterations=list(_ROUNDS),
    learning_rate=list(_LEARNING_RATES),
    depth=list(_CAT_DEPTHS),
    l2_leaf_reg=list(_CAT_L2),
)

# Results of the hyperparameter search: '<name>_tuned' -> metrics, estimator, best params
tuned_models = {}

# Constructor arguments carried over from the baseline; everything else restarts from defaults
_CARRIED_PARAMS = ['random_state', 'n_jobs', 'verbose']

for name in top5:
    print(f"\nOptimizando {name}...")
    base = models[name]['model']

    # '*_Poly' models are tuned and validated on the polynomial feature matrices
    use_poly = name.endswith('_Poly')
    if use_poly:
        X_tune, X_tune_val = X_train_poly, X_val_poly
    else:
        X_tune, X_tune_val = X_train, X_val

    # Fresh estimator of the same class, keeping only the carried-over arguments
    carried = {}
    for param_name, param_value in base.get_params().items():
        if param_name in _CARRIED_PARAMS:
            carried[param_name] = param_value
    fresh_estimator = type(base)(**carried)

    # Randomized search over at most 25 sampled combinations.
    # Names without an entry in param_grids get an empty search space.
    grid = RandomizedSearchCV(
        estimator=fresh_estimator,
        param_distributions=param_grids.get(name, {}),
        n_iter=25,
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
        verbose=1,
        random_state=RANDOM_SEED
    )
    grid.fit(X_tune, y_train)
    best = grid.best_estimator_

    # Hold-out score on the matching validation matrix; CV runs on the global X / y
    ev = evaluate_model(best, X_tune_val, y_val, use_log)
    cv = cross_validate_model(best, X, y, cv=5, use_log=use_log)

    tuned_entry = dict(ev)
    tuned_entry.update(cv)
    tuned_entry['model'] = best
    tuned_entry['best_params'] = grid.best_params_
    tuned_models[name + '_tuned'] = tuned_entry

    print(f"{name} mejores parámetros: {grid.best_params_}")
    print(f"{name} MAE: {ev['MAE']:.4f}, RMSE: {ev['RMSE']:.4f}, R2: {ev['R2']:.4f}")

# %%
# =============================================================================
# 7. ENSAMBLADO DE MODELOS
# =============================================================================
print("\nCreando modelos de ensamblado...")

# Rank the tuned models by validation MAE (best first)
_REPORT_KEYS = ('MAE', 'RMSE', 'R2', 'MAE_CV', 'R2_CV')
tuned_results = pd.DataFrame({
    tuned_name: {key: info[key] for key in _REPORT_KEYS}
    for tuned_name, info in tuned_models.items()
}).T.sort_values('MAE')

print("\nResultados de modelos tunados:")
print(tuned_results)

# The three best tuned models become the ensemble members
top3_names = tuned_results.index[:3].tolist()
top3_models = [tuned_models[tuned_name]['model'] for tuned_name in top3_names]

# (label, estimator) pairs with the '_tuned' suffix stripped from the label
base_models = [
    (tuned_name.replace('_tuned', ''), member)
    for tuned_name, member in zip(top3_names, top3_models)
]

# Weighted average of the member predictions; the best model gets the largest weight
voting_regressor = VotingRegressor(
    estimators=list(base_models),
    weights=[0.4, 0.3, 0.3]
)

# If any member was built on polynomial features, train the whole ensemble on them
_POLY_PREFIXES = ('XGB_Poly', 'LGBM_Poly', 'CatBoost_Poly')
use_poly_ensemble = False
for tuned_name in top3_names:
    if tuned_name.startswith(_POLY_PREFIXES):
        use_poly_ensemble = True
        break

if use_poly_ensemble:
    X_ensemble, X_val_ensemble = X_train_poly, X_val_poly
else:
    X_ensemble, X_val_ensemble = X_train, X_val

print("\nEntrenando modelo de ensamblado (Voting)...")
voting_regressor.fit(X_ensemble, y_train)
ev_voting = evaluate_model(voting_regressor, X_val_ensemble, y_val, use_log)
cv_voting = cross_validate_model(voting_regressor, X, y, cv=5, use_log=use_log)

_voting_record = dict(ev_voting)
_voting_record.update(cv_voting)
_voting_record['model'] = voting_regressor
models['VotingRegressor'] = _voting_record
print(f"VotingRegressor: MAE={ev_voting['MAE']:.4f}, RMSE={ev_voting['RMSE']:.4f}, R2={ev_voting['R2']:.4f}")

# Stacking: a Ridge meta-learner on top of the same three members
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(alpha=1.0, random_state=RANDOM_SEED)
)

print("\nEntrenando modelo de ensamblado (Stacking)...")
stacking_regressor.fit(X_ensemble, y_train)
ev_stacking = evaluate_model(stacking_regressor, X_val_ensemble, y_val, use_log)
cv_stacking = cross_validate_model(stacking_regressor, X, y, cv=5, use_log=use_log)

_stacking_record = dict(ev_stacking)
_stacking_record.update(cv_stacking)
_stacking_record['model'] = stacking_regressor
models['StackingRegressor'] = _stacking_record
print(f"StackingRegressor: MAE={ev_stacking['MAE']:.4f}, RMSE={ev_stacking['RMSE']:.4f}, R2={ev_stacking['R2']:.4f}")

# %%
# =============================================================================
# 8. RESUMEN FINAL Y SELECCIÓN DE MEJOR MODELO
# =============================================================================
# Collect the metrics of the untuned top-5 models, the two ensembles and every tuned model
_FINAL_KEYS = ('MAE', 'RMSE', 'R2', 'MAE_CV', 'R2_CV')
_ENSEMBLE_KEYS = ['VotingRegressor', 'StackingRegressor']

all_results = {
    name: {key: model_info[key] for key in _FINAL_KEYS}
    for name, model_info in models.items()
    if name in _ENSEMBLE_KEYS or name in top5
}
all_results.update({
    name: {key: model_info[key] for key in _FINAL_KEYS}
    for name, model_info in tuned_models.items()
})

# One row per model, lowest validation MAE first
final_results = pd.DataFrame(all_results).T.sort_values('MAE')

print("\nResumen final de todos los modelos:")
print(final_results)

# Persist the comparison table
final_results.to_csv('model_results.csv')

# The winner is the first row; fetch its estimator from whichever registry holds it
best_name = final_results.index[0]
_winner_registry = tuned_models if best_name in tuned_models else models
best_model = _winner_registry[best_name]['model']

print(f"\nMejor modelo final: {best_name} con MAE {final_results.loc[best_name, 'MAE']:.4f}")

# Persist the winning estimator
joblib.dump(best_model, f"best_model_{best_name}.pkl")
print(f"Mejor modelo guardado como best_model_{best_name}.pkl")

# %%
# =============================================================================
# 9. GENERAR SUBMISSION FINAL
# =============================================================================
# Decide whether the test matrix needs the polynomial columns.
# Prefer asking the fitted model how many input columns it expects: that stays
# correct for a standalone '*_Poly' winner even when no ensemble member uses
# polynomial features (the previous name check required use_poly_ensemble for
# every model and disabled the expansion in that case).
# NOTE(review): assumes the non-polynomial training matrices have exactly
# len(feature_cols) columns - confirm against the preprocessing section.
_ENSEMBLE_NAMES = ('VotingRegressor', 'StackingRegressor')
_fitted_width = getattr(best_model, 'n_features_in_', None)
if _fitted_width is not None:
    use_poly_final = bool(_fitted_width > len(feature_cols))
else:
    # Fallback by name: '*_Poly' models always use the expansion, ensembles
    # only when they were trained on the polynomial matrices.
    use_poly_final = ('_Poly' in best_name) or (
        best_name.startswith(_ENSEMBLE_NAMES) and use_poly_ensemble
    )

# Generate the final predictions
submission = make_submission(
    best_model,
    test_data,
    feature_cols,
    filename=f'submission_{best_name}.csv',
    use_log=use_log,
    poly_features=poly_features if use_poly_final else None,
    top5_features=top5_features if use_poly_final else None
)

print("\nPreparación de submission completada. Aquí está una muestra:")
print(submission.head())

# Plot the most important features of the best model, when it exposes them.
# For ensembles the first fitted member is used as a stand-in.
if hasattr(best_model, 'feature_importances_'):
    _importance_source = best_model
elif (hasattr(best_model, 'estimators_')
        and len(best_model.estimators_) > 0
        and hasattr(best_model.estimators_[0], 'feature_importances_')):
    _importance_source = best_model.estimators_[0]
else:
    _importance_source = None

if _importance_source is not None:
    try:
        importances = _importance_source.feature_importances_
        # Column labels must match the matrix the model was trained on
        feature_names = X_train_poly.columns if use_poly_final else X.columns

        feature_imp = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(12, 8))
        sns.barplot(x='importance', y='feature', data=feature_imp.head(20))
        plt.title(f'Importancia de Características ({best_name})')
        plt.tight_layout()
        plt.savefig(f'feature_importance_{best_name}.png')
        plt.close()

        # f-string so the actual file name is printed, not the literal '{best_name}'
        print(f"\nCaracterísticas más importantes del mejor modelo guardadas en 'feature_importance_{best_name}.png'")
    except Exception as exc:
        # Best effort: the plot is optional, but report why it failed
        print("No se pudo generar el gráfico de importancia de características para este modelo.")
        print(f"  ({type(exc).__name__}: {exc})")

print("\nAnalisis y modelado completados exitosamente!")