In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Make sure the folders for saved models and result artifacts exist
for output_dir in ('models', 'results_lightgbm'):
    os.makedirs(output_dir, exist_ok=True)

# Default figure size and colour palette used by every plot
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_palette("viridis")

# Function to load and prepare data
def load_data(file_path):
    """Read the dataset CSV and split it into features, target and categorical names.

    Args:
        file_path: Path of the CSV file; it must contain a 'prezo_euros' column.

    Returns:
        A tuple ``(X, y, categorical_features)`` where ``X`` is the frame
        without the 'prezo_euros' and 'id' columns, ``y`` is the
        'prezo_euros' column and ``categorical_features`` lists the columns
        of ``X`` that are treated as categorical.

    Raises:
        KeyError: If the CSV has no 'prezo_euros' column.
    """
    df = pd.read_csv(file_path)
    print(f"Valores faltantes en el dataset: {df.isnull().sum().sum()}")

    # Separate features and target first, so that categorical detection only
    # ever reports columns that are really part of the feature matrix
    # (a string-typed 'id' column must not show up in the returned list).
    X = df.drop(['prezo_euros', 'id'], axis=1, errors='ignore')
    y = df['prezo_euros']

    # Columns that are categorical by name even when stored as numbers
    known_categorical = {
        'tipo_edificacion', 'calidade_materiais',
        'cor_favorita_propietario', 'acceso_transporte_publico',
        'orientacion', 'eficiencia_enerxetica',
    }

    # A feature is categorical when it is text-typed, explicitly listed above,
    # or carries one of the 'tipo_' / 'color_' name markers.
    categorical_features = [
        col for col in X.columns
        if X[col].dtype == 'object'
        or col in known_categorical
        or 'tipo_' in col
        or 'color_' in col
    ]

    print(f"Características categóricas detectadas: {categorical_features}")
    print(f"Forma del dataset: {df.shape}")
    print(f"Features incluidas: {X.columns.tolist()}")

    return X, y, categorical_features

# Function to train LightGBM model
def train_lightgbm_model(X, y, categorical_features=None, random_state=42,
                         plot_importance=False):
    """Tune, train and evaluate a LightGBM regression model.

    The data is split 90/10 into train and test parts, a small hyperparameter
    grid is searched with 5-fold cross-validation on the train part, and a
    final booster is trained with the best combination using early stopping.
    MAE, RMSE and R2 on the test part are printed.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        categorical_features: Optional list of column names of ``X`` to treat
            as categorical. Names not present in ``X`` are ignored. When
            empty or ``None``, LightGBM's 'auto' detection is used.
        random_state: Seed used for the split and for the models.
        plot_importance: When True, save a feature-importance bar chart to
            'results_lightgbm/lightgbm_feature_importance.png'.

    Returns:
        The booster returned by ``lgb.train``.
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state
    )

    print("\n=== Entrenando modelo LightGBM ===")

    # Resolve categorical column names to positional indices once, so the
    # grid search and the final training use exactly the same specification.
    if categorical_features:
        wanted = set(categorical_features)
        cat_indices = [i for i, col in enumerate(X.columns) if col in wanted]
    else:
        cat_indices = 'auto'

    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_indices)
    # NOTE(review): the early-stopping validation set is the same 10% split
    # that is used for the reported metrics below, so those metrics are
    # optimistic. Consider a separate validation split.
    valid_data = lgb.Dataset(X_test, label=y_test, categorical_feature=cat_indices,
                             reference=train_data)

    # Base parameters for the final booster (native LightGBM names)
    params = {
        'objective': 'regression',
        'metric': 'mae',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'random_state': random_state
    }

    # Simplified grid for faster execution
    simple_param_grid = {
        'num_leaves': [31, 63],
        'learning_rate': [0.05, 0.1],
        'n_estimators': [200, 500],
        'min_child_samples': [10, 20],
        'subsample': [0.8],
        'colsample_bytree': [1.0]
    }

    lgb_model = lgb.LGBMRegressor(objective='regression', random_state=random_state, verbose=-1)

    print("Realizando búsqueda de hiperparámetros con GridSearchCV...")
    grid_search = GridSearchCV(
        estimator=lgb_model,
        param_grid=simple_param_grid,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train, eval_metric='mae', categorical_feature=cat_indices)

    print(f"Mejores hiperparámetros: {grid_search.best_params_}")

    # Translate the tuned scikit-learn style names to the native names used in
    # `params`. LightGBM treats 'subsample' / 'colsample_bytree' as aliases of
    # 'bagging_fraction' / 'feature_fraction'; merging them untranslated would
    # leave both spellings in the dict instead of overriding the base value.
    tuned = dict(grid_search.best_params_)
    # The number of trees is passed as num_boost_round, not as a parameter.
    num_boost_round = tuned.pop('n_estimators', 500)
    for sklearn_name, native_name in (('subsample', 'bagging_fraction'),
                                      ('colsample_bytree', 'feature_fraction')):
        if sklearn_name in tuned:
            tuned[native_name] = tuned.pop(sklearn_name)

    # Train final model with best parameters
    final_params = {**params, **tuned}
    best_model = lgb.train(
        final_params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True)]
    )

    # Evaluate model
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print("Métricas del modelo LightGBM:")
    print(f"  MAE: {mae:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  R2: {r2:.4f}")

    # Only announce the chart when it has actually been written.
    if plot_importance:
        _plot_feature_importance(best_model)
        print("\nAnálisis de características completado - Ver 'results_lightgbm/lightgbm_feature_importance.png'")

    return best_model


def _plot_feature_importance(model, output_path='results_lightgbm/lightgbm_feature_importance.png'):
    """Save a bar chart of the 20 most important features of a trained booster."""
    importance_df = pd.DataFrame({
        'Feature': model.feature_name(),
        'Importance': model.feature_importance(),
    }).sort_values('Importance', ascending=False)

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
    plt.title('Top 20 características más importantes (LightGBM)', fontsize=15)
    plt.tight_layout()
    plt.savefig(output_path)
    # Release the figure so repeated calls do not accumulate open figures
    plt.close()

# Main execution
if __name__ == "__main__":
    # Script entry point: load the processed dataset, train the model,
    # persist it and report the total run time.
    start_time = time.time()

    # Load data
    X, y, cat_features = load_data('train_processed.csv')

    # Train LightGBM model
    model = train_lightgbm_model(X, y, cat_features)

    # Save model. The target directory is created here because it is not one
    # of the folders prepared at the top of the file, and saving into a
    # missing directory fails.
    model_path = 'models_stacking/stacking_lightgbm_model.txt'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save_model(model_path)
    # Report the path that was really written
    print(f"\nModelo guardado como '{model_path}'")

    end_time = time.time()
    print(f"Tiempo total de ejecución: {end_time - start_time:.2f} segundos")

Valores faltantes en el dataset: 0
Características categóricas detectadas: ['tipo_Apartamento', 'tipo_Casa', 'tipo_Chalet adosado', 'color_Amarelo', 'color_Azul', 'color_Branco', 'color_Negro', 'color_Verde', 'color_Vermello', 'tipo_Apartamento.1', 'tipo_Casa.1', 'tipo_Chalet adosado.1', 'color_Amarelo.1', 'color_Azul.1', 'color_Branco.1', 'color_Negro.1', 'color_Verde.1', 'color_Vermello.1']
Forma del dataset: (20000, 47)
Features incluidas: ['superficie_interior_m2', 'superficie_exterior_m2', 'numero_habitacions', 'numero_banos', 'ano_construccion', 'lonxitude', 'latitude', 'temperatura_media_mes_construccion', 'distancia_centro_km', 'distancia_escola_km', 'indice_criminalidade', 'numero_arboles_xardin', 'edad_vivienda', 'superficie_por_habitacion', 'superficie_total', 'ratio_interior_exterior', 'densidad_banos', 'densidad_habitaciones', 'dist_coruna', 'dist_vigo', 'dist_santiago', 'calidad_edad', 'banos_por_habitacion', 'orientacion_valor', 'eficiencia_valor', 'calidade_valor', 'tra