# In [None]:
import pandas as pd
import numpy as np
import os
import joblib
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# Create the results directory at import time. The main block later saves the
# meta-model under 'results_stacking/'; exist_ok=True makes this a no-op when
# the directory is already there.
os.makedirs('results_stacking', exist_ok=True)

# Function to load data
def load_data(train_path, test_path):
    """
    Load the training and test CSV files and split them into model inputs.

    The same set of non-feature columns is removed from both frames so that
    the training and test feature matrices end up with matching columns.

    Args:
        train_path: Path to the training CSV. Must contain 'prezo_euros'.
        test_path: Path to the test CSV. Must contain 'id'.

    Returns:
        Tuple ``(X_train, y_train, X_test, test_ids)`` where the X frames hold
        the features, ``y_train`` is the 'prezo_euros' column of the training
        data and ``test_ids`` is the 'id' column of the test data.

    Raises:
        KeyError: If 'prezo_euros' is missing from the training file or 'id'
            is missing from the test file.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # 'Unnamed: 0' is typically a row index that was written out with the CSV.
    # It must be dropped from BOTH frames: keeping it only in the training
    # data would turn it into a feature that the test data does not have.
    # errors='ignore' makes every entry optional, so one list serves both.
    non_feature_cols = ['prezo_euros', 'id', 'Unnamed: 0']

    # For training data
    X_train = train_df.drop(non_feature_cols, axis=1, errors='ignore')
    y_train = train_df['prezo_euros']

    # For test data
    X_test = test_df.drop(non_feature_cols, axis=1, errors='ignore')
    test_ids = test_df['id']

    print(f"Training data shape: {X_train.shape}")
    print(f"Test data shape: {X_test.shape}")
    return X_train, y_train, X_test, test_ids

# Function to identify categorical features
def get_categorical_features(df):
    """
    Collect the names of the columns that are treated as categorical.

    A column qualifies when its dtype is 'object', when its name is one of
    the known categorical columns, or when its name contains 'tipo_' or
    'color_'. The selected names are printed and returned in column order.
    """
    known_names = ['tipo_edificacion', 'calidade_materiais',
                   'cor_favorita_propietario', 'acceso_transporte_publico',
                   'orientacion', 'eficiencia_enerxetica']
    name_markers = ('tipo_', 'color_')

    def looks_categorical(column):
        # Checks run in the same order as a plain `or` chain would evaluate.
        if df[column].dtype == 'object':
            return True
        if column in known_names:
            return True
        return any(marker in column for marker in name_markers)

    selected = [column for column in df.columns if looks_categorical(column)]
    print(f"Categorical features: {selected}")
    return selected

# Function to load pretrained models
def load_models(model_paths):
    """
    Load pretrained models from specified paths.

    Args:
        model_paths: Mapping from model key to file path. Recognised keys are
            'catboost', 'xgboost', 'lightgbm' and 'mlp'; any other key is
            ignored.

    Returns:
        Dict mapping each recognised key whose file exists to the loaded
        model. Keys whose file is missing are reported on stdout and left
        out, so callers should check which models actually came back.
    """
    def _load_catboost(path):
        model = CatBoostRegressor()
        model.load_model(path)
        return model

    def _load_xgboost(path):
        model = XGBRegressor()
        model.load_model(path)
        return model

    def _load_lightgbm(path):
        return lgb.Booster(model_file=path)

    def _load_mlp(path):
        # The MLP artifact bundles the model together with its scaler.
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only load artifacts from a trusted source.
        return joblib.load(path)

    # Order matters only for the log output and the key order of the result.
    loaders = (
        ('catboost', 'CatBoost', _load_catboost),
        ('xgboost', 'XGBoost', _load_xgboost),
        ('lightgbm', 'LightGBM', _load_lightgbm),
        ('mlp', 'MLP', _load_mlp),
    )

    models = {}
    for key, label, loader in loaders:
        path = model_paths.get(key)
        if path is None:
            # Model not requested.
            continue
        if not os.path.exists(path):
            # Stay best-effort, but say so: a typo in a path would otherwise
            # only show up as a smaller count in the summary line.
            print(f"Skipping {label} model: file not found at {path}")
            continue
        print(f"Loading {label} model...")
        models[key] = loader(path)

    print(f"Successfully loaded {len(models)} models")
    return models

# Function to make predictions with loaded models
def predict_with_models(models, X):
    """
    Run every available model on X and collect the predictions by model key.

    'catboost', 'xgboost' and 'lightgbm' entries predict directly on X. The
    'mlp' entry is a mapping holding a 'scaler' and a 'model'; X is passed
    through the scaler before the model predicts.
    """
    outputs = {}

    # Models that are called on the unmodified features, in reporting order.
    direct_predictors = (
        ('catboost', "Predicting with CatBoost..."),
        ('xgboost', "Predicting with XGBoost..."),
        ('lightgbm', "Predicting with LightGBM..."),
    )
    for key, message in direct_predictors:
        if key not in models:
            continue
        print(message)
        outputs[key] = models[key].predict(X)

    if 'mlp' not in models:
        return outputs

    print("Predicting with MLP...")
    # Unpack the scaler and the network from the stored MLP bundle.
    feature_scaler = models['mlp']['scaler']
    network = models['mlp']['model']
    # The network expects scaled inputs.
    outputs['mlp'] = network.predict(feature_scaler.transform(X))
    return outputs

# Function to generate level 1 features
def generate_level1_features(models, X_train, X_test, y_train, categorical_features=None,
                             n_splits=5, random_state=42):
    """
    Generate level 1 features for the meta-model using cross-validation.

    For every fold, each available base model is re-created with the
    parameters of its pretrained counterpart, fitted on the remaining folds
    and used to predict the held-out fold (out-of-fold predictions, which
    avoids leaking the target into the meta-model's training features) and
    the full test set. Test predictions are averaged over the folds.

    Args:
        models: Dict of pretrained models keyed by 'catboost', 'xgboost',
            'lightgbm' and/or 'mlp'; only their parameters are reused.
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame).
        y_train: Training target (Series, positionally aligned with X_train).
        categorical_features: Optional list of categorical column names,
            passed to CatBoost only.
        n_splits: Number of cross-validation folds.
        random_state: Seed for the fold shuffle. Pass None for a fresh
            random split on every call.

    Returns:
        Tuple ``(train_preds_df, test_preds_df)`` with one column per model.

    Raises:
        ValueError: If ``n_splits`` is below 2 or exceeds the sample count.
    """
    print("Generating level 1 features for stacking...")

    n_samples = X_train.shape[0]
    if n_splits < 2:
        raise ValueError(f"n_splits must be at least 2, got {n_splits}")
    if n_samples < n_splits:
        raise ValueError(
            f"Cannot split {n_samples} samples into {n_splits} folds"
        )

    # A local, seeded generator makes the folds reproducible and leaves the
    # global np.random state untouched.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(n_samples)
    fold_size = n_samples // n_splits

    train_preds_df = pd.DataFrame(index=range(n_samples))
    test_preds_all = {}

    def _catboost_trainer(reference, X_fit, y_fit, X_val, X_new):
        # Only CatBoost consumes the categorical feature list.
        return _fit_predict_catboost(
            reference, X_fit, y_fit, X_val, X_new, categorical_features
        )

    trainers = (
        ('catboost', "Training fold with CatBoost...", _catboost_trainer),
        ('xgboost', "Training fold with XGBoost...", _fit_predict_xgboost),
        ('lightgbm', "Training fold with LightGBM...", _fit_predict_lightgbm),
        ('mlp', "Training fold with MLP...", _fit_predict_mlp),
    )

    for fold in range(n_splits):
        print(f"Processing fold {fold+1}/{n_splits}")

        # The last fold absorbs the remainder when n_samples % n_splits != 0.
        start_idx = fold * fold_size
        end_idx = (fold + 1) * fold_size if fold < n_splits - 1 else n_samples
        val_indices = indices[start_idx:end_idx]
        train_indices = np.setdiff1d(indices, val_indices)

        X_train_fold = X_train.iloc[train_indices]
        y_train_fold = y_train.iloc[train_indices]
        X_val_fold = X_train.iloc[val_indices]

        for model_name, message, fit_predict in trainers:
            if model_name not in models:
                continue
            print(message)
            val_pred, test_pred = fit_predict(
                models[model_name], X_train_fold, y_train_fold, X_val_fold, X_test
            )
            # Out-of-fold predictions land on the rows of the held-out fold.
            train_preds_df.loc[val_indices, model_name] = np.asarray(val_pred)
            test_preds_all.setdefault(model_name, []).append(test_pred)

    # Create test predictions by averaging fold predictions
    test_preds_df = pd.DataFrame()
    for model_name, preds_list in test_preds_all.items():
        test_preds_df[model_name] = np.mean(preds_list, axis=0)

    print(f"Generated level 1 features with shape: {train_preds_df.shape} (train), {test_preds_df.shape} (test)")
    return train_preds_df, test_preds_df


def _fit_predict_catboost(reference, X_fit, y_fit, X_val, X_new, categorical_features):
    """Fit a CatBoost clone of `reference` and predict the validation and test sets."""
    temp_model = CatBoostRegressor()
    temp_model.set_params(**reference.get_params())
    if categorical_features:
        # Pools carry the categorical column information for fit and predict.
        train_pool = Pool(X_fit, y_fit, cat_features=categorical_features)
        val_pool = Pool(X_val, cat_features=categorical_features)
        test_pool = Pool(X_new, cat_features=categorical_features)
        temp_model.fit(train_pool, verbose=False)
        return temp_model.predict(val_pool), temp_model.predict(test_pool)
    temp_model.fit(X_fit, y_fit, verbose=False)
    return temp_model.predict(X_val), temp_model.predict(X_new)


def _fit_predict_xgboost(reference, X_fit, y_fit, X_val, X_new):
    """Fit an XGBoost clone of `reference` and predict the validation and test sets."""
    temp_model = XGBRegressor()
    temp_model.set_params(**reference.get_params())
    temp_model.fit(X_fit, y_fit, verbose=False)
    return temp_model.predict(X_val), temp_model.predict(X_new)


def _fit_predict_lightgbm(reference, X_fit, y_fit, X_val, X_new):
    """Train a LightGBM booster with `reference`'s params and predict both sets."""
    train_lgb = lgb.Dataset(X_fit, label=y_fit)
    # Fall back to library defaults when the loaded booster exposes no params.
    params = getattr(reference, 'params', None) or {}
    # NOTE(review): the round count is fixed at 100 rather than taken from the
    # pretrained booster - confirm this is intended.
    temp_model = lgb.train(params, train_lgb, num_boost_round=100)
    return temp_model.predict(X_val), temp_model.predict(X_new)


def _fit_predict_mlp(reference, X_fit, y_fit, X_val, X_new):
    """Fit a fresh scaler and MLP using the stored MLP's params and predict both sets."""
    mlp_params = reference['model'].get_params()

    # The scaler is fitted on the fold's training part only, so the held-out
    # fold does not influence the scaling statistics.
    scaler = StandardScaler()
    X_fit_scaled = scaler.fit_transform(X_fit)
    X_val_scaled = scaler.transform(X_val)
    X_new_scaled = scaler.transform(X_new)

    temp_model = MLPRegressor(**mlp_params)
    temp_model.fit(X_fit_scaled, y_fit)
    return temp_model.predict(X_val_scaled), temp_model.predict(X_new_scaled)

# Function to train meta-model
def train_meta_model(level1_train, y_train, level1_test):
    """
    Fit the gradient-boosting meta-model on the level 1 features.

    The model is first fitted on 80% of the level 1 training data and scored
    on the remaining 20% (MAE, RMSE and R² are printed). It is then refitted
    on all of the level 1 training data and used to predict the level 1 test
    features.

    Returns:
        Tuple ``(meta_model, test_preds)``.
    """
    print("Training meta-model...")

    # Hold out a fixed 20% slice of the level 1 data for validation.
    fit_features, holdout_features, fit_target, holdout_target = train_test_split(
        level1_train, y_train, test_size=0.2, random_state=42
    )

    meta_model = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.05,
        max_depth=4,
        random_state=42,
    )
    meta_model.fit(fit_features, fit_target)

    # Score the model on the held-out slice.
    holdout_preds = meta_model.predict(holdout_features)
    val_mae = mean_absolute_error(holdout_target, holdout_preds)
    val_rmse = np.sqrt(mean_squared_error(holdout_target, holdout_preds))
    val_r2 = r2_score(holdout_target, holdout_preds)

    print("Meta-model validation metrics:")
    print(f"  MAE: {val_mae:.2f}")
    print(f"  RMSE: {val_rmse:.2f}")
    print(f"  R²: {val_r2:.4f}")

    # Refit on every available row before predicting the test set.
    meta_model.fit(level1_train, y_train)
    return meta_model, meta_model.predict(level1_test)

# Function to create submission file
def create_submission(test_ids, predictions, output_file):
    """
    Build the submission table and write it to `output_file` as CSV.

    The table has an 'id' column taken from `test_ids` and a 'prezo_euros'
    column taken from `predictions`; it is written without the index and
    also returned to the caller.
    """
    columns = {'id': test_ids, 'prezo_euros': predictions}
    submission_df = pd.DataFrame(columns)

    submission_df.to_csv(output_file, index=False)
    print(f"Submission saved to {output_file}")
    return submission_df

# Main execution
if __name__ == "__main__":
    # Wall-clock timer for the whole pipeline run.
    start_time = time.time()

    # Read the preprocessed data and work out which columns are categorical.
    train_path, test_path = 'train_processed.csv', 'test_processed.csv'
    X_train, y_train, X_test, test_ids = load_data(train_path, test_path)
    categorical_features = get_categorical_features(X_train)

    # Locations of the pretrained base models.
    model_paths = dict(
        catboost='results_initial_hyper/best_model_long.cbm',
        xgboost='models/xgboost_model.json',
        lightgbm='models/lightgbm_model.txt',
        mlp='models/mlp_model.pkl',
    )
    models = load_models(model_paths)

    # Level 1: cross-validated base-model predictions for train and test.
    level1_train, level1_test = generate_level1_features(
        models, X_train, X_test, y_train, categorical_features
    )

    # Level 2: meta-model fitted on the level 1 features.
    meta_model, test_preds = train_meta_model(level1_train, y_train, level1_test)

    # Write the submission, then persist the fitted meta-model.
    create_submission(test_ids, test_preds, 'submissions_final_stacking.csv')

    meta_model_path = 'results_stacking/meta_model.pkl'
    joblib.dump(meta_model, meta_model_path)
    print(f"Meta-model saved to {meta_model_path}")

    # Report the elapsed time.
    end_time = time.time()
    print(f"Total execution time: {end_time - start_time:.2f} seconds")