In [3]:
import pandas as pd
import numpy as np
import os
import joblib
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import ElasticNet
!pip install catboost --break-system-packages
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler


import pandas as pd
import numpy as np
import os
import joblib
import time
!pip install torch --break-system-packages
import torch
from torch import nn
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_predict
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor, VotingRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.neural_network import MLPRegressor
import warnings


# Create the output directory up front: the main script below saves the
# fitted meta-model into it. exist_ok=True keeps re-runs from failing when
# the directory already exists.
os.makedirs('results_stacking', exist_ok=True)

# Function to load data
def load_data(train_path, test_path):
    """
    Load the training and test CSV files and split them into model inputs.

    The target column ('prezo_euros') and the bookkeeping columns ('id' and
    the 'Unnamed: 0' column that appears when a CSV was written together with
    its index) are removed from the features of BOTH frames, so that the
    training and test matrices end up with the same feature columns.

    Args:
        train_path: Path of the training CSV. Must contain 'prezo_euros'.
        test_path: Path of the test CSV. Must contain 'id'.

    Returns:
        Tuple ``(X_train, y_train, X_test, test_ids)``.

    Raises:
        KeyError: If 'prezo_euros' is missing from the training file or 'id'
            is missing from the test file.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Columns that identify rows rather than describe them. errors='ignore'
    # lets the same list be applied whether or not each column is present.
    non_feature_cols = ['id', 'Unnamed: 0']

    # For training data
    X_train = train_df.drop(['prezo_euros'] + non_feature_cols, axis=1, errors='ignore')
    y_train = train_df['prezo_euros']

    # For test data
    X_test = test_df.drop(non_feature_cols, axis=1, errors='ignore')
    test_ids = test_df['id']

    print(f"Training data shape: {X_train.shape}")
    print(f"Test data shape: {X_test.shape}")
    return X_train, y_train, X_test, test_ids

# Function to identify categorical features
def get_categorical_features(df):
    """
    Return the names of the columns of ``df`` that are treated as categorical.

    A column qualifies when any of the following holds:
      * its dtype compares equal to 'object',
      * its name is one of a fixed list of known categorical columns,
      * its name contains the substring 'tipo_' or 'color_'.

    The resulting list follows the column order of ``df`` and is also
    printed.
    """
    known_categoricals = (
        'tipo_edificacion', 'calidade_materiais',
        'cor_favorita_propietario', 'acceso_transporte_publico',
        'orientacion', 'eficiencia_enerxetica',
    )
    name_markers = ('tipo_', 'color_')

    def _is_categorical(column):
        # Checks run in the same order as a chained `or`: dtype first,
        # then the explicit name list, then the substring markers.
        if df[column].dtype == 'object':
            return True
        if column in known_categoricals:
            return True
        return any(marker in column for marker in name_markers)

    categorical_features = [column for column in df.columns if _is_categorical(column)]
    print(f"Categorical features: {categorical_features}")
    return categorical_features

# Function to load pretrained models
def load_models(model_paths):
    """
    Load pretrained models from the specified paths.

    Supported keys of ``model_paths`` are 'catboost', 'xgboost', 'lightgbm'
    and 'mlp'. An entry is skipped, with a printed warning, when its file
    does not exist or when no loader is available for its key; skipping is
    reported rather than silent so that a misconfigured path is visible.

    Args:
        model_paths: Mapping from model key to the path of the saved model.

    Returns:
        Dict mapping each successfully loaded key to its model object. The
        'mlp' entry holds whatever object was stored with joblib.
    """
    def _load_catboost(path):
        model = CatBoostRegressor()
        model.load_model(path)
        return model

    def _load_xgboost(path):
        model = XGBRegressor()
        model.load_model(path)
        return model

    def _load_lightgbm(path):
        return lgb.Booster(model_file=path)

    def _load_mlp(path):
        # SECURITY NOTE: joblib.load unpickles arbitrary objects; only use
        # it on model files that come from a trusted source.
        return joblib.load(path)

    # key -> (message printed when loading starts, loader callable);
    # insertion order fixes the order in which models are loaded.
    loaders = {
        'catboost': ("Loading CatBoost model...", _load_catboost),
        'xgboost': ("Loading XGBoost model...", _load_xgboost),
        'lightgbm': ("Loading LightGBM model...", _load_lightgbm),
        'mlp': ("Loading MLP model...", _load_mlp),
    }

    models = {}
    for name, (message, loader) in loaders.items():
        if name not in model_paths:
            continue
        path = model_paths[name]
        if not os.path.exists(path):
            print(f"Warning: model file for '{name}' not found, skipping: {path}")
            continue
        print(message)
        models[name] = loader(path)

    # Report configured entries that this function does not know how to load
    # instead of dropping them without a trace.
    for name in model_paths:
        if name not in loaders:
            print(f"Warning: no loader available for model type '{name}', "
                  f"skipping: {model_paths[name]}")

    print(f"Successfully loaded {len(models)} models")
    return models

# Function to make predictions with loaded models
def predict_with_models(models, X):
    """
    Run every loaded model on ``X`` and collect the outputs.

    The 'catboost', 'xgboost' and 'lightgbm' entries are called directly
    through their ``predict`` method. The 'mlp' entry is expected to be a
    mapping with a 'scaler' and a 'model'; ``X`` is passed through the
    scaler's ``transform`` before the model predicts.

    Returns a dict keyed by model name containing each model's predictions.
    Models absent from ``models`` are simply not present in the result.
    """
    outputs = {}

    # Models whose predict() consumes the raw features as they are.
    direct_models = (
        ('catboost', "Predicting with CatBoost..."),
        ('xgboost', "Predicting with XGBoost..."),
        ('lightgbm', "Predicting with LightGBM..."),
    )
    for key, message in direct_models:
        if key not in models:
            continue
        print(message)
        outputs[key] = models[key].predict(X)

    if 'mlp' in models:
        print("Predicting with MLP...")
        bundle = models['mlp']
        fitted_scaler = bundle['scaler']
        regressor = bundle['model']
        # The MLP needs the features scaled with the stored scaler.
        outputs['mlp'] = regressor.predict(fitted_scaler.transform(X))

    return outputs

# Function to generate level 1 features
def generate_level1_features(models, X_train, X_test, y_train, categorical_features=None,
                             n_splits=5, random_state=42):
    """
    Generate level 1 (out-of-fold) features for the stacking meta-model.

    The training rows are shuffled and cut into ``n_splits`` folds. For each
    fold and each model type present in ``models``, a fresh model is trained
    on the remaining folds and used to predict (a) the held-out fold, which
    fills the training-side features, and (b) the whole test set. The test
    predictions of all folds are averaged per model. The loaded models are
    only used as a source of hyperparameters (and, for 'torch_mlp', of the
    device); they are never used to predict here.

    Args:
        models: Dict of loaded models, keyed by 'catboost', 'xgboost',
            'lightgbm', 'mlp' and/or 'torch_mlp'.
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame).
        y_train: Training target (Series), positionally aligned to X_train.
        categorical_features: Optional list of categorical column names,
            passed to CatBoost only.
        n_splits: Number of folds (default 5, the previously fixed value).
        random_state: Seed for the fold shuffle. The default makes the fold
            assignment reproducible between runs; pass None for a different
            shuffle on every call. The PyTorch MLP training itself is not
            seeded by this argument.

    Returns:
        Tuple ``(train_preds_df, test_preds_df)`` with one column per model.

    Raises:
        ValueError: If ``n_splits`` is smaller than 2 or larger than the
            number of training rows.
    """
    print("Generating level 1 features for stacking...")

    n_samples = X_train.shape[0]
    if n_splits < 2 or n_splits > n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of training rows "
            f"({n_samples}), got {n_splits}"
        )

    # A private RandomState keeps the fold assignment reproducible without
    # touching (or depending on) NumPy's global random state.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(n_samples)
    fold_size = n_samples // n_splits

    train_preds_df = pd.DataFrame(index=range(n_samples))
    test_preds_all = {}

    for fold in range(n_splits):
        print(f"Processing fold {fold+1}/{n_splits}")

        # The last fold absorbs the remainder when n_samples is not a
        # multiple of n_splits.
        start_idx = fold * fold_size
        end_idx = (fold + 1) * fold_size if fold < n_splits - 1 else n_samples
        val_indices = indices[start_idx:end_idx]
        train_indices = np.setdiff1d(indices, val_indices)

        # Split data for this fold (positional indexing throughout)
        X_train_fold = X_train.iloc[train_indices]
        y_train_fold = y_train.iloc[train_indices]
        X_val_fold = X_train.iloc[val_indices]

        fold_preds = pd.DataFrame(index=val_indices)
        test_fold_preds = {}

        # Train and predict with CatBoost
        if 'catboost' in models:
            print("Training fold with CatBoost...")
            temp_model = CatBoostRegressor()
            temp_model.set_params(**models['catboost'].get_params())
            if categorical_features:
                # Pools carry the categorical column information to CatBoost.
                train_pool = Pool(X_train_fold, y_train_fold, cat_features=categorical_features)
                val_pool = Pool(X_val_fold, cat_features=categorical_features)
                test_pool = Pool(X_test, cat_features=categorical_features)
                temp_model.fit(train_pool, verbose=False)
                fold_preds['catboost'] = temp_model.predict(val_pool)
                test_fold_preds['catboost'] = temp_model.predict(test_pool)
            else:
                temp_model.fit(X_train_fold, y_train_fold, verbose=False)
                fold_preds['catboost'] = temp_model.predict(X_val_fold)
                test_fold_preds['catboost'] = temp_model.predict(X_test)

        # Train and predict with XGBoost
        if 'xgboost' in models:
            print("Training fold with XGBoost...")
            temp_model = XGBRegressor()
            temp_model.set_params(**models['xgboost'].get_params())
            temp_model.fit(X_train_fold, y_train_fold, verbose=False)
            fold_preds['xgboost'] = temp_model.predict(X_val_fold)
            test_fold_preds['xgboost'] = temp_model.predict(X_test)

        # Train and predict with LightGBM
        if 'lightgbm' in models:
            print("Training fold with LightGBM...")
            train_lgb = lgb.Dataset(X_train_fold, label=y_train_fold)
            # Reuse the loaded booster's params when it exposes them;
            # otherwise train with an empty parameter dict.
            params = models['lightgbm'].params if hasattr(models['lightgbm'], 'params') else {}
            temp_model = lgb.train(params, train_lgb, num_boost_round=100)
            fold_preds['lightgbm'] = temp_model.predict(X_val_fold)
            test_fold_preds['lightgbm'] = temp_model.predict(X_test)

        # Both neural models consume standardized features; the scaler is
        # fitted on the training part of the fold only, once per fold.
        if 'mlp' in models or 'torch_mlp' in models:
            scaler = StandardScaler()
            X_train_fold_scaled = scaler.fit_transform(X_train_fold)
            X_val_fold_scaled = scaler.transform(X_val_fold)
            X_test_scaled = scaler.transform(X_test)

        # Train and predict with the scikit-learn MLP
        if 'mlp' in models:
            print("Training fold with MLP...")
            mlp_params = models['mlp']['model'].get_params()
            temp_model = MLPRegressor(**mlp_params)
            temp_model.fit(X_train_fold_scaled, y_train_fold)
            fold_preds['mlp'] = temp_model.predict(X_val_fold_scaled)
            test_fold_preds['mlp'] = temp_model.predict(X_test_scaled)

        # Train and predict with the PyTorch MLP
        if 'torch_mlp' in models:
            print("Training fold with PyTorch MLP...")
            device = models['torch_mlp']['device']

            # NOTE(review): MLPRegressorTorch is not defined in this section;
            # it has to be provided elsewhere before this branch can run.
            input_dim = X_train_fold_scaled.shape[1]
            temp_model = MLPRegressorTorch(input_dim).to(device)

            # Convert to PyTorch tensors
            X_train_tensor = torch.tensor(X_train_fold_scaled, dtype=torch.float32).to(device)
            y_train_tensor = torch.tensor(y_train_fold.values.reshape(-1, 1), dtype=torch.float32).to(device)
            X_val_tensor = torch.tensor(X_val_fold_scaled, dtype=torch.float32).to(device)
            X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

            # L1 loss optimized with Adam over shuffled mini-batches
            criterion = nn.L1Loss()
            optimizer = torch.optim.Adam(temp_model.parameters(), lr=0.001)
            batch_size = 64
            epochs = 100

            for epoch in range(epochs):
                # New row order every epoch
                perm = torch.randperm(X_train_tensor.size(0))
                for start in range(0, X_train_tensor.size(0), batch_size):
                    batch_indices = perm[start:start + batch_size]
                    batch_X = X_train_tensor[batch_indices]
                    batch_y = y_train_tensor[batch_indices]

                    # Forward and backward pass
                    optimizer.zero_grad()
                    outputs = temp_model(batch_X)
                    loss = criterion(outputs, batch_y)
                    loss.backward()
                    optimizer.step()

            # Get predictions without tracking gradients
            temp_model.eval()
            with torch.no_grad():
                fold_preds['torch_mlp'] = temp_model(X_val_tensor).cpu().numpy().flatten()
                test_fold_preds['torch_mlp'] = temp_model(X_test_tensor).cpu().numpy().flatten()

        # Write the held-out predictions into their rows and keep this
        # fold's test predictions for averaging.
        for model_name in fold_preds.columns:
            train_preds_df.loc[val_indices, model_name] = fold_preds[model_name].values
            test_preds_all.setdefault(model_name, []).append(test_fold_preds[model_name])

    # Create test predictions by averaging the per-fold predictions
    test_preds_df = pd.DataFrame({
        model_name: np.mean(preds_list, axis=0)
        for model_name, preds_list in test_preds_all.items()
    })

    print(f"Generated level 1 features with shape: {train_preds_df.shape} (train), {test_preds_df.shape} (test)")
    return train_preds_df, test_preds_df



def train_meta_model(level1_train, y_train, level1_test):
    """
    Tune, evaluate and fit an ElasticNet meta-model on the level-1 features.

    Steps:
      1. Grid-search ``alpha`` and ``l1_ratio`` with 5-fold cross-validation,
         scored by negative mean absolute error.
      2. Refit the best estimator on an 80% split and print MAE, RMSE and R²
         measured on the remaining 20%.
      3. Refit the best estimator on all of the training data and predict
         the test set.

    Returns a tuple ``(best_model, test_preds)``.
    """
    print("Buscando hiperparámetros óptimos para ElasticNet (minimizando MAE)...")

    # Hyperparameter search over the regularization strength and L1/L2 mix
    search = GridSearchCV(
        estimator=ElasticNet(max_iter=10000, random_state=42),
        param_grid={
            'alpha':    [0.001, 0.01, 0.1, 1.0, 10.0],
            'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
        },
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(level1_train, y_train)

    best_model = search.best_estimator_
    # The scorer is the negated MAE, so flip the sign back for reporting.
    best_mae = -search.best_score_
    print(f"Mejores parámetros: {search.best_params_}")
    print(f"MAE CV medio óptimo: {best_mae:.4f}")

    # Extra hold-out check on a fixed 80/20 split
    feats_fit, feats_hold, target_fit, target_hold = train_test_split(
        level1_train, y_train, test_size=0.2, random_state=42
    )
    best_model.fit(feats_fit, target_fit)
    hold_preds = best_model.predict(feats_hold)

    val_mae = mean_absolute_error(target_hold, hold_preds)
    val_rmse = np.sqrt(mean_squared_error(target_hold, hold_preds))
    val_r2 = r2_score(target_hold, hold_preds)

    print("Métricas en conjunto de validación:")
    print(f"  MAE : {val_mae:.4f}")
    print(f"  RMSE: {val_rmse:.4f}")
    print(f"  R²  : {val_r2:.4f}")

    # The hold-out fit above overwrote the estimator's coefficients, so fit
    # once more on the complete training set before predicting the test set.
    best_model.fit(level1_train, y_train)
    test_preds = best_model.predict(level1_test)

    return best_model, test_preds


# Function to create submission file
def create_submission(test_ids, predictions, output_file):
    """
    Write the predictions to a CSV submission file.

    The file has two columns, 'id' (from ``test_ids``) and 'prezo_euros'
    (from ``predictions``), and is written without the DataFrame index.
    The DataFrame that was written is returned.
    """
    columns = {'id': test_ids, 'prezo_euros': predictions}
    frame = pd.DataFrame(columns)
    frame.to_csv(output_file, index=False)

    print(f"Submission saved to {output_file}")
    return frame

# Main execution
# Main execution: load data and pretrained models, build out-of-fold level 1
# features, fit the meta-model, then write the submission and the meta-model.
if __name__ == "__main__":
    start_time = time.time()

    # Load data
    train_path = 'train_processed.csv'
    test_path = 'test_processed.csv'
    X_train, y_train, X_test, test_ids = load_data(train_path, test_path)

    # Get categorical features
    categorical_features = get_categorical_features(X_train)

    # Define paths to pretrained models
    # NOTE(review): load_models has no loader for 'torch_mlp', so that entry
    # is currently ignored and the PyTorch MLP does not join the stack.
    model_paths = {
        'catboost': 'models_stacking/stacking_catboost_model.cbm',
        'xgboost': 'models_stacking/stacking_xgboost_model.json',
        #'lightgbm': 'models_stacking/stacking_lightgbm_model.txt',
        #'mlp': 'models_stacking/stacking_mlp_model.pkl',
        'torch_mlp': 'models_stacking/mlp_torch_model.pt',
    }

    # Load models
    models = load_models(model_paths)

    # Without any base model there are no level 1 columns to stack; stop
    # here with a clear message instead of failing later inside the
    # meta-model grid search on an empty feature matrix.
    if not models:
        raise RuntimeError(
            "No pretrained models could be loaded; check that the files "
            f"listed in model_paths exist: {list(model_paths.values())}"
        )

    # Generate level 1 features
    level1_train, level1_test = generate_level1_features(
        models, X_train, X_test, y_train, categorical_features
    )

    # Train meta-model
    meta_model, test_preds = train_meta_model(level1_train, y_train, level1_test)

    # Create submission file
    create_submission(test_ids, test_preds, 'submissions_final_stacking.csv')

    # Save meta-model
    joblib.dump(meta_model, 'results_stacking/meta_model.pkl')
    print("Meta-model saved to results_stacking/meta_model.pkl")

    # End timing
    end_time = time.time()
    print(f"Total execution time: {end_time - start_time:.2f} seconds")

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Using cached torch-2.7.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting filelock (from torch)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

ModuleNotFoundError: No module named 'torch'