In [2]:
import pandas as pd
import numpy as np
import os
import joblib
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
import lightgbm as lgbm
from sklearn.neural_network import MLPRegressor

# Make sure the output folder for stacking artefacts exists; an already
# existing folder is not an error.
os.makedirs(name='results_stacking', exist_ok=True)

# Function to load data
def load_data(train_path, test_path):
    """
    Load the training and test CSV files and split them into model inputs.

    Parameters:
    -----------
    train_path : str
        Path to the training CSV; it must contain a 'prezo_euros' column.
    test_path : str
        Path to the test CSV; it must contain an 'id' column.

    Returns:
    --------
    DataFrame, Series, DataFrame, Series
        Training features, training target ('prezo_euros'), test features
        and the test 'id' column, in that order.

    Raises:
    -------
    KeyError
        If 'prezo_euros' is missing from the training file or 'id' is
        missing from the test file.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Columns that identify rows rather than describe them. 'Unnamed: 0' is
    # the name read_csv assigns to an index column that was saved without a
    # header. It has to be removed from BOTH frames, otherwise the training
    # and test feature sets can end up with different columns.
    non_feature_cols = ['id', 'Unnamed: 0']

    # For training data: the target is split off from the features
    X_train = train_df.drop(['prezo_euros'] + non_feature_cols, axis=1, errors='ignore')
    y_train = train_df['prezo_euros']

    # For test data: ids are kept separately for the submission file
    X_test = test_df.drop(non_feature_cols, axis=1, errors='ignore')
    test_ids = test_df['id']

    print(f"Training data shape: {X_train.shape}")
    print(f"Test data shape: {X_test.shape}")

    return X_train, y_train, X_test, test_ids

# Function to load pretrained models
def load_models(model_paths):
    """
    Restore previously saved base models from disk.

    Only the keys 'catboost', 'xgboost', 'lightgbm' and 'mlp' are
    recognised; any other key in `model_paths` is ignored.

    Parameters:
    -----------
    model_paths : dict
        Maps a model name to the file its saved model lives in

    Returns:
    --------
    dict
        Maps each recognised model name that was present in `model_paths`
        to its loaded model object
    """
    def _restore_catboost(path):
        # CatBoost restores state into an existing estimator instance
        estimator = CatBoostRegressor()
        estimator.load_model(path)
        return estimator

    def _restore_xgboost(path):
        # Same pattern as CatBoost: create first, then load into it
        estimator = XGBRegressor()
        estimator.load_model(path)
        return estimator

    def _restore_lightgbm(path):
        # LightGBM is restored as a raw Booster, not a sklearn-style wrapper
        return lgbm.Booster(model_file=path)

    def _restore_mlp(path):
        # The MLP is stored as a joblib dump
        return joblib.load(path)

    # (key, progress message, loader), processed in this fixed order
    restore_plan = (
        ('catboost', "Loading CatBoost model...", _restore_catboost),
        ('xgboost', "Loading XGBoost model...", _restore_xgboost),
        ('lightgbm', "Loading LightGBM model...", _restore_lightgbm),
        ('mlp', "Loading MLP model...", _restore_mlp),
    )

    loaded = {}
    for key, message, restore in restore_plan:
        if key not in model_paths:
            continue
        print(message)
        loaded[key] = restore(model_paths[key])

    print(f"Successfully loaded {len(loaded)} models")
    return loaded

# Helper: refit one base model on a single fold and predict with it
def _fit_predict_base_model(name, model, X_fit, y_fit, X_val, X_test, categorical_features=None):
    """
    Fit a fresh copy of one base model on a fold and predict with it.

    Parameters:
    -----------
    name : str
        Model type: 'catboost', 'xgboost', 'lightgbm' or 'mlp'
    model : object
        Pretrained model; only used as a source of hyperparameters
    X_fit, y_fit : DataFrame, Series
        Rows the fold model is fitted on
    X_val : DataFrame
        Held-out rows of the fold
    X_test : DataFrame
        Test features
    categorical_features : list, optional
        Categorical feature names (only used by CatBoost)

    Returns:
    --------
    tuple or None
        (validation predictions, test predictions), or None when `name`
        is not one of the supported model types
    """
    val_data = X_val

    if name == 'catboost':
        fold_model = CatBoostRegressor()
        fold_model.set_params(**model.get_params())
        if categorical_features:
            # Categorical columns are declared through Pool objects
            fit_pool = Pool(X_fit, y_fit, cat_features=categorical_features)
            fold_model.fit(fit_pool, verbose=False)
            val_data = Pool(X_val, cat_features=categorical_features)
        else:
            fold_model.fit(X_fit, y_fit, verbose=False)

    elif name == 'xgboost':
        fold_model = XGBRegressor()
        fold_model.set_params(**model.get_params())
        fold_model.fit(X_fit, y_fit, verbose=False)

    elif name == 'lightgbm':
        # The loaded LightGBM model is a Booster; its settings are NOT
        # reused here. The fold model is trained with these fixed
        # parameters and 100 boosting rounds.
        params = {
            'objective': 'regression',
            'metric': 'mae',
            'verbosity': -1
        }
        fold_model = lgbm.train(params, lgbm.Dataset(X_fit, y_fit), num_boost_round=100)

    elif name == 'mlp':
        fold_model = MLPRegressor(random_state=42)
        if hasattr(model, 'get_params'):
            fold_model.set_params(**model.get_params())
        fold_model.fit(X_fit, y_fit)

    else:
        return None

    return fold_model.predict(val_data), fold_model.predict(X_test)


# Function to generate level 1 features
def generate_level1_features(models, X_train, X_test, y_train, categorical_features=None,
                             n_splits=5, random_state=None):
    """
    Generate level 1 (out-of-fold) features for the meta-model.

    The training rows are shuffled and cut into `n_splits` folds. For each
    fold, every base model is refitted from scratch on the remaining rows
    and predicts the held-out rows, so no training row is predicted by a
    model that saw it. The pretrained models in `models` are not used for
    prediction; they only provide hyperparameters via `get_params()`
    (the LightGBM entry does not even provide those, see the helper).
    Each fold model also predicts the test set, and those predictions are
    averaged over the folds.

    Parameters:
    -----------
    models : dict
        Dictionary of pretrained models keyed by 'catboost', 'xgboost',
        'lightgbm' or 'mlp'. Other keys are skipped with a printed notice.
    X_train : DataFrame
        Training features
    X_test : DataFrame
        Test features
    y_train : Series
        Target variable
    categorical_features : list, optional
        List of categorical feature names
    n_splits : int, optional
        Number of folds (default 5)
    random_state : int, optional
        Seed for the fold shuffle. With None (default) the global NumPy
        random state is used, so the folds differ between runs.

    Returns:
    --------
    DataFrame, DataFrame
        Level 1 features for training and test sets. The training frame is
        indexed by row position in `X_train` (0..n-1, sorted); both frames
        have one column per base model.

    Raises:
    -------
    ValueError
        If `n_splits` is smaller than 2 or larger than the number of
        training rows.
    """
    print("Generating level 1 features for stacking...")

    n_samples = X_train.shape[0]
    if n_splits < 2:
        raise ValueError(f"n_splits must be at least 2, got {n_splits}")
    if n_samples < n_splits:
        raise ValueError(
            f"Cannot split {n_samples} training rows into {n_splits} folds"
        )

    # Shuffle row positions once; folds are consecutive slices of this order
    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(random_state).shuffle(indices)

    fold_size = n_samples // n_splits
    fold_frames = []       # out-of-fold predictions, one frame per fold
    test_pred_sums = {}    # model name -> sum of test predictions over folds

    for fold in range(n_splits):
        print(f"Processing fold {fold+1}/{n_splits}")

        # The last fold absorbs the remainder when n_samples % n_splits != 0
        start_idx = fold * fold_size
        end_idx = (fold + 1) * fold_size if fold < n_splits - 1 else n_samples
        val_indices = indices[start_idx:end_idx]
        train_indices = np.setdiff1d(indices, val_indices)

        # Split data for this fold (positional indexing)
        X_train_fold = X_train.iloc[train_indices]
        y_train_fold = y_train.iloc[train_indices]
        X_val_fold = X_train.iloc[val_indices]

        # Indexed by row position so folds can be stitched back in order
        fold_preds = pd.DataFrame(index=val_indices)

        for name, model in models.items():
            result = _fit_predict_base_model(
                name, model, X_train_fold, y_train_fold, X_val_fold, X_test,
                categorical_features
            )
            if result is None:
                print(f"Skipping unsupported model type: {name}")
                continue

            val_pred, test_pred = result
            fold_preds[name] = val_pred
            if name in test_pred_sums:
                test_pred_sums[name] = test_pred_sums[name] + test_pred
            else:
                test_pred_sums[name] = test_pred

        fold_frames.append(fold_preds)

    # Stitch folds together once and restore the original row order
    train_preds = pd.concat(fold_frames).sort_index()

    # Average test predictions across folds
    test_preds = pd.DataFrame(
        {name: total / n_splits for name, total in test_pred_sums.items()}
    )

    print(f"Level 1 features shape - training: {train_preds.shape}, test: {test_preds.shape}")
    return train_preds, test_preds

# Function to train meta-model
def train_meta_model(level1_train, y_train, level1_test):
    """
    Fit a Ridge meta-model on level 1 features and predict the test set.

    The level 1 training data is split 80/20 (fixed seed 42). The Ridge
    model (alpha=1.0) is fitted on the 80% part only, scored on the 20%
    hold-out (MAE, RMSE and R2 are printed), and then used as-is to
    predict `level1_test`. The model coefficients are printed per input
    column, sorted from largest to smallest.

    Parameters:
    -----------
    level1_train : DataFrame
        Level 1 features for training
    y_train : Series
        Target variable
    level1_test : DataFrame
        Level 1 features for test set

    Returns:
    --------
    model, array
        The fitted Ridge model and its predictions for `level1_test`
    """
    print("Training meta-model...")

    # Hold out 20% of the rows to score the meta-model
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        level1_train,
        y_train,
        test_size=0.2,
        random_state=42,
    )

    # Ridge regression as the blender, fitted on the 80% part
    meta_model = Ridge(alpha=1.0)
    meta_model.fit(fit_X, fit_y)

    # Score on the hold-out rows
    holdout_preds = meta_model.predict(holdout_X)
    mae = mean_absolute_error(holdout_y, holdout_preds)
    rmse = np.sqrt(mean_squared_error(holdout_y, holdout_preds))
    r2 = r2_score(holdout_y, holdout_preds)

    print("Meta-model validation metrics:")
    print(f"  MAE: {mae:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  R2: {r2:.4f}")

    # Final predictions for the test set
    test_preds = meta_model.predict(level1_test)

    # Report the signed Ridge coefficient of every level 1 column
    coefficients = pd.DataFrame({
        'Feature': level1_train.columns,
        'Importance': meta_model.coef_,
    }).sort_values('Importance', ascending=False)
    print("\nMeta-model feature importances:")
    print(coefficients)

    return meta_model, test_preds

# Function to create submission file
def create_submission(test_ids, predictions, output_file):
    """
    Write the predictions to a two-column CSV submission file.

    The file has the columns 'id' and 'prezo_euros' and no index column.

    Parameters:
    -----------
    test_ids : Series
        IDs for test set
    predictions : array
        Predicted values, one per id
    output_file : str
        Path the CSV is written to
    """
    columns = {
        'id': test_ids,
        'prezo_euros': predictions,
    }
    pd.DataFrame(columns).to_csv(output_file, index=False)
    print(f"Submission file created: {output_file}")

In [4]:
# Wall-clock timer for the whole pipeline
start_time = time.time()

# Read the preprocessed training and test sets
train_path = 'train_processed.csv'
test_path = 'test_processed.csv'
X_train, y_train, X_test, test_ids = load_data(train_path, test_path)

# Columns that are categorical by name, regardless of their dtype
named_categoricals = [
    'tipo_edificacion', 'calidade_materiais',
    'cor_favorita_propietario', 'acceso_transporte_publico',
    'orientacion', 'eficiencia_enerxetica',
]

# A column counts as categorical when it holds strings, is listed above,
# or its name contains 'tipo_' or 'color_'
categorical_features = []
for col in X_train.columns:
    if (X_train[col].dtype == 'object'
            or col in named_categoricals
            or 'tipo_' in col
            or 'color_' in col):
        categorical_features.append(col)

print(f"Categorical features: {categorical_features}")

# Saved base models to stack; only CatBoost is enabled, the other entries
# are kept commented out so they can be switched on later
model_paths = {
    'catboost': 'results_initial_hyper/best_model_long.cbm',
    #'xgboost': 'models/xgboost_model.json',
    #'lightgbm': 'models/lightgbm_model.txt',
    #'mlp': 'models/mlp_model.pkl'
}

# Restore the base models from disk
models = load_models(model_paths)

# Build the meta-model inputs from the base models
level1_train, level1_test = generate_level1_features(
    models,
    X_train,
    X_test,
    y_train,
    categorical_features,
)

# Fit the meta-model and predict the test set
meta_model, test_preds = train_meta_model(level1_train, y_train, level1_test)

# Write the submission CSV
create_submission(test_ids, test_preds, 'submissions_final_stacking.csv')

# Persist the fitted meta-model
joblib.dump(meta_model, 'results_stacking/meta_model.pkl')
print("Meta-model saved to results_stacking/meta_model.pkl")

# Report the total run time
end_time = time.time()
print(f"Total execution time: {end_time - start_time:.2f} seconds")

Training data shape: (20000, 45)
Test data shape: (10000, 45)
Categorical features: ['tipo_Apartamento', 'tipo_Casa', 'tipo_Chalet adosado', 'color_Amarelo', 'color_Azul', 'color_Branco', 'color_Negro', 'color_Verde', 'color_Vermello', 'tipo_Apartamento.1', 'tipo_Casa.1', 'tipo_Chalet adosado.1', 'color_Amarelo.1', 'color_Azul.1', 'color_Branco.1', 'color_Negro.1', 'color_Verde.1', 'color_Vermello.1']
Loading CatBoost model...
Successfully loaded 1 models
Generating level 1 features for stacking...
Processing fold 1/5
