# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Make sure the output folders (saved model, plots) exist before anything runs
for output_dir in ('models_stacking', 'results_randomforest'):
    os.makedirs(output_dir, exist_ok=True)

# Function to load and prepare data (reuse from LightGBM script)
def load_data(file_path):
    """Read the CSV at *file_path* and split it into features and target.

    Prints the total number of missing values, the detected categorical
    columns, the dataset shape and the feature names.

    Returns a tuple ``(X, y, categorical_features)`` where ``X`` is the frame
    without the 'prezo_euros' and 'id' columns (those absent are ignored),
    ``y`` is the 'prezo_euros' column and ``categorical_features`` is the list
    of column names flagged as categorical.
    """
    df = pd.read_csv(file_path)
    total_missing = df.isnull().sum().sum()
    print(f"Missing values in dataset: {total_missing}")

    # A column counts as categorical when it has object dtype, carries one of
    # the known prefixes, or is one of the explicitly listed names.
    known_prefixes = ('tipo_', 'color_')
    known_names = {
        'tipo_edificacion',
        'calidade_materiais',
        'cor_favorita_propietario',
        'acceso_transporte_publico',
        'orientacion',
        'eficiencia_enerxetica',
    }
    categorical_features = [
        name
        for name, dtype in df.dtypes.items()
        if dtype == 'object' or name.startswith(known_prefixes) or name in known_names
    ]
    print(f"Detected categorical features: {categorical_features}")

    # errors='ignore' lets files without an 'id' column load unchanged
    X = df.drop(columns=['prezo_euros', 'id'], errors='ignore')
    y = df['prezo_euros']
    print(f"Dataset shape: {df.shape}")
    print(f"Features used: {X.columns.tolist()}")
    return X, y, categorical_features

# Function to train RandomForest model for stacking
def train_randomforest_model(X, y, random_state=42):
    """Tune a RandomForestRegressor with a grid search and return the best one.

    The data is split 90/10 into train and test parts. A 5-fold GridSearchCV
    scored by negative MAE is run on the training part, and the best estimator
    is evaluated on the held-out part (MAE, RMSE and R2 are printed). A bar
    plot of the 20 largest feature importances is saved to
    'results_randomforest/rf_feature_importance.png'.

    Args:
        X: Feature table; its ``columns`` are used as feature names in the plot.
        y: Target values aligned with ``X``.
        random_state: Seed used for both the split and the forest.

    Returns:
        The ``best_estimator_`` found by the grid search.
    """
    # Hold out 10% of the rows for the final evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state
    )
    print("\n=== Training RandomForest Regressor ===")

    # Parameter grid for hyperparameter tuning.
    # max_features: 1.0 (all features) replaces the former 'auto' option, which
    # meant the same thing for regressors but was deprecated in scikit-learn 1.1
    # and removed in 1.3, where it makes every candidate using it fail to fit.
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': [1.0, 'sqrt']
    }

    rf = RandomForestRegressor(random_state=random_state, n_jobs=-1)
    print("Performing GridSearchCV for RandomForest...")
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    best_rf = grid_search.best_estimator_

    # Evaluate on the held-out test set
    y_pred = best_rf.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print("\nRandomForest Performance:")
    print(f"  MAE: {mae:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  R2: {r2:.4f}")

    # Feature importance, sorted from most to least important
    importances = best_rf.feature_importances_
    features = X.columns
    importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
    importance_df = importance_df.sort_values('Importance', ascending=False)

    # Only the 20 most important features are plotted
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
    plt.title('Top 20 Feature Importances (RandomForest)', fontsize=15)
    plt.tight_layout()
    plt.savefig('results_randomforest/rf_feature_importance.png')
    print("Feature importance plot saved to 'results_randomforest/rf_feature_importance.png'")

    return best_rf

# Main execution
if __name__ == "__main__":
    # Wall-clock timing of the whole pipeline
    start_time = time.time()

    # load_data also returns the categorical column list; it is not used below
    X, y, cat_features = load_data('train_processed.csv')

    # Grid-search, evaluate and keep the best forest
    rf_model = train_randomforest_model(X, y)

    # Persist the fitted model so the stacking step can load it later
    model_path = 'models_stacking/randomforest_model.pkl'
    joblib.dump(rf_model, model_path)
    print(f"Model saved as '{model_path}'")

    end_time = time.time()
    elapsed = end_time - start_time
    print(f"Total execution time: {elapsed:.2f} seconds")