# Upgrading our random forest 
Here are the main upgrades:

Automated Hyperparameter Tuning: Instead of using fixed settings for the Random Forest, we'll use RandomizedSearchCV. This technique intelligently searches for the best combination of model parameters (n_estimators, max_depth, etc.), which almost always results in a more accurate model.

Robust Cross-Validation: The search process uses 5-fold cross-validation (cv=5), meaning it trains and tests the model 5 times on different subsets of your data. This ensures the model's performance is stable and not just a result of a "lucky" train-test split.

Feature Importance Analysis: After finding the best model, we'll extract and plot the feature importances. This tells you exactly which input columns (e.g., dam_type, year_completed) are the most influential in predicting the target, which is crucial for understanding the why behind the model's logic. 🧐

Enhanced Reporting: The most important features and the best-found hyperparameters are now saved in the output files for easy reference

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
import os
import shutil

# --- Configuration ---
DATA_FILE = '../regression_data.csv' # Make sure this path is correct

INPUT_COLUMNS = [
    'state', 'downstream_hazard_potential', 'owner_type', 'dam_type',
    'primary_purpose_s', 'eap', 'year_completed', 'latitude', 'longitude', 'year_modified'
]

NEW_TARGET_COLUMNS = [
    'dam_height', 'max_storage_ac_ft', 'surface_area_acres',
    'incident_date_year', 'incident_date_month', 'incident_date_day',
    'incident_time_hour', 'number_of_people_evacuated',
    'number_of_habitable_structures_evacuated',
    'number_of_habitable_structures_flooded',
    'volume_released_at_failure_ac_ft', 'incident_duration'
]

# --- Main Processing Function ---
def train_and_evaluate_model(X, y, target_name, metrics_list):
    """
    Finds the best RandomForestRegressor using RandomizedSearchCV, evaluates it,
    and analyzes feature importances.
    """
    print(f"--- Processing target: {target_name} | Task Type: REGRESSION (Random Forest with Tuning) ---")

    # --- Preprocessing ---
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ],
        remainder='passthrough',
        verbose_feature_names_out=False # Keeps original feature names cleaner
    )

    # --- Train-Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Apply preprocessing
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)
    # Get the final feature names after one-hot encoding for importance plotting
    processed_feature_names = preprocessor.get_feature_names_out()


    # --- UPGRADE: Hyperparameter Tuning with RandomizedSearchCV ---
    print("Searching for the best hyperparameters...")
    
    # Define the parameter distribution to search over
    param_dist = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', 1.0] # Use '1.0' for all features (equivalent to old 'auto')
    }

    # Instantiate the base model
    rf = RandomForestRegressor(random_state=42)

    # Set up the Randomized Search with 5-fold cross-validation
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=50,  # Number of parameter settings that are sampled. Increase for better results, decrease for speed.
        cv=5,       # 5-fold cross-validation
        verbose=0,  # Set to 1 or 2 to see the search progress
        random_state=42,
        n_jobs=-1   # Use all available CPU cores
    )

    # Execute the search on the training data
    random_search.fit(X_train_processed, y_train)

    # The best model found by the search
    best_model = random_search.best_estimator_
    
    print(f"Best hyperparameters found: {random_search.best_params_}")
    print("Training complete with the best model.")

    # --- Evaluation & Reporting using the best model ---
    y_pred = best_model.predict(X_test_processed)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    evs = explained_variance_score(y_test, y_pred)
    n, p = X_test_processed.shape
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1) if (n - p - 1) > 0 else np.nan
    
    y_test_non_zero_mask = y_test != 0
    if np.any(y_test_non_zero_mask):
        mape = np.mean(np.abs((y_test[y_test_non_zero_mask] - y_pred[y_test_non_zero_mask]) / y_test[y_test_non_zero_mask])) * 100
    else:
        mape = np.nan

    metrics_result = {
        'Model Output': target_name,
        'Task Type': 'Regression (Tuned RF)',
        'MAE': mae, 'MSE': mse, 'R2_Score': r2, 'Adjusted_R2_Score': adj_r2,
        'Explained_Variance_Score': evs, 'MAPE (%)': mape
    }
    metrics_list.append(metrics_result)
    print(f"✅ Performance metrics for '{target_name}' collected.")

    # --- UPGRADE: Feature Importance Analysis ---
    importances = best_model.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': processed_feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # Plot top 15 features
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')
    plt.title(f'Top 15 Feature Importances for {target_name}')
    plt.tight_layout()
    plot_filename_importance = f'plot_importance_{target_name}.svg'
    plt.savefig(plot_filename_importance, format='svg')
    plt.close()
    print(f"Feature importance plot saved as {plot_filename_importance}")


    # --- Generate Actual vs. Predicted Plot ---
    plt.figure(figsize=(8, 8))
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r', linewidth=2, label='Ideal Fit')
    plt.title(f'Actual vs. Predicted for {target_name} (Tuned Random Forest)')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.legend()
    plot_filename_pred = f'plot_pred_{target_name}.svg'
    plt.tight_layout()
    plt.savefig(plot_filename_pred, format='svg')
    plt.close()
    print(f"Prediction plot saved as {plot_filename_pred}")

    # --- Save Detailed Report ---
    report_filename = f'report_{target_name}.xlsx'
    
    # Create DataFrames for each sheet
    results_df = X_test.copy()
    results_df['actual_outcome'] = y_test
    results_df['predicted_outcome'] = y_pred
    
    summary_data = {
        'Metric': list(metrics_result.keys()),
        'Value': list(metrics_result.values())
    }
    summary_df = pd.DataFrame(summary_data)
    
    best_params_df = pd.DataFrame([random_search.best_params_])

    # Write to an Excel file with multiple sheets
    with pd.ExcelWriter(report_filename, engine='openpyxl') as writer:
        results_df.to_excel(writer, sheet_name='Test_Inputs_and_Predictions', index=False)
        feature_importance_df.to_excel(writer, sheet_name='Feature_Importances', index=False)
        summary_df.to_excel(writer, sheet_name='Performance_Metrics', index=False)
        best_params_df.to_excel(writer, sheet_name='Best_Hyperparameters', index=False)

    print(f"Detailed report saved to {report_filename}\n" + "-"*50 + "\n")


# --- Main Execution (No changes needed below) ---
if __name__ == "__main__":
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        exit()

    model_metrics_data = []

    for target in NEW_TARGET_COLUMNS:
        if target not in df.columns:
            print(f"Warning: Target column '{target}' not found. Skipping.")
            continue
        
        if not pd.api.types.is_numeric_dtype(df[target]):
            print(f"Warning: Target column '{target}' is not numeric. Skipping regression task.")
            continue

        current_inputs = [col for col in INPUT_COLUMNS if col in df.columns]
        temp_df = df[current_inputs + [target]].dropna()
        
        if len(temp_df) < 50:
             print(f"Warning: Too little data for '{target}' after dropping NaNs. Skipping.")
             continue

        X_filtered = temp_df[current_inputs]
        y = temp_df[target]
        
        train_and_evaluate_model(X_filtered, y, target, model_metrics_data)

    if model_metrics_data:
        metrics_df = pd.DataFrame(model_metrics_data)
        metrics_filename = 'model_performance_metrics.xlsx'
        metrics_df.to_excel(metrics_filename, index=False)
        print(f"✅ All performance metrics saved to '{metrics_filename}'.")

    print("\nAll tasks complete.")

--- Processing target: dam_height | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 30}
Training complete with the best model.
✅ Performance metrics for 'dam_height' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_dam_height.svg
Prediction plot saved as plot_pred_dam_height.svg
Detailed report saved to report_dam_height.xlsx
--------------------------------------------------

--- Processing target: max_storage_ac_ft | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10}
Training complete with the best model.
✅ Performance metrics for 'max_storage_ac_ft' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_max_storage_ac_ft.svg
Prediction plot saved as plot_pred_max_storage_ac_ft.svg
Detailed report saved to report_max_storage_ac_ft.xlsx
--------------------------------------------------

--- Processing target: surface_area_acres | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10}
Training complete with the best model.
✅ Performance metrics for 'surface_area_acres' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_surface_area_acres.svg
Prediction plot saved as plot_pred_surface_area_acres.svg
Detailed report saved to report_surface_area_acres.xlsx
--------------------------------------------------

--- Processing target: incident_date_year | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'max_depth': 20}
Training complete with the best model.
✅ Performance metrics for 'incident_date_year' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_incident_date_year.svg
Prediction plot saved as plot_pred_incident_date_year.svg
Detailed report saved to report_incident_date_year.xlsx
--------------------------------------------------

--- Processing target: incident_date_month | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'max_depth': 20}
Training complete with the best model.
✅ Performance metrics for 'incident_date_month' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_incident_date_month.svg
Prediction plot saved as plot_pred_incident_date_month.svg
Detailed report saved to report_incident_date_month.xlsx
--------------------------------------------------

--- Processing target: incident_date_day | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'max_depth': 20}
Training complete with the best model.
✅ Performance metrics for 'incident_date_day' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_incident_date_day.svg
Prediction plot saved as plot_pred_incident_date_day.svg
Detailed report saved to report_incident_date_day.xlsx
--------------------------------------------------

--- Processing target: incident_time_hour | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'max_depth': 20}
Training complete with the best model.
✅ Performance metrics for 'incident_time_hour' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_incident_time_hour.svg
Prediction plot saved as plot_pred_incident_time_hour.svg
Detailed report saved to report_incident_time_hour.xlsx
--------------------------------------------------

--- Processing target: number_of_people_evacuated | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20}
Training complete with the best model.
✅ Performance metrics for 'number_of_people_evacuated' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_number_of_people_evacuated.svg
Prediction plot saved as plot_pred_number_of_people_evacuated.svg
Detailed report saved to report_number_of_people_evacuated.xlsx
--------------------------------------------------

--- Processing target: number_of_habitable_structures_evacuated | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 20}
Training complete with the best model.
✅ Performance metrics for 'number_of_habitable_structures_evacuated' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_number_of_habitable_structures_evacuated.svg
Prediction plot saved as plot_pred_number_of_habitable_structures_evacuated.svg
Detailed report saved to report_number_of_habitable_structures_evacuated.xlsx
--------------------------------------------------

--- Processing target: number_of_habitable_structures_flooded | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': None}
Training complete with the best model.
✅ Performance metrics for 'number_of_habitable_structures_flooded' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_number_of_habitable_structures_flooded.svg
Prediction plot saved as plot_pred_number_of_habitable_structures_flooded.svg
Detailed report saved to report_number_of_habitable_structures_flooded.xlsx
--------------------------------------------------

--- Processing target: volume_released_at_failure_ac_ft | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 30}
Training complete with the best model.
✅ Performance metrics for 'volume_released_at_failure_ac_ft' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_volume_released_at_failure_ac_ft.svg
Prediction plot saved as plot_pred_volume_released_at_failure_ac_ft.svg
Detailed report saved to report_volume_released_at_failure_ac_ft.xlsx
--------------------------------------------------

--- Processing target: incident_duration | Task Type: REGRESSION (Random Forest with Tuning) ---
Searching for the best hyperparameters...
Best hyperparameters found: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10}
Training complete with the best model.
✅ Performance metrics for 'incident_duration' collected.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')


Feature importance plot saved as plot_importance_incident_duration.svg
Prediction plot saved as plot_pred_incident_duration.svg
Detailed report saved to report_incident_duration.xlsx
--------------------------------------------------

✅ All performance metrics saved to 'model_performance_metrics.xlsx'.

All tasks complete.


In [1]:
print("g")

g
