# Changing model to random forest
The key changes include:

Removing TensorFlow: All TensorFlow imports and the related training configuration (epochs, callbacks, etc.) have been removed.

Adding RandomForestRegressor: The script now imports and uses RandomForestRegressor from sklearn.ensemble.

Updating the Model Function: The build_model function has been replaced with build_random_forest_model, which instantiates the Random Forest model with some common hyperparameters.

Simplifying Training: The model training step (model.fit) is now a single line, as Random Forest doesn't require epochs or validation splits in the same way a neural network does.

This version will typically be much faster to run, because a Random Forest is fit in a single call rather than over many gradient-descent epochs.

In [2]:
import pandas as pd
import numpy as np
# --- MODIFIED: Import RandomForestRegressor and remove TensorFlow ---
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import os
import shutil

# --- Configuration ---
# Path of the CSV file loaded with pd.read_csv in the main block. It is a
# relative path, so it resolves against the current working directory.
DATA_FILE = '../regression_data.csv' # Make sure this path is correct

# Candidate feature columns. The main block keeps only the ones that are
# actually present in the loaded DataFrame.
INPUT_COLUMNS = [
    'state', 'downstream_hazard_potential', 'owner_type', 'dam_type',
    'primary_purpose_s', 'eap', 'year_completed', 'latitude', 'longitude', 'year_modified'
]

# Target columns. The main block trains one independent regressor per entry
# and skips any target that is missing from the data, is not numeric, or is
# left with fewer than 50 complete rows after dropping NaNs.
NEW_TARGET_COLUMNS = [
    'dam_height', 'max_storage_ac_ft', 'surface_area_acres',
    'incident_date_year', 'incident_date_month', 'incident_date_day',
    'incident_time_hour', 'number_of_people_evacuated',
    'number_of_habitable_structures_evacuated',
    'number_of_habitable_structures_flooded',
    'volume_released_at_failure_ac_ft', 'incident_duration'
]

# --- REMOVED: Keras/TensorFlow specific training configurations ---

# --- REPLACED: Model Building Function (Now for Random Forest) ---
def build_random_forest_model():
    """Create the unfitted Random Forest regressor used for every target.

    Returns:
        A RandomForestRegressor configured with 200 trees, no depth limit,
        at least two samples per leaf, a fixed seed and all CPU cores.
    """
    forest_params = {
        'n_estimators': 200,      # number of trees in the forest
        'max_depth': None,        # no cap: trees grow until the leaf rule stops them
        'min_samples_leaf': 2,    # every leaf must hold at least two samples
        'random_state': 42,       # fixed seed so repeated runs give the same forest
        'n_jobs': -1,             # -1 means use all available processors
    }
    return RandomForestRegressor(**forest_params)

# --- Main Processing Function ---
def _compute_regression_metrics(y_true, y_pred, n_features):
    """Compute the error and goodness-of-fit metrics for one set of predictions.

    Args:
        y_true: Observed target values for the evaluation rows.
        y_pred: Model predictions, in the same row order as ``y_true``.
        n_features: Number of predictors ``p`` used for the adjusted R-squared.

    Returns:
        dict keyed by the column names used in the metrics spreadsheet.
    """
    # Plain arrays keep the boolean masking below purely positional.
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    r2 = r2_score(actual, predicted)

    # Adjusted R-squared is only defined when n - p - 1 is positive.
    n_samples = actual.shape[0]
    dof = n_samples - n_features - 1
    adj_r2 = 1 - (1 - r2) * (n_samples - 1) / dof if dof > 0 else np.nan

    # MAPE is computed over rows with a non-zero actual value only, to avoid
    # dividing by zero; it is NaN when every actual value is zero.
    nonzero = actual != 0
    if nonzero.any():
        relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = np.nan

    return {
        'MAE': mean_absolute_error(actual, predicted),
        'MSE': mean_squared_error(actual, predicted),
        'R2_Score': r2,
        'Adjusted_R2_Score': adj_r2,
        'Explained_Variance_Score': explained_variance_score(actual, predicted),
        'MAPE (%)': mape,
    }


def _save_actual_vs_predicted_plot(y_test, y_pred, target_name):
    """Save an actual-vs-predicted scatter plot as SVG and return its file name."""
    plot_filename = f'plot_{target_name}.svg'
    fig = plt.figure(figsize=(8, 8))
    try:
        sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
        # Diagonal reference line: points on it are perfect predictions.
        bounds = [y_test.min(), y_test.max()]
        plt.plot(bounds, bounds, '--r', linewidth=2, label='Ideal Fit')
        plt.title(f'Actual vs. Predicted for {target_name} (Random Forest)')
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.legend()
        plt.tight_layout()
        plt.savefig(plot_filename, format='svg')
    finally:
        # Close the figure even if rendering or saving fails, so that a
        # failure does not leave it open.
        plt.close(fig)
    return plot_filename


def _save_prediction_report(X_test, y_test, y_pred, target_name):
    """Write the test inputs with actual and predicted values to Excel; return the file name."""
    report_filename = f'report_{target_name}.xlsx'
    results_df = X_test.copy()
    results_df['actual_outcome'] = y_test
    results_df['predicted_outcome'] = y_pred
    results_df.to_excel(report_filename, sheet_name='Test_Inputs_and_Predictions', index=False)
    return report_filename


# --- Main Processing Function ---
def train_and_evaluate_model(X, y, target_name, metrics_list):
    """Train a Random Forest for one target and record how well it performs.

    The data is split 80/20 with a fixed seed, the preprocessor is fitted on
    the training part only, and the model is evaluated on the held-out part.

    Args:
        X: DataFrame of input features (numeric and/or categorical columns).
        y: Series holding the numeric target, aligned with ``X``.
        target_name: Name of the target; used in log messages, in the metrics
            row and in the output file names.
        metrics_list: List that receives one dict of metrics for this target
            (mutated in place).

    Returns:
        None. Besides appending to ``metrics_list``, the function writes
        ``plot_<target_name>.svg`` and ``report_<target_name>.xlsx`` to the
        current working directory.
    """
    print(f"--- Processing target: {target_name} | Task Type: REGRESSION (Random Forest) ---")

    # --- Preprocessing ---
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns
    preprocessor = ColumnTransformer(
        transformers=[
            # Note: Scaling is less critical for Random Forest but kept for consistency
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ],
        remainder='passthrough'
    )

    # --- Train-Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the preprocessor on the training rows only, then reuse it for the test rows.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # --- Random Forest Model Training ---
    model = build_random_forest_model()

    print("Training the Random Forest model...")
    model.fit(X_train_processed, y_train)
    print("Training complete.")

    # --- Evaluation & Reporting ---
    y_pred = model.predict(X_test_processed)

    # p for the adjusted R-squared is the feature count after preprocessing
    # (i.e. including the one-hot expanded columns).
    metrics = _compute_regression_metrics(y_test, y_pred, X_test_processed.shape[1])
    metrics_list.append({
        'Model Output': target_name,
        'Task Type': 'Regression (Random Forest)',
        **metrics,
    })
    print(f"✅ Performance metrics for '{target_name}' collected.")

    # --- Generate Actual vs. Predicted Plot ---
    plot_filename = _save_actual_vs_predicted_plot(y_test, y_pred, target_name)
    print(f"Plot saved as {plot_filename}")

    # --- Save Detailed Report ---
    report_filename = _save_prediction_report(X_test, y_test, y_pred, target_name)
    print(f"Detailed report saved as {report_filename}\n" + "-"*40 + "\n")

# --- Main Execution ---
if __name__ == "__main__":
    # Load the data set; without it nothing else can run.
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        # exit() is a helper injected by the site module for interactive use
        # and, called without an argument, reports success (status 0).
        # Raise SystemExit with a non-zero status instead.
        raise SystemExit(1) from None

    # The usable input columns do not depend on the target, so compute them once.
    current_inputs = [col for col in INPUT_COLUMNS if col in df.columns]
    if not current_inputs:
        # Without any feature column the model fit would fail on an empty matrix.
        print(f"Error: None of the input columns were found in '{DATA_FILE}'.")
        raise SystemExit(1)

    # One metrics dict per successfully trained target.
    model_metrics_data = []

    for target in NEW_TARGET_COLUMNS:
        if target not in df.columns:
            print(f"Warning: Target column '{target}' not found. Skipping.")
            continue

        if not pd.api.types.is_numeric_dtype(df[target]):
            print(f"Warning: Target column '{target}' is not numeric. Skipping regression task.")
            continue

        # Keep only rows where every input and the target are present.
        temp_df = df[current_inputs + [target]].dropna()

        if len(temp_df) < 50:
            print(f"Warning: Too little data for '{target}' after dropping NaNs. Skipping.")
            continue

        X_filtered = temp_df[current_inputs]
        y = temp_df[target]

        train_and_evaluate_model(X_filtered, y, target, model_metrics_data)

    # Write the summary spreadsheet only if at least one model was trained.
    if model_metrics_data:
        metrics_df = pd.DataFrame(model_metrics_data)
        metrics_filename = 'model_performance_metrics.xlsx'
        metrics_df.to_excel(metrics_filename, index=False)
        print(f"✅ All performance metrics saved to '{metrics_filename}'.")

    print("\nAll tasks complete.")

--- Processing target: dam_height | Task Type: REGRESSION (Random Forest) ---
Training the Random Forest model...
Training complete.
✅ Performance metrics for 'dam_height' collected.
Plot saved as plot_dam_height.svg
Detailed report saved as report_dam_height.xlsx
----------------------------------------

--- Processing target: max_storage_ac_ft | Task Type: REGRESSION (Random Forest) ---
Training the Random Forest model...
Training complete.
✅ Performance metrics for 'max_storage_ac_ft' collected.
Plot saved as plot_max_storage_ac_ft.svg
Detailed report saved as report_max_storage_ac_ft.xlsx
----------------------------------------

--- Processing target: surface_area_acres | Task Type: REGRESSION (Random Forest) ---
Training the Random Forest model...
Training complete.
✅ Performance metrics for 'surface_area_acres' collected.
Plot saved as plot_surface_area_acres.svg
Detailed report saved as report_surface_area_acres.xlsx
----------------------------------------

--- Processing targ