In [None]:
# Upgrading model3 by adding three more blocks to our neural network structure

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import os
import shutil

# --- Configuration ---
DATA_FILE = '../regression_data.csv'  # Make sure this path is correct

# Feature columns offered to every model; the main loop keeps only those
# that are actually present in the loaded CSV.
INPUT_COLUMNS = [
    'state',
    'downstream_hazard_potential',
    'owner_type',
    'dam_type',
    'primary_purpose_s',
    'eap',
    'year_completed',
    'latitude',
    'longitude',
    'year_modified',
]

# One independent regression model is trained per entry in this list.
NEW_TARGET_COLUMNS = [
    'dam_height',
    'max_storage_ac_ft',
    'surface_area_acres',
    'incident_date_year',
    'incident_date_month',
    'incident_date_day',
    'incident_time_hour',
    'number_of_people_evacuated',
    'number_of_habitable_structures_evacuated',
    'number_of_habitable_structures_flooded',
    'volume_released_at_failure_ac_ft',
    'incident_duration',
]

# --- Training Configuration ---
EPOCHS = 150  # upper bound on epochs; early stopping may end training sooner
EARLY_STOPPING_PATIENCE = 15  # epochs without val_loss improvement before stopping
# Epochs without val_loss improvement before the learning rate is reduced.
REDUCE_LR_PATIENCE = 5

# --- UPGRADED: Model Building Function (More Powerful Architecture) ---
def build_model(input_shape):
    """Assemble and compile the six-block dense regression network.

    Every hidden block is the same four-layer stack: an L2-regularised
    ``Dense`` layer, ``LeakyReLU``, ``BatchNormalization`` and ``Dropout``.
    The blocks narrow from 512 to 128 units and are followed by a single
    linear output unit.

    Args:
        input_shape: Shape tuple of one input sample, forwarded to
            ``InputLayer``.

    Returns:
        A ``tf.keras.Sequential`` model compiled with Adam (learning rate
        0.001), mean-squared-error loss, and MAE/MSE metrics.
    """
    l2_strength = 0.001
    leaky_slope = 0.1
    initial_lr = 0.001

    # (units, dropout rate) for each hidden block, in stacking order.
    # Wider layers get heavier dropout; the last four blocks share a config.
    block_specs = [
        (512, 0.4),
        (256, 0.3),
        (128, 0.2),
        (128, 0.2),
        (128, 0.2),
        (128, 0.2),
    ]

    layers = tf.keras.layers
    network = tf.keras.Sequential()
    network.add(layers.InputLayer(input_shape=input_shape))

    for width, drop_rate in block_specs:
        network.add(
            layers.Dense(
                units=width,
                kernel_regularizer=tf.keras.regularizers.l2(l2_strength),
            )
        )
        network.add(layers.LeakyReLU(alpha=leaky_slope))
        network.add(layers.BatchNormalization())
        network.add(layers.Dropout(drop_rate))

    # Single linear unit: the model predicts one continuous value.
    network.add(layers.Dense(1, activation='linear'))

    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=initial_lr),
        loss='mean_squared_error',
        metrics=['mae', 'mse'],
    )
    return network

# --- Main Processing Function ---
def train_and_evaluate_model(X, y, target_name, metrics_list):
    """Train a regression network for one target and record its performance.

    The function splits the data 80/20 into train and test sets, fits the
    preprocessing (standard scaling for numeric columns, one-hot encoding
    for object/category columns) on the training part only, trains the
    model returned by ``build_model`` with early stopping and
    learning-rate reduction, and evaluates it on the test set.

    Side effects:
        * appends one metrics dict to ``metrics_list``;
        * writes ``plot_<target_name>.svg`` (actual vs. predicted scatter);
        * writes ``report_<target_name>.xlsx`` (test inputs plus actual and
          predicted values);
        * resets the global Keras state via ``clear_session``.

    Args:
        X: DataFrame of input features.
        y: Target values aligned with ``X`` (the caller passes a pandas
            Series; ``.min()``/``.max()`` and boolean-mask indexing are
            used on it).
        target_name: Name of the target, used in log messages, the metrics
            row, and output filenames.
        metrics_list: List that receives the metrics dict for this target.

    Returns:
        None.
    """
    print(f"--- Processing target: {target_name} | Task Type: REGRESSION ---")

    # This function is called once per target; without clearing, Keras keeps
    # global state for every model built so far and memory use keeps growing.
    tf.keras.backend.clear_session()

    # --- Preprocessing ---
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ],
        remainder='passthrough'
    )

    # --- Train-Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the preprocessing on the training split only, so no test-set
    # statistics leak into training.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # --- Standard Model Training ---
    input_shape = (X_train_processed.shape[1],)
    model = build_model(input_shape)

    print("Training the model...")
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=EARLY_STOPPING_PATIENCE,
        restore_best_weights=True
    )

    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,  # Reduce LR by a factor of 5 (1/5 = 0.2)
        patience=REDUCE_LR_PATIENCE,
        min_lr=1e-6,  # Don't let the learning rate get too small
        verbose=0  # Set to 1 to see messages when LR is reduced
    )

    # The returned History object is not used, so it is not kept.
    model.fit(
        X_train_processed,
        y_train,
        epochs=EPOCHS,
        validation_split=0.2,
        callbacks=[early_stopping, reduce_lr],
        verbose=0  # Set to 1 if you want to see training progress
    )
    print("Training complete.")

    # --- Evaluation & Reporting ---
    # predict() returns shape (n_samples, 1); flatten to a 1-D vector.
    y_pred = model.predict(X_test_processed).flatten()

    metrics_result = {
        'Model Output': target_name,
        'Task Type': 'Regression',
        **_compute_regression_metrics(
            y_test,
            y_pred,
            n_samples=X_test_processed.shape[0],
            n_features=X_test_processed.shape[1],
        ),
    }

    metrics_list.append(metrics_result)
    print(f"✅ Performance metrics for '{target_name}' collected.")

    # --- Generate Actual vs. Predicted Plot ---
    plt.figure(figsize=(8, 8))
    plt.scatter(y_test, y_pred, alpha=0.5)
    # Dashed red diagonal marks perfect predictions (predicted == actual).
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r', linewidth=2)
    plt.title(f'Actual vs. Predicted for {target_name}')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')

    plot_filename = f'plot_{target_name}.svg'
    plt.tight_layout()
    plt.savefig(plot_filename, format='svg')
    plt.close()
    print(f"Plot saved as {plot_filename}")

    # --- Save Detailed Report ---
    report_filename = f'report_{target_name}.xlsx'
    results_df = X_test.copy()
    results_df['actual_outcome'] = y_test
    results_df['predicted_outcome'] = y_pred
    results_df.to_excel(report_filename, sheet_name='Test_Inputs_and_Predictions', index=False)
    print(f"Detailed report saved as {report_filename}\n" + "-"*40 + "\n")


def _compute_regression_metrics(y_true, y_pred, n_samples, n_features):
    """Return the regression metrics for one set of test predictions.

    Args:
        y_true: Actual target values.
        y_pred: 1-D array of predicted values, positionally aligned with
            ``y_true``.
        n_samples: Number of test samples (``n`` in adjusted R-squared).
        n_features: Number of model input features after preprocessing
            (``p`` in adjusted R-squared).

    Returns:
        Dict with keys ``MAE``, ``MSE``, ``R2_Score``, ``Adjusted_R2_Score``,
        ``Explained_Variance_Score`` and ``MAPE (%)``. Adjusted R-squared is
        NaN when ``n_samples - n_features - 1 <= 0``; MAPE is NaN when every
        actual value is zero.
    """
    # Dimensional metrics
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)

    # Non-dimensional metrics
    r2 = r2_score(y_true, y_pred)
    evs = explained_variance_score(y_true, y_pred)

    # Adjusted R-squared is undefined without positive degrees of freedom.
    degrees_of_freedom = n_samples - n_features - 1
    if degrees_of_freedom > 0:
        adj_r2 = 1 - (1 - r2) * (n_samples - 1) / degrees_of_freedom
    else:
        adj_r2 = np.nan

    # MAPE divides by the actual value, so zero actuals are excluded.
    nonzero_mask = y_true != 0
    y_true_non_zero = y_true[nonzero_mask]
    y_pred_non_zero = y_pred[nonzero_mask]
    if len(y_true_non_zero) > 0:
        mape = np.mean(np.abs((y_true_non_zero - y_pred_non_zero) / y_true_non_zero)) * 100
    else:
        mape = np.nan

    return {
        'MAE': mae,
        'MSE': mse,
        'R2_Score': r2,
        'Adjusted_R2_Score': adj_r2,
        'Explained_Variance_Score': evs,
        'MAPE (%)': mape,
    }

# --- Main Execution ---
if __name__ == "__main__":
    # Load the dataset, train one regression model per target column, and
    # save the collected metrics of all trained models to a single workbook.
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        # `exit()` is an interactive helper injected by `site` and would
        # report success; raise SystemExit with a non-zero status instead.
        raise SystemExit(1) from None

    model_metrics_data = []

    # Minimum number of complete rows required to attempt training a target.
    MIN_ROWS_PER_TARGET = 50

    # The usable input columns do not depend on the target, so compute once.
    current_inputs = [col for col in INPUT_COLUMNS if col in df.columns]

    for target in NEW_TARGET_COLUMNS:
        if target not in df.columns:
            print(f"Warning: Target column '{target}' not found. Skipping.")
            continue

        if not pd.api.types.is_numeric_dtype(df[target]):
            print(f"Warning: Target column '{target}' is not numeric. Skipping regression task.")
            continue

        # Keep only rows where every input and the target are present.
        temp_df = df[current_inputs + [target]].dropna()

        if len(temp_df) < MIN_ROWS_PER_TARGET:
            print(f"Warning: Too little data for '{target}' after dropping NaNs. Skipping.")
            continue

        X_filtered = temp_df[current_inputs]
        y = temp_df[target]

        train_and_evaluate_model(X_filtered, y, target, model_metrics_data)

    # Write the summary only when at least one target was actually trained.
    if model_metrics_data:
        metrics_df = pd.DataFrame(model_metrics_data)
        metrics_filename = 'model_performance_metrics.xlsx'
        metrics_df.to_excel(metrics_filename, index=False)
        print(f"✅ All performance metrics saved to '{metrics_filename}'.")

    print("\nAll tasks complete.")


--- Processing target: dam_height | Task Type: REGRESSION ---

Training the model...


Training complete.
✅ Performance metrics for 'dam_height' collected.
Plot saved as plot_dam_height.svg
Detailed report saved as report_dam_height.xlsx
----------------------------------------

--- Processing target: max_storage_ac_ft | Task Type: REGRESSION ---
Training the model...
Training complete.
✅ Performance metrics for 'max_storage_ac_ft' collected.
Plot saved as plot_max_storage_ac_ft.svg
Detailed report saved as report_max_storage_ac_ft.xlsx
----------------------------------------

--- Processing target: surface_area_acres | Task Type: REGRESSION ---
Training the model...
Training complete.
✅ Performance metrics for 'surface_area_acres' collected.
Plot saved as plot_surface_area_acres.svg
Detailed report saved as report_surface_area_acres.xlsx
----------------------------------------

--- Processing target: incident_date_year | Task Type: REGRESSION ---
Training the model...
Training comp