In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl

try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("Error: The 'imbalanced-learn' library is required but not installed.")
    print("Please install it by running the following command in your terminal:")
    print("pip install imbalanced-learn")
    exit()


# --- Configuration ---
DATA_FILE = '../encoded_dam_data.csv'
INPUT_COLUMNS = [
    'state', 'downstream_hazard_potential', 'incident_date', 'incident_time',
    'incident_driver', 'owner_type', 'dam_type', 'primary_purpose_s', 'eap',
    'dam_height', 'max_storage_ac_ft', 'surface_area_acres', 'year_completed',
    'latitude', 'longitude', 'year_modified'
]
OUTPUT_COLUMNS = [
    'incident_type', 'incident_mechanism_1', 'incident_mechanism_2',
    'incident_mechanism_3', 'eap_enacted_y_n_due_to_incident',
    'fatalities_number', 'number_of_people_evacuated',
    'number_of_habitable_structures_evacuated',
    'number_of_habitable_structures_flooded', 'other_infrastructure_impacts',
    'response', 'volume_released_at_failure_ac_ft', 'incident_duration',
    'incident_report_produced'
]

# --- Main Processing Function ---
def train_and_evaluate_model(X, y, target_name):
    """
    Trains a neural network model for a given target variable, saves the model,
    and generates evaluation files (confusion matrix SVG and Excel report).
    """
    print(f"--- Processing target: {target_name} ---")

    # --- Pre-split Data Cleaning for Stratification ---
    # Stratified split requires at least 2 members per class.
    # We identify classes with only one sample and remove them.
    value_counts = y.value_counts()
    single_sample_classes = value_counts[value_counts < 2].index

    if not single_sample_classes.empty:
        print(f"Warning for target '{target_name}': The following classes have only 1 sample and will be removed: {list(single_sample_classes)}")
        # Keep only the data that does not belong to the single-sample classes
        original_count = len(y)
        mask = ~y.isin(single_sample_classes)
        X = X[mask].copy() # Use .copy() to avoid SettingWithCopyWarning
        y = y[mask].copy()
        print(f"Removed {original_count - len(y)} rows.")

    # If after cleaning, we have less than 2 classes, we can't classify.
    if y.nunique() < 2:
        print(f"Skipping '{target_name}' because it has less than 2 valid classes after cleaning.\n")
        return # Exit the function for this target


    # Identify categorical and numerical features
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns

    # Create preprocessing pipelines for numerical and categorical features
    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    # Create a preprocessor object using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Convert target variable to categorical and then to integer codes
    y_series = pd.Series(y).astype('category')
    y_codes = y_series.cat.codes
    class_names = y_series.cat.categories.tolist()

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_codes, test_size=0.2, random_state=42, stratify=y_codes)

    # Apply preprocessing
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Handle class imbalance using SMOTE
    # Note: SMOTE is applied only to the training data
    if len(class_names) > 1:
        # SMOTE requires the number of neighbors to be less than the number of samples in the smallest class.
        # We check the training data for the smallest class count.
        min_class_samples = pd.Series(y_train).value_counts().min()

        # We can only use SMOTE if the smallest class has at least 2 samples.
        if min_class_samples > 1:
            # We must set k_neighbors to be less than the number of samples in the smallest class.
            # We'll use the default of 5 if possible, otherwise we reduce it.
            k_neighbors = min(5, min_class_samples - 1)

            print("Applying SMOTE for class imbalance...")
            print(f"Smallest class in training set has {min_class_samples} samples. Adjusting SMOTE k_neighbors to {k_neighbors}.")
            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
        else:
            print(f"Skipping SMOTE for '{target_name}': the smallest class in the training set has only {min_class_samples} sample(s).")
            X_train_resampled, y_train_resampled = X_train_processed, y_train
    else:
        X_train_resampled, y_train_resampled = X_train_processed, y_train


    # --- Build the Neural Network Model ---
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train_resampled.shape[1],)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(class_names), activation='softmax')
    ])

    # Compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # --- Train the Model ---
    print("Training the model...")
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    history = model.fit(X_train_resampled, y_train_resampled,
                        epochs=100,
                        validation_split=0.2,
                        callbacks=[early_stopping],
                        verbose=0) # Set to 1 to see training progress

    # --- Save the Trained Model ---
    model_filename = f'model_{target_name}.h5'
    model.save(model_filename)
    print(f"Model saved as {model_filename}")

    # --- Evaluate the Model and Generate Confusion Matrix ---
    y_pred_proba = model.predict(X_test_processed)
    y_pred = np.argmax(y_pred_proba, axis=1)

    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

    # --- Save Confusion Matrix as SVG ---
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {target_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    svg_filename = f'confusion_matrix_{target_name}.svg'
    plt.savefig(svg_filename, format='svg')
    plt.close()
    print(f"Confusion matrix saved as {svg_filename}")

    # --- Save Results to Excel ---
    excel_filename = f'report_{target_name}.xlsx'
    with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
        cm_df.to_excel(writer, sheet_name='Confusion Matrix')
        X_test.assign(actual=y_series.iloc[y_test.index].values, predicted=y_series.cat.categories[y_pred]).to_excel(writer, sheet_name='Test Inputs and Predictions')
    print(f"Excel report saved as {excel_filename}")
    print("-" * 40 + "\n")


# --- Main Execution ---
if __name__ == "__main__":
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        print("Please make sure the CSV file is in the same directory as the script.")
        exit()

    # Drop rows where any of the target columns are missing
    df.dropna(subset=OUTPUT_COLUMNS, inplace=True)

    # Convert date/time columns to numerical features
    for col in ['incident_date', 'incident_time']:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
            df[f'{col}_year'] = df[col].dt.year
            df[f'{col}_month'] = df[col].dt.month
            df[f'{col}_day'] = df[col].dt.day
            if col == 'incident_time':
                 df[f'{col}_hour'] = df[col].dt.hour
            df.drop(columns=[col], inplace=True)
            # Add new generated columns to input columns list
            INPUT_COLUMNS.remove(col)
            INPUT_COLUMNS.extend([c for c in df.columns if c.startswith(col)])


    X = df[INPUT_COLUMNS]

    for target in OUTPUT_COLUMNS:
        if df[target].nunique() < 2:
            print(f"Skipping '{target}' because it has less than 2 unique values.")
            continue
        y = df[target]
        train_and_evaluate_model(X, y, target)

    print("All models have been trained and evaluated.")





--- Processing target: incident_type ---
Removed 1 rows.
Applying SMOTE for class imbalance...
Smallest class in training set has 389 samples. Adjusting SMOTE k_neighbors to 5.
Training the model...


  saving_api.save_model(


Model saved as model_incident_type.h5
Confusion matrix saved as confusion_matrix_incident_type.svg
Excel report saved as report_incident_type.xlsx
----------------------------------------

--- Processing target: incident_mechanism_1 ---
Removed 3 rows.
Applying SMOTE for class imbalance...
Smallest class in training set has 2 samples. Adjusting SMOTE k_neighbors to 1.
Training the model...


  saving_api.save_model(


Model saved as model_incident_mechanism_1.h5


ValueError: Shape of passed values is (26, 26), indices imply (27, 27)

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import os

# Try to import imblearn, provide install instructions if it fails
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("Error: The 'imbalanced-learn' library is required but not installed.")
    print("Please install it by running the following command in your terminal:")
    print("pip install imbalanced-learn")
    exit()


# --- Configuration ---
DATA_FILE = '../encoded_dam_data.csv' # Make sure this path is correct
INPUT_COLUMNS = [
    'state', 'downstream_hazard_potential', 'incident_date', 'incident_time',
    'incident_driver', 'owner_type', 'dam_type', 'primary_purpose_s', 'eap',
    'dam_height', 'max_storage_ac_ft', 'surface_area_acres', 'year_completed',
    'latitude', 'longitude', 'year_modified'
]
OUTPUT_COLUMNS = [
    'incident_type', 'incident_mechanism_1', 'incident_mechanism_2',
    'incident_mechanism_3', 'eap_enacted_y_n_due_to_incident',
    'fatalities_number', 'number_of_people_evacuated',
    'number_of_habitable_structures_evacuated',
    'number_of_habitable_structures_flooded', 'other_infrastructure_impacts',
    'response', 'volume_released_at_failure_ac_ft', 'incident_duration',
    'incident_report_produced'
]

# --- Main Processing Function ---
def train_and_evaluate_model(X, y, target_name, excel_writer):
    """
    Trains a neural network, saves the model, generates evaluation files,
    and writes the confusion matrix to a shared Excel writer object.
    """
    print(f"--- Processing target: {target_name} ---")

    # --- Pre-split Data Cleaning for Stratification ---
    # Stratified split requires at least 2 members per class.
    value_counts = y.value_counts()
    single_sample_classes = value_counts[value_counts < 2].index

    if not single_sample_classes.empty:
        print(f"Warning for target '{target_name}': Removing classes with only 1 sample: {list(single_sample_classes)}")
        original_count = len(y)
        mask = ~y.isin(single_sample_classes)
        X = X[mask].copy()
        y = y[mask].copy()
        print(f"Removed {original_count - len(y)} rows.")

    if y.nunique() < 2:
        print(f"Skipping '{target_name}' because it has fewer than 2 valid classes after cleaning.\n")
        return

    # Identify categorical and numerical features from the provided X
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns

    # Create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough' # Keep other columns if any
    )

    # Convert target variable to categorical codes for the model
    y_series = pd.Series(y).astype('category')
    y_codes = y_series.cat.codes
    class_names = y_series.cat.categories.tolist()

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_codes, test_size=0.2, random_state=42, stratify=y_codes)

    # Apply preprocessing
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Handle class imbalance using SMOTE on the training data only
    min_class_samples = pd.Series(y_train).value_counts().min()
    if y_series.nunique() > 1 and min_class_samples > 1:
        k_neighbors = min(5, min_class_samples - 1)
        print(f"Applying SMOTE... Smallest class in training set has {min_class_samples} samples. Using k_neighbors={k_neighbors}.")
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
    else:
        print(f"Skipping SMOTE for '{target_name}': smallest class in training set has {min_class_samples} sample(s).")
        X_train_resampled, y_train_resampled = X_train_processed, y_train

    # --- Build the Neural Network Model ---
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train_resampled.shape[1],)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(class_names), activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # --- Train the Model ---
    print("Training the model...")
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train_resampled, y_train_resampled,
              epochs=100,
              validation_split=0.2,
              callbacks=[early_stopping],
              verbose=0) # Set to 1 to see training progress

    # --- Save the Trained Model ---
    model_filename = f'model_{target_name}.h5'
    model.save(model_filename)
    print(f"Model saved as {model_filename}")

    # --- Evaluate the Model ---
    y_pred_proba = model.predict(X_test_processed)
    y_pred = np.argmax(y_pred_proba, axis=1)

    # Define the full range of possible class labels
    all_class_labels = range(len(class_names))
    # Create the confusion matrix using all possible labels to ensure correct shape
    cm = confusion_matrix(y_test, y_pred, labels=all_class_labels)
    
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

    # --- Save Confusion Matrix as SVG ---
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {target_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    svg_filename = f'confusion_matrix_{target_name}.svg'
    plt.savefig(svg_filename, format='svg')
    plt.close()
    print(f"Confusion matrix plot saved as {svg_filename}")

    # --- Write Confusion Matrix to the shared Excel file ---
    # The sheet name is sanitized to be valid
    safe_sheet_name = f'CM_{target_name[:25]}'
    cm_df.to_excel(excel_writer, sheet_name=safe_sheet_name)
    print(f"Confusion matrix data added to Excel sheet: '{safe_sheet_name}'")

    # --- Save individual detailed report to a separate Excel file ---
    report_filename = f'report_{target_name}.xlsx'
    results_df = X_test.copy()
    results_df['actual_outcome'] = y.loc[X_test.index]
    results_df['predicted_outcome'] = [class_names[i] for i in y_pred]
    results_df.to_excel(report_filename, sheet_name='Test_Inputs_and_Predictions')
    print(f"Detailed report saved as {report_filename}")
    print("-" * 40 + "\n")


# --- Main Execution ---
if __name__ == "__main__":
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        print("Please ensure the file path is correct.")
        exit()

    # Drop rows where any of the target columns are missing
    df.dropna(subset=OUTPUT_COLUMNS, inplace=True)

    # Create a copy of input columns to modify
    processed_input_cols = INPUT_COLUMNS.copy()

    # Convert date/time columns to numerical features
    for col in ['incident_date', 'incident_time']:
        if col in df.columns and col in processed_input_cols:
            df[col] = pd.to_datetime(df[col], errors='coerce')
            df[f'{col}_year'] = df[col].dt.year
            df[f'{col}_month'] = df[col].dt.month
            df[f'{col}_day'] = df[col].dt.day
            new_cols = [f'{col}_year', f'{col}_month', f'{col}_day']
            if col == 'incident_time':
                df[f'{col}_hour'] = df[col].dt.hour
                new_cols.append(f'{col}_hour')
            
            # Update the list of columns to be used as inputs
            processed_input_cols.remove(col)
            processed_input_cols.extend(new_cols)
    
    # Drop rows with NaT in date columns after coercion
    df.dropna(subset=processed_input_cols, inplace=True)
    
    # Define X *after* processing columns
    # Ensure all columns exist in the dataframe before selection
    final_input_cols = [col for col in processed_input_cols if col in df.columns]
    X = df[final_input_cols]

    # Create a single Excel writer for all confusion matrices
    excel_cm_filename = 'all_confusion_matrices.xlsx'
    with pd.ExcelWriter(excel_cm_filename, engine='openpyxl') as writer:
        print(f"Starting model training loop. All confusion matrices will be saved in '{excel_cm_filename}'.\n")
        # Loop through each target variable and train a model
        for target in OUTPUT_COLUMNS:
            if df[target].nunique() < 2:
                print(f"Skipping '{target}' because it has less than 2 unique values.")
                continue
            y = df[target]
            # Pass the writer object to the function
            train_and_evaluate_model(X, y, target, writer)

    print("✅ All models have been trained and evaluated.")

Starting model training loop. All confusion matrices will be saved in 'all_confusion_matrices.xlsx'.

--- Processing target: incident_type ---
Removed 1 rows.
Applying SMOTE... Smallest class in training set has 389 samples. Using k_neighbors=5.
Training the model...
Model saved as model_incident_type.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_type.svg
Confusion matrix data added to Excel sheet: 'CM_incident_type'
Detailed report saved as report_incident_type.xlsx
----------------------------------------

--- Processing target: incident_mechanism_1 ---
Removed 3 rows.
Applying SMOTE... Smallest class in training set has 2 samples. Using k_neighbors=1.
Training the model...
Model saved as model_incident_mechanism_1.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_mechanism_1.svg
Confusion matrix data added to Excel sheet: 'CM_incident_mechanism_1'
Detailed report saved as report_incident_mechanism_1.xlsx
----------------------------------------

--- Processing target: incident_mechanism_2 ---
Removed 5 rows.
Applying SMOTE... Smallest class in training set has 2 samples. Using k_neighbors=1.
Training the model...
Model saved as model_incident_mechanism_2.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_mechanism_2.svg
Confusion matrix data added to Excel sheet: 'CM_incident_mechanism_2'
Detailed report saved as report_incident_mechanism_2.xlsx
----------------------------------------

--- Processing target: incident_mechanism_3 ---
Removed 7 rows.
Applying SMOTE... Smallest class in training set has 2 samples. Using k_neighbors=1.
Training the model...



KeyboardInterrupt

