In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import os

# Try to import imblearn, provide install instructions if it fails
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("Error: The 'imbalanced-learn' library is required but not installed.")
    print("Please install it by running the following command in your terminal:")
    print("pip install imbalanced-learn")
    # `exit()` is an interactive-shell helper injected by `site`; raising
    # SystemExit stops execution with a non-zero status everywhere.
    raise SystemExit(1)


# --- Configuration ---
DATA_FILE = '../encoded_dam_data.csv'  # Make sure this path is correct

# Columns selected from the data file as model inputs.
INPUT_COLUMNS = [
    'state',
    'downstream_hazard_potential',
    'incident_date',
    'incident_time',
    'incident_driver',
    'owner_type',
    'dam_type',
    'primary_purpose_s',
    'eap',
    'dam_height',
    'max_storage_ac_ft',
    'surface_area_acres',
    'year_completed',
    'latitude',
    'longitude',
    'year_modified',
]

# Target columns; the main loop trains one model per entry.
OUTPUT_COLUMNS = [
    'incident_type',
    'incident_mechanism_1',
    'incident_mechanism_2',
    'incident_mechanism_3',
    'eap_enacted_y_n_due_to_incident',
    'fatalities_number',
    'number_of_people_evacuated',
    'number_of_habitable_structures_evacuated',
    'number_of_habitable_structures_flooded',
    'other_infrastructure_impacts',
    'response',
    'volume_released_at_failure_ac_ft',
    'incident_duration',
    'incident_report_produced',
]

# --- Main Processing Function ---
def train_and_evaluate_model(X, y, target_name, excel_writer):
    """
    Train a softmax classifier for one target column and write its artifacts.

    Files written to the working directory:
      * ``model_<target_name>.h5``             - the trained Keras model
      * ``confusion_matrix_<target_name>.svg`` - heatmap of the test-set confusion matrix
      * ``report_<target_name>.xlsx``          - test rows with actual/predicted labels
    In addition, the confusion matrix is written to a new sheet of ``excel_writer``.

    Args:
        X: DataFrame of input features (numeric and/or object/category columns).
        y: Series of target labels sharing X's index.
        target_name: Name of the target; used in log messages, file names and
            the Excel sheet name.
        excel_writer: Open ``pd.ExcelWriter`` shared by all targets.

    Returns:
        None. The target is skipped (nothing is written) when fewer than two
        classes remain after dropping classes that have a single sample.
    """
    print(f"--- Processing target: {target_name} ---")

    # --- Pre-split Data Cleaning for Stratification ---
    # Stratified split requires at least 2 members per class.
    value_counts = y.value_counts()
    single_sample_classes = value_counts[value_counts < 2].index

    if not single_sample_classes.empty:
        print(f"Warning for target '{target_name}': Removing classes with only 1 sample: {list(single_sample_classes)}")
        original_count = len(y)
        mask = ~y.isin(single_sample_classes)
        X = X[mask].copy()
        y = y[mask].copy()
        print(f"Removed {original_count - len(y)} rows.")

    if y.nunique() < 2:
        print(f"Skipping '{target_name}' because it has fewer than 2 valid classes after cleaning.\n")
        return

    # Identify categorical and numerical features from the provided X
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns

    # Scale numeric columns, one-hot encode categorical ones, pass the rest through.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Convert target variable to integer codes; class_names[i] is the label of code i.
    y_series = pd.Series(y).astype('category')
    y_codes = y_series.cat.codes
    class_names = y_series.cat.categories.tolist()

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_codes, test_size=0.2, random_state=42, stratify=y_codes
    )

    # Fit the preprocessing on the training split only, then apply it to both.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Handle class imbalance using SMOTE on the training data only.
    # SMOTE needs k_neighbors < size of the smallest class, hence the cap.
    min_class_samples = pd.Series(y_train).value_counts().min()
    if min_class_samples > 1:
        k_neighbors = min(5, min_class_samples - 1)
        print(f"Applying SMOTE... Smallest class in training set has {min_class_samples} samples. Using k_neighbors={k_neighbors}.")
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
    else:
        print(f"Skipping SMOTE for '{target_name}': smallest class in training set has {min_class_samples} sample(s).")
        X_train_resampled, y_train_resampled = X_train_processed, y_train

    # Keras' validation_split takes the LAST fraction of the arrays (before its
    # own shuffling) and SMOTE returns the original rows followed by the
    # synthetic ones. Without this shuffle the validation set used for early
    # stopping would consist almost entirely of synthetic minority samples.
    shuffle_order = np.random.RandomState(42).permutation(X_train_resampled.shape[0])
    if hasattr(X_train_resampled, 'iloc'):
        X_train_resampled = X_train_resampled.iloc[shuffle_order]
    else:
        X_train_resampled = X_train_resampled[shuffle_order]
    y_train_resampled = np.asarray(y_train_resampled)[shuffle_order]

    # --- Build the Neural Network Model ---
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train_resampled.shape[1],)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(class_names), activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # --- Train the Model ---
    print("Training the model...")
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train_resampled, y_train_resampled,
              epochs=100,
              validation_split=0.2,
              callbacks=[early_stopping],
              verbose=0)  # Set to 1 to see training progress

    # --- Save the Trained Model ---
    model_filename = f'model_{target_name}.h5'
    model.save(model_filename)
    print(f"Model saved as {model_filename}")

    # --- Evaluate the Model ---
    y_pred_proba = model.predict(X_test_processed)
    y_pred = np.argmax(y_pred_proba, axis=1)

    # Pass every class code explicitly so the matrix stays square even when a
    # class is absent from both the test labels and the predictions.
    all_class_labels = range(len(class_names))
    cm = confusion_matrix(y_test, y_pred, labels=all_class_labels)

    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

    # --- Save Confusion Matrix as SVG ---
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {target_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    svg_filename = f'confusion_matrix_{target_name}.svg'
    plt.savefig(svg_filename, format='svg')
    plt.close()
    print(f"Confusion matrix plot saved as {svg_filename}")

    # --- Write Confusion Matrix to the shared Excel file ---
    # Excel limits sheet names to 31 characters and forbids []:*?/\ .
    # Truncation alone can map two targets to the same name (for example the
    # two 'number_of_habitable_structures_*' targets), which would make the
    # second matrix overwrite the first, so a numeric suffix is added on clash.
    invalid_chars = str.maketrans({char: '_' for char in '[]:*?/\\'})
    base_sheet_name = f'CM_{target_name[:25]}'.translate(invalid_chars)
    safe_sheet_name = base_sheet_name
    suffix = 2
    while safe_sheet_name in excel_writer.sheets:
        safe_sheet_name = f'{base_sheet_name}_{suffix}'
        suffix += 1
    cm_df.to_excel(excel_writer, sheet_name=safe_sheet_name)
    print(f"Confusion matrix data added to Excel sheet: '{safe_sheet_name}'")

    # --- Save individual detailed report to a separate Excel file ---
    report_filename = f'report_{target_name}.xlsx'
    results_df = X_test.copy()
    results_df['actual_outcome'] = y.loc[X_test.index]
    results_df['predicted_outcome'] = [class_names[i] for i in y_pred]
    results_df.to_excel(report_filename, sheet_name='Test_Inputs_and_Predictions')
    print(f"Detailed report saved as {report_filename}")
    print("-" * 40 + "\n")


# --- Main Execution ---
if __name__ == "__main__":
    # Entry point: load the CSV, derive numeric date/time features, then train
    # one model per target while collecting every confusion matrix in a single
    # Excel workbook.
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        print("Please ensure the file path is correct.")
        # `exit()` is an interactive-shell helper injected by `site`; raising
        # SystemExit stops execution with a non-zero status everywhere.
        raise SystemExit(1)

    # Drop rows where any of the target columns are missing
    df.dropna(subset=OUTPUT_COLUMNS, inplace=True)

    # Create a copy of input columns to modify
    processed_input_cols = INPUT_COLUMNS.copy()

    # Convert date/time columns to numerical features
    for col in ['incident_date', 'incident_time']:
        if col in df.columns and col in processed_input_cols:
            # Unparseable values become NaT and are dropped further down.
            df[col] = pd.to_datetime(df[col], errors='coerce')
            df[f'{col}_year'] = df[col].dt.year
            df[f'{col}_month'] = df[col].dt.month
            df[f'{col}_day'] = df[col].dt.day
            new_cols = [f'{col}_year', f'{col}_month', f'{col}_day']
            if col == 'incident_time':
                df[f'{col}_hour'] = df[col].dt.hour
                new_cols.append(f'{col}_hour')

            # Update the list of columns to be used as inputs
            processed_input_cols.remove(col)
            processed_input_cols.extend(new_cols)

    # Restrict to columns that exist BEFORE calling dropna: dropna raises
    # KeyError for unknown subset labels, which would make this filter moot.
    final_input_cols = [col for col in processed_input_cols if col in df.columns]
    missing_input_cols = [col for col in processed_input_cols if col not in df.columns]
    if missing_input_cols:
        print(f"Warning: input columns not found in the data and ignored: {missing_input_cols}")

    # Drop rows with missing inputs (including NaT-derived date parts)
    df.dropna(subset=final_input_cols, inplace=True)

    # Define X *after* processing columns
    X = df[final_input_cols]

    # Create a single Excel writer for all confusion matrices
    excel_cm_filename = 'all_confusion_matrices.xlsx'
    with pd.ExcelWriter(excel_cm_filename, engine='openpyxl') as writer:
        print(f"Starting model training loop. All confusion matrices will be saved in '{excel_cm_filename}'.\n")
        # Loop through each target variable and train a model
        for target in OUTPUT_COLUMNS:
            if df[target].nunique() < 2:
                print(f"Skipping '{target}' because it has less than 2 unique values.")
                continue
            y = df[target]
            # Pass the writer object to the function
            train_and_evaluate_model(X, y, target, writer)

    print("✅ All models have been trained and evaluated.")

Starting model training loop. All confusion matrices will be saved in 'all_confusion_matrices.xlsx'.

--- Processing target: incident_type ---
Removed 1 rows.
Applying SMOTE... Smallest class in training set has 389 samples. Using k_neighbors=5.
Training the model...
Model saved as model_incident_type.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_type.svg
Confusion matrix data added to Excel sheet: 'CM_incident_type'
Detailed report saved as report_incident_type.xlsx
----------------------------------------

--- Processing target: incident_mechanism_1 ---
Removed 3 rows.
Applying SMOTE... Smallest class in training set has 2 samples. Using k_neighbors=1.
Training the model...
Model saved as model_incident_mechanism_1.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_mechanism_1.svg
Confusion matrix data added to Excel sheet: 'CM_incident_mechanism_1'
Detailed report saved as report_incident_mechanism_1.xlsx
----------------------------------------

--- Processing target: incident_mechanism_2 ---
Removed 5 rows.
Applying SMOTE... Smallest class in training set has 2 samples. Using k_neighbors=1.
Training the model...
Model saved as model_incident_mechanism_2.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_mechanism_2.svg
Confusion matrix data added to Excel sheet: 'CM_incident_mechanism_2'
Detailed report saved as report_incident_mechanism_2.xlsx
----------------------------------------

--- Processing target: incident_mechanism_3 ---
Removed 7 rows.
Applying SMOTE... Smallest class in training set has 2 samples. Using k_neighbors=1.
Training the model...



KeyboardInterrupt



In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import os

# Try to import imblearn, provide install instructions if it fails
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("Error: The 'imbalanced-learn' library is required but not installed.")
    print("Please install it by running the following command in your terminal:")
    print("pip install imbalanced-learn")
    # `exit()` is an interactive-shell helper injected by `site`; raising
    # SystemExit stops execution with a non-zero status everywhere.
    raise SystemExit(1)


# --- Configuration ---
DATA_FILE = '../encoded_dam_data.csv'  # Make sure this path is correct

# Columns selected from the data file as model inputs.
INPUT_COLUMNS = [
    'state',
    'downstream_hazard_potential',
    'incident_date',
    'incident_time',
    'incident_driver',
    'owner_type',
    'dam_type',
    'primary_purpose_s',
    'eap',
    'dam_height',
    'max_storage_ac_ft',
    'surface_area_acres',
    'year_completed',
    'latitude',
    'longitude',
    'year_modified',
]

# Target columns; the main loop trains one model per entry.
OUTPUT_COLUMNS = [
    'incident_type',
    'incident_mechanism_1',
    'incident_mechanism_2',
    'incident_mechanism_3',
    'eap_enacted_y_n_due_to_incident',
    'fatalities_number',
    'number_of_people_evacuated',
    'number_of_habitable_structures_evacuated',
    'number_of_habitable_structures_flooded',
    'other_infrastructure_impacts',
    'response',
    'volume_released_at_failure_ac_ft',
    'incident_duration',
    'incident_report_produced',
]

# --- Main Processing Function ---
def train_and_evaluate_model(X, y, target_name, summary_list):
    """
    Train a softmax classifier for one target column and write its artifacts.

    Files written to the working directory:
      * ``<target_name>.h5``                   - the trained Keras model
      * ``confusion_matrix_<target_name>.svg`` - heatmap of the test-set confusion matrix
      * ``report_<target_name>.xlsx``          - test rows with actual/predicted labels

    Args:
        X: DataFrame of input features (numeric and/or object/category columns).
        y: Series of target labels sharing X's index.
        target_name: Name of the target; used in log messages and file names.
        summary_list: List mutated in place; when exactly two classes remain, a
            dict with the TP/TN/FP/FN counts of the test set is appended.

    Returns:
        None. The target is skipped (nothing is written or appended) when fewer
        than two classes remain after dropping single-sample classes.
    """
    print(f"--- Processing target: {target_name} ---")

    # --- Pre-split Data Cleaning for Stratification ---
    # A stratified split needs at least 2 members per class.
    value_counts = y.value_counts()
    single_sample_classes = value_counts[value_counts < 2].index

    if not single_sample_classes.empty:
        print(f"Warning for target '{target_name}': Removing classes with only 1 sample: {list(single_sample_classes)}")
        original_count = len(y)
        mask = ~y.isin(single_sample_classes)
        X = X[mask].copy()
        y = y[mask].copy()
        print(f"Removed {original_count - len(y)} rows.")

    if y.nunique() < 2:
        print(f"Skipping '{target_name}' because it has fewer than 2 valid classes after cleaning.\n")
        return

    # Identify features
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns

    # Scale numeric columns, one-hot encode categorical ones, pass the rest through.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Convert target to integer codes; class_names[i] is the label of code i.
    y_series = pd.Series(y).astype('category')
    y_codes = y_series.cat.codes
    class_names = y_series.cat.categories.tolist()

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_codes, test_size=0.2, random_state=42, stratify=y_codes
    )

    # Fit the preprocessing on the training split only, then apply it to both.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Handle class imbalance using SMOTE on the training data only.
    # SMOTE needs k_neighbors < size of the smallest class, hence the cap.
    min_class_samples = pd.Series(y_train).value_counts().min()
    if min_class_samples > 1:
        k_neighbors = min(5, min_class_samples - 1)
        print(f"Applying SMOTE... Using k_neighbors={k_neighbors}.")
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
    else:
        print(f"Skipping SMOTE for '{target_name}'.")
        X_train_resampled, y_train_resampled = X_train_processed, y_train

    # Keras' validation_split takes the LAST fraction of the arrays (before its
    # own shuffling) and SMOTE returns the original rows followed by the
    # synthetic ones. Without this shuffle the validation set used for early
    # stopping would consist almost entirely of synthetic minority samples.
    shuffle_order = np.random.RandomState(42).permutation(X_train_resampled.shape[0])
    if hasattr(X_train_resampled, 'iloc'):
        X_train_resampled = X_train_resampled.iloc[shuffle_order]
    else:
        X_train_resampled = X_train_resampled[shuffle_order]
    y_train_resampled = np.asarray(y_train_resampled)[shuffle_order]

    # --- Build and Train Model ---
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train_resampled.shape[1],)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(class_names), activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    print("Training the model...")
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train_resampled, y_train_resampled, epochs=100, validation_split=0.2, callbacks=[early_stopping], verbose=0)

    # --- Save Model (Named by output) ---
    model_filename = f'{target_name}.h5'
    model.save(model_filename)
    print(f"Model saved as {model_filename}")

    # --- Evaluate Model and Create Confusion Matrix ---
    y_pred = np.argmax(model.predict(X_test_processed), axis=1)
    # Pass every class code explicitly so the matrix stays square even when a
    # class is absent from both the test labels and the predictions.
    all_class_labels = range(len(class_names))
    cm = confusion_matrix(y_test, y_pred, labels=all_class_labels)
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

    # --- Save Confusion Matrix Plot ---
    svg_filename = f'confusion_matrix_{target_name}.svg'
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {target_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    plt.savefig(svg_filename, format='svg')
    plt.close()
    print(f"Confusion matrix plot saved as {svg_filename}")

    # --- Add results to the summary report if classification is binary ---
    if len(class_names) == 2:
        # For a 2x2 matrix, ravel() provides [TN, FP, FN, TP]; "0"/"1" in the
        # keys refer to the class codes, i.e. class_names[0] / class_names[1].
        tn, fp, fn, tp = cm.ravel()
        summary_result = {
            'Output Name': target_name,
            'Actual 1, Predicted 1 (TP)': tp,
            'Actual 0, Predicted 0 (TN)': tn,
            'Actual 0, Predicted 1 (FP)': fp,
            'Actual 1, Predicted 0 (FN)': fn
        }
        summary_list.append(summary_result)
        print(f"✅ Added binary classification results for '{target_name}' to summary.")
    else:
        print(f"ℹ️ Skipping summary for '{target_name}' (not a binary classification).")

    # --- Save detailed individual report ---
    report_filename = f'report_{target_name}.xlsx'
    results_df = X_test.copy()
    results_df['actual_outcome'] = y.loc[X_test.index]
    results_df['predicted_outcome'] = [class_names[i] for i in y_pred]
    results_df.to_excel(report_filename, sheet_name='Test_Inputs_and_Predictions')
    print(f"Detailed report saved as {report_filename}")
    print("-" * 40 + "\n")


# --- Main Execution ---
if __name__ == "__main__":
    # Entry point: load the CSV, derive numeric date/time features, train one
    # model per target and write a consolidated summary for binary targets.
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        # `exit()` is an interactive-shell helper injected by `site`; raising
        # SystemExit stops execution with a non-zero status everywhere.
        raise SystemExit(1)

    # Rows with any missing target value are discarded up front.
    df.dropna(subset=OUTPUT_COLUMNS, inplace=True)

    # Process date/time columns
    processed_input_cols = INPUT_COLUMNS.copy()
    for col in ['incident_date', 'incident_time']:
        if col in df.columns and col in processed_input_cols:
            # Unparseable values become NaT and are dropped further down.
            df[col] = pd.to_datetime(df[col], errors='coerce')
            df[f'{col}_year'] = df[col].dt.year
            df[f'{col}_month'] = df[col].dt.month
            df[f'{col}_day'] = df[col].dt.day
            new_cols = [f'{col}_year', f'{col}_month', f'{col}_day']
            if col == 'incident_time':
                df[f'{col}_hour'] = df[col].dt.hour
                new_cols.append(f'{col}_hour')
            processed_input_cols.remove(col)
            processed_input_cols.extend(new_cols)

    # Restrict to columns that exist BEFORE calling dropna: dropna raises
    # KeyError for unknown subset labels, which would make this filter moot.
    final_input_cols = [col for col in processed_input_cols if col in df.columns]
    missing_input_cols = [col for col in processed_input_cols if col not in df.columns]
    if missing_input_cols:
        print(f"Warning: input columns not found in the data and ignored: {missing_input_cols}")

    df.dropna(subset=final_input_cols, inplace=True)
    X = df[final_input_cols]

    # --- Initialize a list to hold summary results for binary models ---
    classification_summary_data = []

    # Loop through each target variable and train a model
    for target in OUTPUT_COLUMNS:
        if df[target].nunique() < 2:
            print(f"Skipping '{target}' because it has less than 2 unique values.")
            continue
        y = df[target]
        train_and_evaluate_model(X, y, target, classification_summary_data)

    # --- Save the consolidated binary classification summary to one Excel file ---
    if classification_summary_data:
        summary_df = pd.DataFrame(classification_summary_data)
        summary_filename = 'binary_classification_summary.xlsx'
        summary_df.to_excel(summary_filename, index=False)
        print(f"✅ All models trained. Binary summary saved to '{summary_filename}'.")
    else:
        print("✅ All models trained. No binary classification tasks were run, so no summary file was created.")

--- Processing target: incident_type ---
Removed 1 rows.
Applying SMOTE... Using k_neighbors=5.
Training the model...


  saving_api.save_model(


Model saved as incident_type.h5
Confusion matrix plot saved as confusion_matrix_incident_type.svg
✅ Added binary classification results for 'incident_type' to summary.
Detailed report saved as report_incident_type.xlsx
----------------------------------------

--- Processing target: incident_mechanism_1 ---
Removed 3 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...


  saving_api.save_model(


Model saved as incident_mechanism_1.h5
Confusion matrix plot saved as confusion_matrix_incident_mechanism_1.svg
ℹ️ Skipping summary for 'incident_mechanism_1' (not a binary classification).
Detailed report saved as report_incident_mechanism_1.xlsx
----------------------------------------

--- Processing target: incident_mechanism_2 ---
Removed 5 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...
Model saved as incident_mechanism_2.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_mechanism_2.svg
ℹ️ Skipping summary for 'incident_mechanism_2' (not a binary classification).
Detailed report saved as report_incident_mechanism_2.xlsx
----------------------------------------

--- Processing target: incident_mechanism_3 ---
Removed 7 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...
Model saved as incident_mechanism_3.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_mechanism_3.svg
ℹ️ Skipping summary for 'incident_mechanism_3' (not a binary classification).
Detailed report saved as report_incident_mechanism_3.xlsx
----------------------------------------

--- Processing target: eap_enacted_y_n_due_to_incident ---
Applying SMOTE... Using k_neighbors=1.
Training the model...
Model saved as eap_enacted_y_n_due_to_incident.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_eap_enacted_y_n_due_to_incident.svg
ℹ️ Skipping summary for 'eap_enacted_y_n_due_to_incident' (not a binary classification).
Detailed report saved as report_eap_enacted_y_n_due_to_incident.xlsx
----------------------------------------

--- Processing target: fatalities_number ---
Removed 14 rows.
Skipping SMOTE for 'fatalities_number'.
Training the model...
Model saved as fatalities_number.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_fatalities_number.svg
ℹ️ Skipping summary for 'fatalities_number' (not a binary classification).
Detailed report saved as report_fatalities_number.xlsx
----------------------------------------

--- Processing target: number_of_people_evacuated ---
Removed 21 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...
Model saved as number_of_people_evacuated.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_number_of_people_evacuated.svg
ℹ️ Skipping summary for 'number_of_people_evacuated' (not a binary classification).
Detailed report saved as report_number_of_people_evacuated.xlsx
----------------------------------------

--- Processing target: number_of_habitable_structures_evacuated ---
Removed 5 rows.
Skipping SMOTE for 'number_of_habitable_structures_evacuated'.
Training the model...



KeyboardInterrupt



In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import os

# Try to import imblearn
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("Error: The 'imbalanced-learn' library is required but not installed.")
    print("Please install it by running the following command in your terminal:")
    print("pip install imbalanced-learn")
    # `exit()` is an interactive-shell helper injected by `site`; raising
    # SystemExit stops execution with a non-zero status everywhere.
    raise SystemExit(1)


# --- Configuration ---
DATA_FILE = '../encoded_dam_data.csv'  # Make sure this path is correct

# Columns selected from the data file as model inputs.
INPUT_COLUMNS = [
    'state',
    'downstream_hazard_potential',
    'incident_date',
    'incident_time',
    'incident_driver',
    'owner_type',
    'dam_type',
    'primary_purpose_s',
    'eap',
    'dam_height',
    'max_storage_ac_ft',
    'surface_area_acres',
    'year_completed',
    'latitude',
    'longitude',
    'year_modified',
]

# Target columns; the main loop trains one model per entry.
OUTPUT_COLUMNS = [
    'incident_type',
    'incident_mechanism_1',
    'incident_mechanism_2',
    'incident_mechanism_3',
    'eap_enacted_y_n_due_to_incident',
    'fatalities_number',
    'number_of_people_evacuated',
    'number_of_habitable_structures_evacuated',
    'number_of_habitable_structures_flooded',
    'other_infrastructure_impacts',
    'response',
    'volume_released_at_failure_ac_ft',
    'incident_duration',
    'incident_report_produced',
]

# --- Main Processing Function ---
def train_and_evaluate_model(X, y, target_name, summary_list):
    """
    Train a softmax classifier for one target column and write its artifacts.

    Files written to the working directory:
      * ``<target_name>.h5``                   - the trained Keras model
      * ``confusion_matrix_<target_name>.svg`` - heatmap of the test-set confusion matrix

    Args:
        X: DataFrame of input features (numeric and/or object/category columns).
        y: Series of target labels sharing X's index.
        target_name: Name of the target; used in log messages and file names.
        summary_list: List mutated in place; when exactly two classes remain, a
            dict with the TP/TN/FP/FN counts of the test set is appended.

    Returns:
        None. The target is skipped (nothing is written or appended) when fewer
        than two classes remain after dropping single-sample classes.
    """
    print(f"--- Processing target: {target_name} ---")

    # --- Pre-split Data Cleaning ---
    # A stratified split needs at least 2 members per class.
    value_counts = y.value_counts()
    single_sample_classes = value_counts[value_counts < 2].index

    if not single_sample_classes.empty:
        print(f"Warning for target '{target_name}': Removing classes with only 1 sample: {list(single_sample_classes)}")
        original_count = len(y)
        mask = ~y.isin(single_sample_classes)
        X = X[mask].copy()
        y = y[mask].copy()
        print(f"Removed {original_count - len(y)} rows.")

    if y.nunique() < 2:
        print(f"Skipping '{target_name}' because it has fewer than 2 valid classes after cleaning.\n")
        return

    # Identify features
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns

    # Scale numeric columns, one-hot encode categorical ones, pass the rest through.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Target encoding: class_names[i] is the label of integer code i.
    y_series = pd.Series(y).astype('category')
    y_codes = y_series.cat.codes
    class_names = y_series.cat.categories.tolist()

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_codes, test_size=0.2, random_state=42, stratify=y_codes
    )

    # Fit the preprocessing on the training split only, then apply it to both.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Handle imbalance with SMOTE on the training data only.
    # SMOTE needs k_neighbors < size of the smallest class, hence the cap.
    min_class_samples = pd.Series(y_train).value_counts().min()
    if min_class_samples > 1:
        k_neighbors = min(5, min_class_samples - 1)
        print(f"Applying SMOTE... Using k_neighbors={k_neighbors}.")
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
    else:
        print(f"Skipping SMOTE for '{target_name}'.")
        X_train_resampled, y_train_resampled = X_train_processed, y_train

    # Keras' validation_split takes the LAST fraction of the arrays (before its
    # own shuffling) and SMOTE returns the original rows followed by the
    # synthetic ones. Without this shuffle the validation set used for early
    # stopping would consist almost entirely of synthetic minority samples.
    shuffle_order = np.random.RandomState(42).permutation(X_train_resampled.shape[0])
    if hasattr(X_train_resampled, 'iloc'):
        X_train_resampled = X_train_resampled.iloc[shuffle_order]
    else:
        X_train_resampled = X_train_resampled[shuffle_order]
    y_train_resampled = np.asarray(y_train_resampled)[shuffle_order]

    # --- Build Model ---
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train_resampled.shape[1],)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(class_names), activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    print("Training the model...")
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train_resampled, y_train_resampled, epochs=100,
              validation_split=0.2, callbacks=[early_stopping], verbose=0)

    # --- Save Model by Output Name ---
    model_filename = f'{target_name}.h5'
    model.save(model_filename)
    print(f"Model saved as {model_filename}")

    # --- Evaluate Model ---
    y_pred = np.argmax(model.predict(X_test_processed), axis=1)
    # Pass every class code explicitly so the matrix stays square even when a
    # class is absent from both the test labels and the predictions.
    cm = confusion_matrix(y_test, y_pred, labels=range(len(class_names)))
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

    # Save confusion matrix plot
    svg_filename = f'confusion_matrix_{target_name}.svg'
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {target_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    plt.savefig(svg_filename, format='svg')
    plt.close()
    print(f"Confusion matrix plot saved as {svg_filename}")

    # --- Save Binary Summary ---
    if len(class_names) == 2:
        # For a 2x2 matrix, ravel() yields [TN, FP, FN, TP]; "0"/"1" in the
        # keys refer to the class codes, i.e. class_names[0] / class_names[1].
        tn, fp, fn, tp = cm.ravel()
        summary_list.append({
            'Output Name': target_name,
            'Actual 1, Predicted 1 (TP)': tp,
            'Actual 0, Predicted 0 (TN)': tn,
            'Actual 0, Predicted 1 (FP)': fp,
            'Actual 1, Predicted 0 (FN)': fn
        })
        print(f"✅ Binary results for '{target_name}' added.")
    else:
        print(f"ℹ️ Skipping summary for '{target_name}' (not binary).")


# --- Main Execution ---
if __name__ == "__main__":
    # Entry point: load the CSV, derive numeric date/time features, train one
    # model per target and write a consolidated summary for binary targets.
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        # `exit()` is an interactive-shell helper injected by `site`; raising
        # SystemExit stops execution with a non-zero status everywhere.
        raise SystemExit(1)

    # Rows with any missing target value are discarded up front.
    df.dropna(subset=OUTPUT_COLUMNS, inplace=True)

    # Process date/time
    processed_input_cols = INPUT_COLUMNS.copy()
    for col in ['incident_date', 'incident_time']:
        if col in df.columns and col in processed_input_cols:
            # Unparseable values become NaT and are dropped further down.
            df[col] = pd.to_datetime(df[col], errors='coerce')
            df[f'{col}_year'] = df[col].dt.year
            df[f'{col}_month'] = df[col].dt.month
            df[f'{col}_day'] = df[col].dt.day
            new_cols = [f'{col}_year', f'{col}_month', f'{col}_day']
            if col == 'incident_time':
                df[f'{col}_hour'] = df[col].dt.hour
                new_cols.append(f'{col}_hour')
            processed_input_cols.remove(col)
            processed_input_cols.extend(new_cols)

    # Restrict to columns that exist BEFORE calling dropna: dropna raises
    # KeyError for unknown subset labels, which would make this filter moot.
    final_input_cols = [col for col in processed_input_cols if col in df.columns]
    missing_input_cols = [col for col in processed_input_cols if col not in df.columns]
    if missing_input_cols:
        print(f"Warning: input columns not found in the data and ignored: {missing_input_cols}")

    df.dropna(subset=final_input_cols, inplace=True)
    X = df[final_input_cols]

    # Summary results
    classification_summary_data = []

    # Train/evaluate for each output
    for target in OUTPUT_COLUMNS:
        if df[target].nunique() < 2:
            print(f"Skipping '{target}' (less than 2 unique values).")
            continue
        y = df[target]
        train_and_evaluate_model(X, y, target, classification_summary_data)

    # Save consolidated summary
    if classification_summary_data:
        summary_df = pd.DataFrame(classification_summary_data)
        summary_filename = 'binary_classification_summary.xlsx'
        summary_df.to_excel(summary_filename, index=False)
        print(f"✅ All binary summaries saved to '{summary_filename}'.")
    else:
        print("✅ No binary outputs, no summary file created.")


--- Processing target: incident_type ---
Removed 1 rows.
Applying SMOTE... Using k_neighbors=5.
Training the model...
Model saved as incident_type.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_type.svg
✅ Binary results for 'incident_type' added.
--- Processing target: incident_mechanism_1 ---
Removed 3 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...
Model saved as incident_mechanism_1.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_mechanism_1.svg
ℹ️ Skipping summary for 'incident_mechanism_1' (not binary).
--- Processing target: incident_mechanism_2 ---
Removed 5 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...
Model saved as incident_mechanism_2.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_mechanism_2.svg
ℹ️ Skipping summary for 'incident_mechanism_2' (not binary).
--- Processing target: incident_mechanism_3 ---
Removed 7 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...
Model saved as incident_mechanism_3.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_incident_mechanism_3.svg
ℹ️ Skipping summary for 'incident_mechanism_3' (not binary).
--- Processing target: eap_enacted_y_n_due_to_incident ---
Applying SMOTE... Using k_neighbors=1.
Training the model...
Model saved as eap_enacted_y_n_due_to_incident.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_eap_enacted_y_n_due_to_incident.svg
ℹ️ Skipping summary for 'eap_enacted_y_n_due_to_incident' (not binary).
--- Processing target: fatalities_number ---
Removed 14 rows.
Skipping SMOTE for 'fatalities_number'.
Training the model...
Model saved as fatalities_number.h5


  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_fatalities_number.svg
ℹ️ Skipping summary for 'fatalities_number' (not binary).
--- Processing target: number_of_people_evacuated ---
Removed 21 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...
Model saved as number_of_people_evacuated.h5
1/9 [==>...........................] - ETA: 0s

  saving_api.save_model(


Confusion matrix plot saved as confusion_matrix_number_of_people_evacuated.svg
ℹ️ Skipping summary for 'number_of_people_evacuated' (not binary).
--- Processing target: number_of_habitable_structures_evacuated ---
Removed 5 rows.
Skipping SMOTE for 'number_of_habitable_structures_evacuated'.
Training the model...



KeyboardInterrupt

