In [None]:
# The first classifier model: it works correctly but does not give us the result format we want.

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import os

# Try to import imblearn, provide install instructions if it fails
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("Error: The 'imbalanced-learn' library is required but not installed.")
    print("Please install it by running the following command in your terminal:")
    print("pip install imbalanced-learn")
    exit()


# --- Configuration ---
# Location of the pre-processed classifier data (CSV), given relative to the
# working directory.
DATA_FILE = '../classifier_data.csv'

# Every column that may serve as a classification target. One model is trained
# per entry found in the data file; all remaining columns are model inputs.
CLASSIFIER_TARGET_COLUMNS = [
    'incident_type',
    'incident_mechanism_1',
    'incident_mechanism_2',
    'incident_mechanism_3',
    'eap_enacted_y_n_due_to_incident',
    'fatalities_number',
    'other_infrastructure_impacts',
    'response',
    'incident_report_produced',
]

# --- Main Processing Function (No changes needed here) ---
def train_and_evaluate_model(X, y, target_name, summary_list):
    """
    Train a neural-network classifier for one target and write its evaluation files.

    Classes with a single sample are removed, the features are scaled / one-hot
    encoded, the training split is oversampled with SMOTE when every class has
    at least two training rows, and a small dense network is trained with early
    stopping. The following files are written to the current working directory:

    * ``{target_name}.h5`` - the trained Keras model,
    * ``confusion_matrix_{target_name}.svg`` - heatmap of the test confusion matrix,
    * ``report_{target_name}.xlsx`` - test inputs with actual and predicted labels.

    Args:
        X: DataFrame of input features, row-aligned (same index) with ``y``.
        y: Series with the target label of each row.
        target_name: Name of the target; used in log messages and file names.
        summary_list: List that is appended to in place with one dict of
            TP/TN/FP/FN counts when the target has exactly two classes.

    Returns:
        None. Returns early, without training or writing files, when fewer than
        two classes remain after the single-sample classes are removed.
    """
    print(f"--- Processing target: {target_name} ---")

    # --- Pre-split Data Cleaning for Stratification ---
    # A stratified split needs at least two rows per class, so classes that
    # occur only once are dropped before splitting.
    value_counts = y.value_counts()
    single_sample_classes = value_counts[value_counts < 2].index

    if not single_sample_classes.empty:
        print(f"Warning for target '{target_name}': Removing classes with only 1 sample: {list(single_sample_classes)}")
        original_count = len(y)
        mask = ~y.isin(single_sample_classes)
        X = X[mask].copy()
        y = y[mask].copy()
        print(f"Removed {original_count - len(y)} rows.")

    if y.nunique() < 2:
        print(f"Skipping '{target_name}' because it has fewer than 2 valid classes after cleaning.\n")
        return

    # Identify features
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns

    # Create preprocessor. sparse_threshold=0 forces a dense array: one-hot
    # encoding can otherwise make the transformer return a SciPy sparse matrix,
    # which the validation_split used during training is not guaranteed to accept.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough',
        sparse_threshold=0
    )

    # Convert target to integer codes; class_names[code] maps a code back to its label.
    y_series = pd.Series(y).astype('category')
    y_codes = y_series.cat.codes
    class_names = y_series.cat.categories.tolist()

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y_codes, test_size=0.2, random_state=42, stratify=y_codes)

    # Preprocess data (fit on the training split only to avoid leaking test statistics)
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Handle class imbalance using SMOTE. SMOTE needs k_neighbors < samples in
    # the smallest class, so it is skipped when a class has a single training row.
    min_class_samples = pd.Series(y_train).value_counts().min()
    if min_class_samples > 1:
        k_neighbors = int(min(5, min_class_samples - 1))
        print(f"Applying SMOTE... Using k_neighbors={k_neighbors}.")
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
    else:
        print(f"Skipping SMOTE for '{target_name}'.")
        X_train_resampled, y_train_resampled = X_train_processed, y_train

    # Keras carves the validation split from the *last* rows, before any
    # shuffling. SMOTE places its synthetic rows after the original ones, so
    # without this shuffle early stopping would monitor a validation set made
    # up largely of synthetic minority samples.
    shuffled_order = np.random.default_rng(42).permutation(X_train_resampled.shape[0])
    X_train_resampled = X_train_resampled[shuffled_order]
    y_train_resampled = np.asarray(y_train_resampled)[shuffled_order]

    # --- Build and Train Model ---
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train_resampled.shape[1],)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(class_names), activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    print("Training the model...")
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train_resampled, y_train_resampled, epochs=100, validation_split=0.2, callbacks=[early_stopping], verbose=0)

    # --- Save Model (Named by output) ---
    model_filename = f'{target_name}.h5'
    model.save(model_filename)
    print(f"Model saved as {model_filename}")

    # --- Evaluate Model and Create Confusion Matrix ---
    y_pred = np.argmax(model.predict(X_test_processed), axis=1)
    # Pass every class code explicitly so the matrix keeps one row/column per
    # class even when a class is absent from the test split or the predictions.
    all_class_labels = range(len(class_names))
    cm = confusion_matrix(y_test, y_pred, labels=all_class_labels)
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

    # --- Save Confusion Matrix Plot ---
    svg_filename = f'confusion_matrix_{target_name}.svg'
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {target_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    plt.savefig(svg_filename, format='svg')
    plt.close()
    print(f"Confusion matrix plot saved as {svg_filename}")

    # --- Add results to the summary report if classification is binary ---
    if len(class_names) == 2:
        # For a 2x2 matrix with labels [0, 1], ravel() yields TN, FP, FN, TP;
        # "1" is the second entry of class_names.
        tn, fp, fn, tp = cm.ravel()
        summary_result = {
            'Output Name': target_name,
            'Actual 1, Predicted 1 (TP)': tp,
            'Actual 0, Predicted 0 (TN)': tn,
            'Actual 0, Predicted 1 (FP)': fp,
            'Actual 1, Predicted 0 (FN)': fn
        }
        summary_list.append(summary_result)
        print(f"✅ Added binary classification results for '{target_name}' to summary.")
    else:
        print(f"ℹ️ Skipping summary for '{target_name}' (not a binary classification).")

    # --- Save detailed individual report ---
    report_filename = f'report_{target_name}.xlsx'
    results_df = X_test.copy()
    # Look the original labels up by index so each test row gets its own label.
    results_df['actual_outcome'] = y.loc[X_test.index]
    results_df['predicted_outcome'] = [class_names[i] for i in y_pred]
    results_df.to_excel(report_filename, sheet_name='Test_Inputs_and_Predictions')
    print(f"Detailed report saved as {report_filename}")
    print("-" * 40 + "\n")


# --- Main Execution (FIXED) ---
if __name__ == "__main__":
    # Load the pre-processed data; nothing can be trained without it.
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        # exit() is a helper meant for the interactive shell and reports
        # success; raise SystemExit with a non-zero status so the failure
        # reliably stops execution and is visible to the caller.
        raise SystemExit(1) from None

    # Define the input columns dynamically by excluding all possible target columns
    # This is more robust than maintaining a separate input column list.
    input_cols = [col for col in df.columns if col not in CLASSIFIER_TARGET_COLUMNS]
    # NOTE(review): X is not read by the loop below (each target builds its own
    # filtered frame); kept in case other cells rely on it.
    X = df[input_cols]

    # --- Initialize a list to hold summary results for binary models ---
    classification_summary_data = []

    # Loop through each CLASSIFIER target variable and train a model
    for target in CLASSIFIER_TARGET_COLUMNS:
        # Check if the target column actually exists in the dataframe
        if target not in df.columns:
            print(f"Warning: Target column '{target}' not found in the data file. Skipping.")
            continue

        # Drop rows where the current target is missing
        temp_df = df.dropna(subset=[target])
        X_filtered = temp_df[input_cols]
        y = temp_df[target]

        # A classifier needs at least two distinct labels to learn from.
        if y.nunique() < 2:
            print(f"Skipping '{target}' because it has less than 2 unique values.")
            continue

        train_and_evaluate_model(X_filtered, y, target, classification_summary_data)

    # --- Save the consolidated binary classification summary to one Excel file ---
    if classification_summary_data:
        summary_df = pd.DataFrame(classification_summary_data)
        summary_filename = 'binary_classification_summary.xlsx'
        summary_df.to_excel(summary_filename, index=False)
        print(f"✅ All models trained. Binary summary saved to '{summary_filename}'.")
    else:
        print("✅ All models trained. No binary classification tasks were run, so no summary file was created.")

--- Processing target: incident_type ---
Removed 1 rows.
Applying SMOTE... Using k_neighbors=5.


Training the model...




  saving_api.save_model(


Model saved as incident_type.h5
Confusion matrix plot saved as confusion_matrix_incident_type.svg
✅ Added binary classification results for 'incident_type' to summary.
Detailed report saved as report_incident_type.xlsx
----------------------------------------

--- Processing target: incident_mechanism_1 ---
Removed 3 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...


  saving_api.save_model(


Model saved as incident_mechanism_1.h5
Confusion matrix plot saved as confusion_matrix_incident_mechanism_1.svg
ℹ️ Skipping summary for 'incident_mechanism_1' (not a binary classification).
Detailed report saved as report_incident_mechanism_1.xlsx
----------------------------------------

--- Processing target: incident_mechanism_2 ---
Removed 5 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...


  saving_api.save_model(


Model saved as incident_mechanism_2.h5



KeyboardInterrupt



Error in callback <function flush_figures at 0x000001E5CD76B920> (for post_execute):



KeyboardInterrupt



In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import os

# Try to import imblearn, provide install instructions if it fails
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("Error: The 'imbalanced-learn' library is required but not installed.")
    print("Please install it by running the following command in your terminal:")
    print("pip install imbalanced-learn")
    exit()


# --- Configuration ---
# Path to the pre-processed classifier data (CSV), relative to the working directory.
DATA_FILE = '../classifier_data.csv'

# Candidate target columns; each one present in the data gets its own model,
# and every other column is used as an input feature.
CLASSIFIER_TARGET_COLUMNS = [
    'incident_type',
    'incident_mechanism_1',
    'incident_mechanism_2',
    'incident_mechanism_3',
    'eap_enacted_y_n_due_to_incident',
    'fatalities_number',
    'other_infrastructure_impacts',
    'response',
    'incident_report_produced',
]

# --- Main Processing Function ---
def train_and_evaluate_model(X, y, target_name, summary_list, metrics_list):
    """
    Train a neural-network classifier for one target, write its evaluation
    files, and collect its performance metrics.

    Classes with a single sample are removed, the features are scaled / one-hot
    encoded, the training split is oversampled with SMOTE when every class has
    at least two training rows, and a small dense network is trained with early
    stopping. The following files are written to the current working directory:

    * ``{target_name}.h5`` - the trained Keras model,
    * ``confusion_matrix_{target_name}.svg`` - heatmap of the test confusion matrix,
    * ``report_{target_name}.xlsx`` - test inputs with actual and predicted labels.

    Args:
        X: DataFrame of input features, row-aligned (same index) with ``y``.
        y: Series with the target label of each row.
        target_name: Name of the target; used in log messages and file names.
        summary_list: List that is appended to in place with one dict of
            TP/TN/FP/FN counts when the target has exactly two classes.
        metrics_list: List that is appended to in place with one dict holding
            the test accuracy and weighted precision, recall and F1.

    Returns:
        None. Returns early, without training or touching either list, when
        fewer than two classes remain after single-sample classes are removed.
    """
    print(f"--- Processing target: {target_name} ---")

    # --- Pre-split Data Cleaning ---
    # A stratified split needs at least two rows per class, so classes that
    # occur only once are dropped before splitting.
    value_counts = y.value_counts()
    single_sample_classes = value_counts[value_counts < 2].index

    if not single_sample_classes.empty:
        print(f"Warning for target '{target_name}': Removing classes with only 1 sample: {list(single_sample_classes)}")
        original_count = len(y)
        mask = ~y.isin(single_sample_classes)
        X = X[mask].copy()
        y = y[mask].copy()
        # Report the number of rows actually dropped, not the number of classes.
        print(f"Removed {original_count - len(y)} rows.")

    if y.nunique() < 2:
        print(f"Skipping '{target_name}' because it has fewer than 2 valid classes.\n")
        return

    # --- Preprocessing ---
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    numerical_features = X.select_dtypes(include=np.number).columns
    # sparse_threshold=0 forces a dense array: one-hot encoding can otherwise
    # make the transformer return a SciPy sparse matrix, which the
    # validation_split used during training is not guaranteed to accept.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough',
        sparse_threshold=0
    )
    # Integer codes for the labels; class_names[code] maps a code back to its label.
    y_series = pd.Series(y).astype('category')
    y_codes = y_series.cat.codes
    class_names = y_series.cat.categories.tolist()
    X_train, X_test, y_train, y_test = train_test_split(X, y_codes, test_size=0.2, random_state=42, stratify=y_codes)
    # Fit on the training split only to avoid leaking test statistics.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # --- SMOTE for Imbalanced Data ---
    # SMOTE needs k_neighbors < samples in the smallest class, so it is skipped
    # when a class has a single training row.
    min_class_samples = pd.Series(y_train).value_counts().min()
    if min_class_samples > 1:
        k_neighbors = int(min(5, min_class_samples - 1))
        print(f"Applying SMOTE... Using k_neighbors={k_neighbors}.")
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
    else:
        print(f"Skipping SMOTE for '{target_name}'.")
        X_train_resampled, y_train_resampled = X_train_processed, y_train

    # Keras carves the validation split from the *last* rows, before any
    # shuffling. SMOTE places its synthetic rows after the original ones, so
    # without this shuffle early stopping would monitor a validation set made
    # up largely of synthetic minority samples.
    shuffled_order = np.random.default_rng(42).permutation(X_train_resampled.shape[0])
    X_train_resampled = X_train_resampled[shuffled_order]
    y_train_resampled = np.asarray(y_train_resampled)[shuffled_order]

    # --- Build and Train Model ---
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train_resampled.shape[1],)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(class_names), activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    print("Training the model...")
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train_resampled, y_train_resampled, epochs=100, validation_split=0.2, callbacks=[early_stopping], verbose=0)
    model_filename = f'{target_name}.h5'
    model.save(model_filename)
    print(f"Model saved as {model_filename}")

    # --- Evaluation ---
    y_pred = np.argmax(model.predict(X_test_processed), axis=1)

    # --- Calculate and Collect Performance Metrics ---
    accuracy = accuracy_score(y_test, y_pred)
    # Use 'weighted' average for multi-class precision, recall, and F1;
    # zero_division=0 scores classes that were never predicted as 0 instead of warning.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    metrics_result = {
        'Model Output': target_name,
        'Accuracy': accuracy,
        'Precision (Weighted)': precision,
        'Recall (Weighted)': recall,
        'F1-Score (Weighted)': f1
    }
    metrics_list.append(metrics_result)
    print(f"✅ Performance metrics for '{target_name}' collected.")

    # --- Generate Other Reports ---
    # Pass every class code explicitly so the matrix keeps one row/column per
    # class even when a class is absent from the test split or the predictions.
    all_class_labels = range(len(class_names))
    cm = confusion_matrix(y_test, y_pred, labels=all_class_labels)
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

    svg_filename = f'confusion_matrix_{target_name}.svg'
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {target_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    plt.savefig(svg_filename, format='svg')
    plt.close()
    print(f"Confusion matrix plot saved as {svg_filename}")

    # Add results to the binary summary report
    if len(class_names) == 2:
        # For a 2x2 matrix with labels [0, 1], ravel() yields TN, FP, FN, TP;
        # "1" is the second entry of class_names.
        tn, fp, fn, tp = cm.ravel()
        summary_result = {
            'Output Name': target_name,
            'Actual 1, Predicted 1 (TP)': tp,
            'Actual 0, Predicted 0 (TN)': tn,
            'Actual 0, Predicted 1 (FP)': fp,
            'Actual 1, Predicted 0 (FN)': fn
        }
        summary_list.append(summary_result)
        print(f"✅ Binary classification results for '{target_name}' added to summary.")

    # Save detailed individual report
    report_filename = f'report_{target_name}.xlsx'
    results_df = X_test.copy()
    # Look the original labels up by index so each test row gets its own label.
    results_df['actual_outcome'] = y.loc[X_test.index]
    results_df['predicted_outcome'] = [class_names[i] for i in y_pred]
    results_df.to_excel(report_filename, sheet_name='Test_Inputs_and_Predictions')
    print(f"Detailed report saved as {report_filename}")
    print("-" * 40 + "\n")


# --- Main Execution ---
if __name__ == "__main__":
    # Load the pre-processed data; nothing can be trained without it.
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        # exit() is a helper meant for the interactive shell and reports
        # success; raise SystemExit with a non-zero status so the failure
        # reliably stops execution and is visible to the caller.
        raise SystemExit(1) from None

    # Every column that is not a known target is treated as a model input.
    input_cols = [col for col in df.columns if col not in CLASSIFIER_TARGET_COLUMNS]
    # NOTE(review): X is not read by the loop below (each target builds its own
    # filtered frame); kept in case other cells rely on it.
    X = df[input_cols]

    # Initialize lists to hold summary data from all models
    binary_summary_data = []
    model_metrics_data = []  # one dict of performance metrics per trained model

    # Loop through each CLASSIFIER target variable
    for target in CLASSIFIER_TARGET_COLUMNS:
        if target not in df.columns:
            print(f"Warning: Target column '{target}' not found. Skipping.")
            continue

        # Drop rows where the current target is missing
        temp_df = df.dropna(subset=[target])
        X_filtered = temp_df[input_cols]
        y = temp_df[target]

        # A classifier needs at least two distinct labels to learn from.
        if y.nunique() < 2:
            print(f"Skipping '{target}' because it has less than 2 unique values.")
            continue

        train_and_evaluate_model(X_filtered, y, target, binary_summary_data, model_metrics_data)

    # --- Save Consolidated Reports ---
    if binary_summary_data:
        summary_df = pd.DataFrame(binary_summary_data)
        summary_filename = 'binary_classification_summary.xlsx'
        summary_df.to_excel(summary_filename, index=False)
        print(f"✅ Binary summary saved to '{summary_filename}'.")
    else:
        print("ℹ️ No binary classification tasks were run, so no binary summary file was created.")

    # Save the consolidated performance metrics to one Excel file
    if model_metrics_data:
        metrics_df = pd.DataFrame(model_metrics_data)
        metrics_filename = 'model_performance_metrics.xlsx'
        metrics_df.to_excel(metrics_filename, index=False)
        print(f"✅ Performance metrics for all models saved to '{metrics_filename}'.")

    print("\nAll tasks complete.")

--- Processing target: incident_type ---
Removed 1 rows.
Applying SMOTE... Using k_neighbors=5.
Training the model...


  saving_api.save_model(


Model saved as incident_type.h5
✅ Performance metrics for 'incident_type' collected.
Confusion matrix plot saved as confusion_matrix_incident_type.svg
✅ Binary classification results for 'incident_type' added to summary.
Detailed report saved as report_incident_type.xlsx
----------------------------------------

--- Processing target: incident_mechanism_1 ---
Removed 3 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...


  saving_api.save_model(


Model saved as incident_mechanism_1.h5
✅ Performance metrics for 'incident_mechanism_1' collected.
Confusion matrix plot saved as confusion_matrix_incident_mechanism_1.svg
Detailed report saved as report_incident_mechanism_1.xlsx
----------------------------------------

--- Processing target: incident_mechanism_2 ---
Removed 5 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...


  saving_api.save_model(


Model saved as incident_mechanism_2.h5
✅ Performance metrics for 'incident_mechanism_2' collected.
Confusion matrix plot saved as confusion_matrix_incident_mechanism_2.svg
Detailed report saved as report_incident_mechanism_2.xlsx
----------------------------------------

--- Processing target: incident_mechanism_3 ---
Removed 7 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...


  saving_api.save_model(


Model saved as incident_mechanism_3.h5
✅ Performance metrics for 'incident_mechanism_3' collected.
Confusion matrix plot saved as confusion_matrix_incident_mechanism_3.svg
Detailed report saved as report_incident_mechanism_3.xlsx
----------------------------------------

--- Processing target: eap_enacted_y_n_due_to_incident ---
Applying SMOTE... Using k_neighbors=1.
Training the model...


  saving_api.save_model(


Model saved as eap_enacted_y_n_due_to_incident.h5
✅ Performance metrics for 'eap_enacted_y_n_due_to_incident' collected.
Confusion matrix plot saved as confusion_matrix_eap_enacted_y_n_due_to_incident.svg
Detailed report saved as report_eap_enacted_y_n_due_to_incident.xlsx
----------------------------------------

--- Processing target: fatalities_number ---
Removed 14 rows.
Skipping SMOTE for 'fatalities_number'.
Training the model...


  saving_api.save_model(


Model saved as fatalities_number.h5
✅ Performance metrics for 'fatalities_number' collected.
Confusion matrix plot saved as confusion_matrix_fatalities_number.svg
Detailed report saved as report_fatalities_number.xlsx
----------------------------------------

--- Processing target: other_infrastructure_impacts ---
Removed 7 rows.
Applying SMOTE... Using k_neighbors=1.
Training the model...


  saving_api.save_model(


Model saved as other_infrastructure_impacts.h5
✅ Performance metrics for 'other_infrastructure_impacts' collected.
Confusion matrix plot saved as confusion_matrix_other_infrastructure_impacts.svg
Detailed report saved as report_other_infrastructure_impacts.xlsx
----------------------------------------

--- Processing target: response ---
Removed 204 rows.
Skipping SMOTE for 'response'.
Training the model...


  saving_api.save_model(


Model saved as response.h5
✅ Performance metrics for 'response' collected.
Confusion matrix plot saved as confusion_matrix_response.svg
Detailed report saved as report_response.xlsx
----------------------------------------

--- Processing target: incident_report_produced ---
Removed 1 rows.
Applying SMOTE... Using k_neighbors=3.
Training the model...


  saving_api.save_model(


Model saved as incident_report_produced.h5
✅ Performance metrics for 'incident_report_produced' collected.
Confusion matrix plot saved as confusion_matrix_incident_report_produced.svg
Detailed report saved as report_incident_report_produced.xlsx
----------------------------------------

✅ Binary summary saved to 'binary_classification_summary.xlsx'.
✅ Performance metrics for all models saved to 'model_performance_metrics.xlsx'.

All tasks complete.
