# Disaster Classification using Neural Network and XGBoost

This notebook demonstrates a machine learning pipeline that uses a deep neural network for feature extraction and XGBoost for the final classification. It focuses on classifying disaster types (Wildfire, Flood, Earthquake) using the EM-DAT dataset.

## Step 1: Data Preprocessing

In [None]:
# Step 1: Data Loading, Cleaning, Encoding, Scaling, Balancing
# Run this script first
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE

# --- Load the spreadsheet and keep only the three target disaster types ------
file_path = "E:/Chula University/Disaster Paper/emdat.xlsx"
data = pd.read_excel(file_path)

relevant_disasters = ["Wildfire", "Flood", "Earthquake"]
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `data` (avoids pandas' SettingWithCopyWarning).
filtered_data = data[data["Disaster Type"].isin(relevant_disasters)].copy()

num_features = filtered_data.select_dtypes(include=["float64", "int64"]).columns
cat_features = filtered_data.select_dtypes(include=["object"]).columns

# --- Choose the held-out rows BEFORE fitting anything -------------------------
# The split is made on row positions so that the imputers, the scaler and SMOTE
# are fitted on training rows only; otherwise statistics of the test rows (and
# synthetic neighbours of them) leak into training and inflate the test score.
from sklearn.model_selection import train_test_split
row_positions = np.arange(len(filtered_data))
train_pos, test_pos = train_test_split(
    row_positions,
    test_size=0.2,
    random_state=42,
    stratify=filtered_data["Disaster Type"],  # keep class proportions in both splits
)

# --- Impute missing values (statistics learned from the training rows) --------
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(filtered_data.iloc[train_pos][num_features])
filtered_data[num_features] = num_imputer.transform(filtered_data[num_features])
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(filtered_data.iloc[train_pos][cat_features])
filtered_data[cat_features] = cat_imputer.transform(filtered_data[cat_features])

# --- Integer-encode every categorical column, including the target ------------
# The encoders are fitted on all rows: they only define a value -> code mapping,
# and fitting on the training rows alone would fail on categories that appear
# only in the held-out rows.
label_encoders = {}
for col in cat_features:
    label_encoders[col] = LabelEncoder()
    filtered_data[col] = label_encoders[col].fit_transform(filtered_data[col])

# --- Standardise numeric columns (mean/std learned from the training rows) ----
scaler = StandardScaler()
scaler.fit(filtered_data.iloc[train_pos][num_features])
filtered_data[num_features] = scaler.transform(filtered_data[num_features])

# NOTE(review): every remaining column is used as a feature. Verify that none of
# them directly encodes the disaster type (e.g. subtype / classification
# columns) -- this cannot be confirmed from the code alone.
X = filtered_data.drop("Disaster Type", axis=1)
y = filtered_data["Disaster Type"]

X_train, y_train = X.iloc[train_pos], y.iloc[train_pos]
X_test, y_test = X.iloc[test_pos], y.iloc[test_pos]

# --- Balance the classes on the training split only ---------------------------
# The test split is left untouched so it contains only real observations.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

import torch
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

# Save for later steps. The tuple layout is unchanged (8 items, same order);
# `y_resampled` now holds the oversampled *training* labels.
torch.save((X_train_tensor, y_train_tensor, X_test_tensor, X_train, X_test, y_train, y_test, y_resampled), "processed_data.pt")

## Step 2: Neural Network Feature Extractor

In [None]:
# Step 2: Train Neural Network Feature Extractor
# Run after 1_data_preprocessing.py
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm import tqdm

# Load the artefacts written by Step 1. Besides tensors, the file holds pickled
# pandas objects (X_train, X_test, y_train, y_test, y_resampled), which the
# restricted loader used by default in recent PyTorch releases
# (weights_only=True) refuses to unpickle. weights_only=False performs a full
# unpickle, so only use it on a file you produced yourself.
X_train_tensor, y_train_tensor, X_test_tensor, X_train, X_test, y_train, y_test, y_resampled = torch.load("processed_data.pt", weights_only=False)

class EnhancedFeatureExtractor(nn.Module):
    """Two-hidden-layer MLP that maps input features to an embedding.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout (dropout
    probabilities 0.3 and 0.4); a final Linear layer projects to an embedding
    of width ``hidden_size // 2``.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        embedding_size = hidden_size // 2

        # Hidden stage 1
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.act1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)

        # Hidden stage 2
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.act2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.4)

        # Output projection to the embedding space
        self.fc3 = nn.Linear(hidden_size, embedding_size)

    def forward(self, x):
        """Return the embedding for a batch ``x`` of shape (batch, input_size)."""
        hidden_stages = (
            (self.fc1, self.bn1, self.act1, self.dropout1),
            (self.fc2, self.bn2, self.act2, self.dropout2),
        )
        hidden = x
        for linear, norm, activation, dropout in hidden_stages:
            hidden = dropout(activation(norm(linear(hidden))))
        return self.fc3(hidden)

# --- Model ---------------------------------------------------------------------
hidden_size = 128
feature_extractor = EnhancedFeatureExtractor(input_size=X_train_tensor.shape[1], hidden_size=hidden_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
feature_extractor.to(device)

# Classification head used only to supervise the extractor during training.
# It is created ONCE and optimised together with the extractor; building a new
# randomly initialised nn.Linear inside the batch loop would mean the head is
# never trained and the extractor chases a different random projection at
# every step.
num_classes = len(np.unique(y_train_tensor.cpu().numpy()))
classifier_head = nn.Linear(hidden_size // 2, num_classes).to(device)

# --- Optimisation setup ----------------------------------------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    list(feature_extractor.parameters()) + list(classifier_head.parameters()),
    lr=0.001,
)
# Halve the learning rate every 10 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
batch_size = 64
num_epochs = 50

train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    # BatchNorm1d raises in train mode on a batch with a single sample, so drop
    # the trailing batch only when it would contain exactly one row.
    drop_last=(len(train_dataset) % batch_size == 1),
)

# --- Training loop ---------------------------------------------------------------
for epoch in tqdm(range(num_epochs), desc="Training Epochs"):
    feature_extractor.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        embeddings = feature_extractor(X_batch)
        outputs = classifier_head(embeddings)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()

# --- Export embeddings -------------------------------------------------------------
# eval() switches BatchNorm to its running statistics and disables dropout, so
# the embeddings are deterministic for a given set of weights.
feature_extractor.eval()
with torch.no_grad():
    X_train_embeddings = feature_extractor(X_train_tensor.to(device)).cpu().numpy()
    X_test_embeddings = feature_extractor(X_test_tensor.to(device)).cpu().numpy()

np.savez("embeddings.npz", X_train=X_train_embeddings, X_test=X_test_embeddings)
torch.save(feature_extractor.state_dict(), "feature_extractor.pth")

## Step 3: XGBoost Classifier with GridSearch

In [None]:
# Step 3: Train XGBoost on extracted embeddings
# Run after 2_train_feature_extractor.py
import numpy as np
import torch
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# --- Load embeddings (Step 2) and labels (Step 1) --------------------------------
embeddings = np.load("embeddings.npz")
X_train_embeddings, X_test_embeddings = embeddings["X_train"], embeddings["X_test"]
# processed_data.pt contains pickled pandas objects next to the tensors, which
# the restricted default loader of recent PyTorch releases (weights_only=True)
# rejects. weights_only=False does a full unpickle -- only use it on a file
# you produced yourself.
_, _, _, _, _, y_train, y_test, y_resampled = torch.load("processed_data.pt", weights_only=False)
relevant_disasters = ["Wildfire", "Flood", "Earthquake"]

# The target was integer-encoded in Step 1 with LabelEncoder, which numbers the
# classes in sorted order (Earthquake=0, Flood=1, Wildfire=2). Display names
# must follow that same order, otherwise the report and the confusion matrix
# attach the wrong name to each class.
class_names = sorted(relevant_disasters)
class_ids = list(range(len(class_names)))

# --- Hyper-parameter search ---------------------------------------------------------
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [5, 7, 9],
    "n_estimators": [100, 200, 300]
}
# 27 parameter combinations x 3 folds; GridSearchCV then refits the best
# combination on the full training set, and that model serves predict() below.
xgb_model = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric="mlogloss"),
    param_grid,
    scoring="accuracy",
    cv=3,
    verbose=1
)
xgb_model.fit(X_train_embeddings, y_train)
y_pred = xgb_model.predict(X_test_embeddings)

# --- Evaluation ------------------------------------------------------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
# `labels` pins the row order (and keeps a row for a class that happens to be
# absent from y_test / y_pred) so it always lines up with `class_names`.
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()