In [1]:
import warnings
warnings.filterwarnings( 'ignore' )
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer

In [2]:
# Partition identifier: interpolated into the train/test CSV paths and into the
# names of the result files written later in the notebook.
partition = 200

In [3]:
# Train/test CSVs for the selected partition.
trainpath = f'../../../../../data/top30groups/OneHotLongLatCombined/scaledtrain1/train{partition}.csv'
testpath = f'../../../../../data/top30groups/OneHotLongLatCombined/scaledtest1/test{partition}.csv'

traindata = pd.read_csv(trainpath, encoding='ISO-8859-1')
testdata = pd.read_csv(testpath, encoding='ISO-8859-1')

# Remove the date column when it is present; errors='ignore' makes the drop a
# no-op for frames that do not have it.
traindata = traindata.drop(columns=['attack_date'], errors='ignore')
testdata = testdata.drop(columns=['attack_date'], errors='ignore')

# Report both shapes unconditionally. These prints must not sit inside a
# conditional, otherwise they are skipped whenever 'attack_date' is absent.
print('shape train data: ', traindata.shape)
print('shape test data: ', testdata.shape)

In [4]:
# Dimensions of the test frame as (rows, columns).
testdata.shape

(1800, 3174)

In [5]:
# Dimensions of the training frame as (rows, columns).
traindata.shape

(4200, 3174)

In [6]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

def split_data(dftrain, dftest, device=None):
    """Turn the train/test frames into feature/label tensors.

    The ``gname`` column is the label; every other column is a feature.
    Labels are integer-encoded with a ``LabelEncoder`` fitted on the training
    labels only, so a test label that never occurs in training makes
    ``le.transform`` raise.

    Args:
        dftrain: training DataFrame containing a ``gname`` column.
        dftest: test DataFrame containing a ``gname`` column.
        device: torch device (string or ``torch.device``) that receives the
            tensors. ``None`` (default) selects ``"cuda"`` when CUDA is
            available and ``"cpu"`` otherwise, so the notebook also runs on
            machines without a GPU.

    Returns:
        ``(Xtrain, Ytrain, Xtest, Ytest, le)``: float32 feature tensors, int64
        label tensors and the fitted ``LabelEncoder``.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    Xtrain = dftrain.drop(columns=['gname']).values
    Ytrain = dftrain['gname'].values
    Xtest = dftest.drop(columns=['gname']).values
    Ytest = dftest['gname'].values

    # Encode labels as integers
    le = LabelEncoder()
    Ytrain = le.fit_transform(Ytrain)
    Ytest = le.transform(Ytest)

    # Mixed bool/float columns come out of `.values` as an object array;
    # cast to a plain float array before handing it to torch.
    Xtrain = Xtrain.astype(float)
    Xtest = Xtest.astype(float)

    # Convert to torch tensors and move to the selected device
    Xtrain = torch.tensor(Xtrain, dtype=torch.float32).to(device)
    Ytrain = torch.tensor(Ytrain, dtype=torch.long).to(device)
    Xtest = torch.tensor(Xtest, dtype=torch.float32).to(device)
    Ytest = torch.tensor(Ytest, dtype=torch.long).to(device)

    return Xtrain, Ytrain, Xtest, Ytest, le


In [7]:
from sklearn.model_selection import ParameterSampler
import torch.nn as nn
import torch.optim as optim
import random

class SimpleMLP(nn.Module):
    """Single-hidden-layer perceptron: Linear -> activation -> Linear.

    ``activation='relu'`` selects ``nn.ReLU``; any other value selects
    ``nn.Tanh``. The forward pass returns the output of the last linear layer
    unchanged (no softmax is applied here).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, activation='relu'):
        super().__init__()
        if activation == 'relu':
            nonlinearity = nn.ReLU()
        else:
            nonlinearity = nn.Tanh()

        # Layer order fixes the submodule indices (0, 1, 2) under `self.model`.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nonlinearity,
            nn.Linear(hidden_dim, output_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        out = self.model(x)
        return out

def train_model(model, Xtrain, Ytrain, lr, alpha, max_epochs=100):
    """Fit ``model`` in place with Adam on a cross-entropy loss.

    Every epoch is one optimizer step computed on the whole of
    ``Xtrain``/``Ytrain`` (no mini-batching).

    Args:
        model: module whose parameters are optimized.
        Xtrain: input tensor fed to ``model``.
        Ytrain: target tensor passed to ``nn.CrossEntropyLoss``.
        lr: Adam learning rate.
        alpha: passed to Adam as ``weight_decay``.
        max_epochs: number of optimizer steps.

    Returns:
        None; ``model`` is updated in place.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), weight_decay=alpha, lr=lr)

    for _ in range(max_epochs):
        model.train()
        adam.zero_grad()
        batch_loss = loss_fn(model(Xtrain), Ytrain)
        batch_loss.backward()
        adam.step()

def evaluate_model(model, Xval, Yval):
    """Return the accuracy of ``model`` on ``(Xval, Yval)`` as a Python float.

    The model is switched to eval mode (and left there); predictions are the
    arg-max over dimension 1 of the model output, computed without gradients.
    """
    model.eval()
    with torch.no_grad():
        predicted = torch.argmax(model(Xval), dim=1)
        is_correct = predicted.eq(Yval)
        accuracy = is_correct.float().mean().item()
    return accuracy

def find_best_mlp(Xtrain, Ytrain, num_classes, n_iter=20, max_epochs=100, seed=42):
    """Random-search ``SimpleMLP`` hyperparameters and return a refitted model.

    ``n_iter`` configurations are sampled from a fixed grid. Each one is
    trained on 80% of the data and scored on the remaining 20%. The
    best-scoring configuration is then retrained on all of
    ``Xtrain``/``Ytrain`` and returned.

    All candidates share one hold-out split drawn from a generator seeded with
    ``seed``, so their validation accuracies are comparable and the split is
    reproducible. Weight initialisation still uses torch's global RNG; call
    ``torch.manual_seed`` beforehand for fully repeatable runs.

    Args:
        Xtrain: 2-D feature tensor (rows are samples).
        Ytrain: integer class-label tensor with one entry per row of ``Xtrain``.
        num_classes: size of the network's output layer.
        n_iter: number of hyperparameter configurations to try.
        max_epochs: training epochs per candidate and for the final refit.
        seed: seed for both the hold-out split and the parameter sampler.

    Returns:
        A ``SimpleMLP`` trained on the full training set, on the same device
        as ``Xtrain``.

    Raises:
        ValueError: if there are fewer than two samples (no train/validation
            split is possible) or no candidate could be selected.
    """
    n_samples = Xtrain.size(0)
    if n_samples < 2:
        raise ValueError("find_best_mlp needs at least 2 samples to build a validation split")

    input_dim = Xtrain.shape[1]
    # Build models wherever the data already lives instead of assuming a GPU.
    device = Xtrain.device

    param_dist = {
        'hidden_dim': [10, 50, 100, 150, 200, 300],
        'activation': ['relu', 'tanh'],
        'lr': [0.0001, 0.001, 0.01],
        'alpha': [1e-5, 1e-4, 1e-3, 1e-2]
    }

    # One seeded 80/20 split, drawn once and reused for every candidate.
    split_rng = torch.Generator().manual_seed(seed)
    indices = torch.randperm(n_samples, generator=split_rng).to(device)
    split = int(0.8 * n_samples)
    train_idx, val_idx = indices[:split], indices[split:]

    # Materialise the subsets once rather than re-indexing inside the loop.
    X_fit, Y_fit = Xtrain[train_idx], Ytrain[train_idx]
    X_val, Y_val = Xtrain[val_idx], Ytrain[val_idx]

    best_acc = -1
    best_params = None

    for params in ParameterSampler(param_dist, n_iter=n_iter, random_state=seed):
        model = SimpleMLP(input_dim, params['hidden_dim'], num_classes, params['activation']).to(device)
        train_model(model, X_fit, Y_fit,
                    lr=params['lr'], alpha=params['alpha'], max_epochs=max_epochs)

        acc = evaluate_model(model, X_val, Y_val)
        if acc > best_acc:
            best_acc = acc
            best_params = params

    if best_params is None:
        # Happens when the sampler yields nothing or every score is NaN.
        raise ValueError("no hyperparameter candidate could be selected; check n_iter and the data")

    # Retrain the best configuration on the full training set
    final_model = SimpleMLP(input_dim, best_params['hidden_dim'], num_classes, best_params['activation']).to(device)
    train_model(final_model, Xtrain, Ytrain, lr=best_params['lr'], alpha=best_params['alpha'], max_epochs=max_epochs)

    print(f"Best accuracy on validation split: {best_acc * 100:.2f}%")
    print("Best hyperparameters:", best_params)

    return final_model


In [8]:
# Per-column dtypes of the training frame.
traindata.dtypes

extended                            float64
vicinity                            float64
multiple                            float64
success                             float64
suicide                             float64
                                     ...   
longlat_(126.300309, 8.892579)         bool
longlat_(126.30035, 8.801867)          bool
longlat_(126.314871, 8.217759)         bool
longlat_(151.179691, -33.934491)       bool
gname                                object
Length: 3174, dtype: object

In [9]:
# Build tensors and the fitted label encoder from the loaded frames.
Xtrain, Ytrain, Xtest, Ytest, le = split_data(traindata, testdata)

# Size the output layer from the encoder instead of a hard-coded class count:
# every index the network can predict is then one that `le` can decode.
num_classes = len(le.classes_)
best_mlp = find_best_mlp(Xtrain, Ytrain, num_classes)

# Score the refitted model on the held-out test set; `y_pred` and `acc` are
# reused by later cells.
best_mlp.eval()
with torch.no_grad():
    y_pred = best_mlp(Xtest).argmax(dim=1)
    acc = (y_pred == Ytest).float().mean().item()
    print(f"Accuracy: {acc * 100:.2f}%")


Best accuracy on validation split: 56.67%
Best hyperparameters: {'lr': 0.01, 'hidden_dim': 300, 'alpha': 0.001, 'activation': 'tanh'}
Accuracy: 58.61%


In [10]:
# Map the integer class ids back to the original `gname` values. The tensors
# are moved to the CPU and converted to NumPy arrays before decoding.
y_true_decoded = le.inverse_transform(Ytest.cpu().numpy())
y_pred_decoded = le.inverse_transform(y_pred.cpu().numpy())

In [11]:
import os

# Make sure the output directory exists, then build the per-partition report path.
os.makedirs("results", exist_ok=True)
file_path = os.path.join("results", f"gtd{partition}.txt")

# Support-weighted metrics, written in this order after the accuracy line.
weighted_scores = [
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
]

with open(file_path, "w") as file:
    file.write(f"Accuracy: {acc:.4f}\n")
    for metric_name, metric_fn in weighted_scores:
        metric_value = metric_fn(y_true_decoded, y_pred_decoded, average='weighted')
        file.write(f"{metric_name}: {metric_value:.4f}\n")

In [12]:
# Per-class precision / recall / F1 / support on the test set, keyed by group name.
print(classification_report(y_true_decoded, y_pred_decoded))

                                                  precision    recall  f1-score   support

                          Abu Sayyaf Group (ASG)       0.46      0.70      0.55        60
        African National Congress (South Africa)       0.50      0.92      0.65        60
                                Al-Qaida in Iraq       0.66      0.68      0.67        60
        Al-Qaida in the Arabian Peninsula (AQAP)       0.73      0.60      0.66        60
                                      Al-Shabaab       0.82      0.67      0.73        60
             Basque Fatherland and Freedom (ETA)       0.84      0.70      0.76        60
                                      Boko Haram       0.61      0.47      0.53        60
  Communist Party of India - Maoist (CPI-Maoist)       0.50      0.38      0.43        60
       Corsican National Liberation Front (FLNC)       0.68      0.98      0.80        60
                       Donetsk People's Republic       0.73      0.75      0.74        60
Farabundo

In [13]:
# Show the layer structure of the selected model.
print(best_mlp)

SimpleMLP(
  (model): Sequential(
    (0): Linear(in_features=3173, out_features=300, bias=True)
    (1): Tanh()
    (2): Linear(in_features=300, out_features=30, bias=True)
  )
)


In [14]:
def plot_confusion_matrix(y_true, y_pred, labels):
    """Save a row-normalized confusion-matrix heatmap for the current partition.

    Rows are true labels and columns are predicted labels. Each row is divided
    by its total, so a cell is the fraction of that true class assigned to the
    column's label. A label with no true samples gets an all-zero row instead
    of NaNs. The figure is written to
    ``results/confusion_matrix_partition_{partition}.png`` and then closed, so
    nothing is rendered inline. ``partition`` is read from the notebook's
    global namespace.

    Args:
        y_true: true labels.
        y_pred: predicted labels, aligned with ``y_true``.
        labels: label values, in the order used for the rows/columns and the
            tick labels.
    """
    import os
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Normalize each row by its support; leave rows with zero support at 0
    # rather than dividing by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm.astype(float),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    # Explicit figure/axes so nothing depends on pyplot's "current axes" state.
    fig, ax = plt.subplots(figsize=(18, 16))
    sns.heatmap(cm_normalized,
                annot=True,
                fmt=".2f",
                xticklabels=labels,
                yticklabels=labels,
                cmap="viridis",
                square=True,
                linewidths=0.5,
                cbar_kws={"shrink": 0.8},
                ax=ax)

    ax.set_title(f"Normalized Confusion Matrix (Partition {partition})", fontsize=18)
    ax.set_xlabel("Predicted Label", fontsize=14)
    ax.set_ylabel("True Label", fontsize=14)
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()

    # Save the figure; create the output directory so this function does not
    # depend on an earlier cell having made it.
    save_path = f"results/confusion_matrix_partition_{partition}.png"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    fig.savefig(save_path, dpi=300)
    plt.close(fig)

    print(f"Saved confusion matrix for partition {partition} to {save_path}")


In [15]:

# Get all unique class labels from the truths (np.unique returns them sorted).
# A label that appears only in the predictions is therefore not given a
# row/column in the matrix.
class_labels = np.unique(y_true_decoded)

# Writes the heatmap PNG for this partition; the figure is saved, not displayed.
plot_confusion_matrix(y_true_decoded, y_pred_decoded, labels=class_labels)



Saved confusion matrix for partition 200 to results/confusion_matrix_partition_200.png
