In [1]:
import warnings
warnings.filterwarnings( 'ignore' )
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer

In [2]:
# Partition id: selects which train/test CSV pair is loaded below and is
# embedded in the names of every result file this notebook writes.
partition = 478

In [3]:
# Locations of the train/test CSVs for the selected partition, relative to
# the notebook's working directory.
trainpath = f'../../../../../data/top30groups/LongLatCombined/scaledtrain1/train{partition}.csv'
testpath = f'../../../../../data/top30groups/LongLatCombined/scaledtest1/test{partition}.csv'

# The files are read with an explicit ISO-8859-1 encoding.
traindata = pd.read_csv(trainpath, encoding='ISO-8859-1')
testdata = pd.read_csv(testpath, encoding='ISO-8859-1')

In [4]:
# (rows, columns) of the test split, as a quick sanity check after loading.
testdata.shape

(4320, 16)

In [5]:
# (rows, columns) of the training split, as a quick sanity check after loading.
traindata.shape

(10020, 16)

In [6]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

def split_data(dftrain, dftest, device=None):
    """Turn the train/test DataFrames into feature and label tensors.

    The ``gname`` column is the classification target; every other column is
    cast to float and used as a feature. Labels are integer-encoded with a
    ``LabelEncoder`` that is fitted on the training labels only and then
    applied to the test labels.

    Args:
        dftrain: Training DataFrame containing a ``gname`` column.
        dftest: Test DataFrame containing a ``gname`` column and the same
            feature columns as ``dftrain``.
        device: Torch device (or device string) on which to create the
            tensors. When omitted, ``"cuda"`` is used if a GPU is available
            and ``"cpu"`` otherwise, so the function no longer fails on
            CPU-only machines.

    Returns:
        ``(Xtrain, Ytrain, Xtest, Ytest, le)``: the X tensors are ``float32``,
        the Y tensors are ``long`` class indices, and ``le`` is the fitted
        encoder needed to map class indices back to the original labels.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Encode labels as integers (fit on train only, reuse for test)
    le = LabelEncoder()
    Ytrain = le.fit_transform(dftrain['gname'].values)
    Ytest = le.transform(dftest['gname'].values)

    Xtrain = dftrain.drop(columns=['gname']).values.astype(float)
    Xtest = dftest.drop(columns=['gname']).values.astype(float)

    # Create the tensors directly on the target device
    Xtrain = torch.tensor(Xtrain, dtype=torch.float32, device=device)
    Ytrain = torch.tensor(Ytrain, dtype=torch.long, device=device)
    Xtest = torch.tensor(Xtest, dtype=torch.float32, device=device)
    Ytest = torch.tensor(Ytest, dtype=torch.long, device=device)

    return Xtrain, Ytrain, Xtest, Ytest, le


In [7]:
# Ask PyTorch to release its cached GPU memory before training starts
# (presumably to clear leftovers from earlier runs in this kernel — confirm).
torch.cuda.empty_cache()


In [8]:
from sklearn.model_selection import ParameterSampler
import torch.nn as nn
import torch.optim as optim
import random
import time

class SimpleMLP(nn.Module):
    """Fully connected classifier with two hidden layers.

    Layout: ``Linear -> activation -> Linear -> activation -> Linear``.
    The final layer returns raw logits (no softmax is applied here).

    Args:
        input_dim: Number of input features.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        output_dim: Number of output logits (classes).
        activation: ``'relu'`` or ``'tanh'``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
            Previously any unknown value silently selected Tanh, which hid
            typos such as ``'ReLU'``.
    """

    # Supported activation names mapped to their module constructors.
    _ACTIVATIONS = {'relu': nn.ReLU, 'tanh': nn.Tanh}

    def __init__(self, input_dim, hidden1, hidden2, output_dim, activation='relu'):
        super().__init__()
        try:
            make_activation = self._ACTIVATIONS[activation]
        except KeyError:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}"
            ) from None

        # Each slot gets its own activation instance instead of sharing one
        # module object between two positions of the Sequential.
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            make_activation(),
            nn.Linear(hidden1, hidden2),
            make_activation(),
            nn.Linear(hidden2, output_dim)
        )

    def forward(self, x):
        """Return the logits produced by the network for input ``x``."""
        return self.model(x)

def train_model(model, Xtrain, Ytrain, lr, alpha, searching=False, max_epochs=1000):
    """Train ``model`` with full-batch Adam and cross-entropy loss.

    Every epoch is a single optimisation step on the whole of ``Xtrain``.
    When ``searching`` is False the function also tracks per-epoch training
    accuracy and wall-clock time, prints a progress line per epoch, and
    finally restores the weights of the epoch with the highest training
    accuracy. When ``searching`` is True none of that bookkeeping happens and
    the model is left with the weights of the last step.

    Args:
        model: The ``nn.Module`` to train; it is modified in place.
        Xtrain: Feature tensor passed to ``model``.
        Ytrain: Tensor of integer class targets for ``nn.CrossEntropyLoss``.
        lr: Adam learning rate.
        alpha: Adam ``weight_decay`` value.
        searching: If True, skip accuracy tracking, timing, logging and the
            best-weights restore.
        max_epochs: Number of optimisation steps to run.

    Returns:
        ``(model, epoch_times, train_accuracies, best_epoch, best_acc)``.
        ``best_epoch`` is 0-based (the printed epoch numbers are 1-based).
        In search mode the lists are empty and ``best_epoch``/``best_acc``
        are both ``-1``.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=alpha)

    epoch_times = []
    train_accuracies = []

    best_acc = -1
    best_epoch = -1
    best_state_dict = None

    for epoch in range(max_epochs):
        start_time = time.time()

        # Forward/backward pass on the full training set
        model.train()
        optimizer.zero_grad()
        output = model(Xtrain)
        loss = criterion(output, Ytrain)
        loss.backward()

        if not searching:
            model.eval()
            with torch.no_grad():
                # `output` was produced by the weights as they are right now,
                # i.e. before this epoch's optimizer step.
                pred = output.argmax(dim=1)
                acc = (pred == Ytrain).float().mean().item()
            train_accuracies.append(acc)

            if acc > best_acc:
                best_acc = acc
                best_epoch = epoch
                # state_dict() hands back references to the live parameter
                # tensors, which the optimizer then updates in place. Clone
                # them, and do it before optimizer.step(), so the snapshot
                # holds exactly the weights that achieved `acc`.
                best_state_dict = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }

        optimizer.step()

        if not searching:
            end_time = time.time()
            epoch_times.append(end_time - start_time)

            print(f"Epoch {epoch+1:03d}: loss = {loss.item():.4f}, acc = {acc:.4f}, time = {end_time - start_time:.3f}s")

    # Restore best model weights
    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    if not searching:
        print(f"best epoch: {best_epoch} Best acc: {best_acc}")

    return model, epoch_times, train_accuracies, best_epoch, best_acc



def evaluate_model(model, Xval, Yval):
    """Return the fraction of rows in ``Xval`` that ``model`` classifies as ``Yval``.

    The model is switched to eval mode and the forward pass runs without
    gradient tracking. The predicted class is the arg-max over dimension 1 of
    the model output; the result is a plain Python float.
    """
    model.eval()
    with torch.no_grad():
        logits = model(Xval)
        hits = logits.argmax(dim=1).eq(Yval)
        return hits.float().mean().item()

def find_best_mlp(Xtrain, Ytrain, num_classes, n_iter=20, max_epochs=1000):
    """Random-search MLP hyperparameters, then retrain the winner on all data.

    ``n_iter`` hyperparameter combinations are sampled (with a fixed
    ``random_state`` of 42). Each candidate is trained on 80% of the rows and
    scored on the remaining 20%. The combination with the highest validation
    accuracy is then used to train a fresh model on the full training set.

    Args:
        Xtrain: 2-D feature tensor; its second dimension is the input width.
        Ytrain: Tensor of integer class targets aligned with ``Xtrain``.
        num_classes: Number of output logits of the network.
        n_iter: Number of hyperparameter combinations to try.
        max_epochs: Epochs used for every candidate and for the final model.

    Returns:
        ``(final_model, epoch_times)``: the retrained model and the per-epoch
        wall-clock times of the final training run.

    Raises:
        ValueError: If no candidate produced a usable validation accuracy
            (for example ``n_iter`` is 0 or the validation split is empty).
    """
    input_dim = Xtrain.shape[1]
    # Build the models on whatever device the data lives on instead of
    # assuming a GPU is present.
    device = Xtrain.device

    param_dist = {
        'hidden1': [50, 100, 150, 200],
        'hidden2': [25, 50, 100],
        'activation': ['relu', 'tanh'],
        'lr': [0.0001, 0.001, 0.01],
        'alpha': [1e-5, 1e-4, 1e-3, 1e-2]
    }

    def build_model(params):
        # Fresh, untrained network for one hyperparameter combination.
        return SimpleMLP(
            input_dim=input_dim,
            hidden1=params['hidden1'],
            hidden2=params['hidden2'],
            output_dim=num_classes,
            activation=params['activation']
        ).to(device)

    # One 80/20 hold-out split shared by all candidates, so their validation
    # accuracies are measured on the same rows and are directly comparable.
    indices = torch.randperm(Xtrain.size(0)).to(device)
    split = int(0.8 * len(indices))
    train_idx, val_idx = indices[:split], indices[split:]
    X_fit, Y_fit = Xtrain[train_idx], Ytrain[train_idx]
    X_val, Y_val = Xtrain[val_idx], Ytrain[val_idx]

    best_val_acc = -1
    best_params = None

    for params in ParameterSampler(param_dist, n_iter=n_iter, random_state=42):
        model = build_model(params)

        train_model(model, X_fit, Y_fit,
                    lr=params['lr'], alpha=params['alpha'], searching=True, max_epochs=max_epochs)

        val_acc = evaluate_model(model, X_val, Y_val)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_params = params

    if best_params is None:
        raise ValueError(
            "Hyperparameter search produced no usable candidate; "
            "check n_iter and the size of the training set."
        )

    final_model = build_model(best_params)

    # The training accuracy returned here is deliberately not bound to the
    # validation-accuracy variable: the original code overwrote it and then
    # reported the training accuracy as the validation accuracy.
    _, epoch_times, _, _, _ = train_model(final_model, Xtrain, Ytrain,
                lr=best_params['lr'], alpha=best_params['alpha'], searching=False, max_epochs=max_epochs)

    print(f"Best accuracy on validation split: {best_val_acc * 100:.2f}%")
    print("Best hyperparameters:", best_params)

    return final_model, epoch_times

In [9]:
import torch.nn.functional as F

# Seed PyTorch so weight initialisation and the validation split are
# repeatable across Restart & Run All.
torch.manual_seed(42)

Xtrain, Ytrain, Xtest, Ytest, le = split_data(traindata, testdata)

# Take the number of output classes from the fitted label encoder rather than
# hard-coding 30, so the cell stays correct if the set of groups changes.
num_classes = len(le.classes_)
best_mlp, epoch_times = find_best_mlp(Xtrain, Ytrain, num_classes)

# Score the retrained model on the held-out test set.
best_mlp.eval()
with torch.no_grad():
    logits = best_mlp(Xtest)
    y_pred = logits.argmax(dim=1)
    acc = (y_pred == Ytest).float().mean().item()
    # Class probabilities, used further down for the ROC AUC metrics.
    pred_proba = F.softmax(logits, dim=1)
    print(f"Accuracy: {acc * 100:.2f}%")


Epoch 001: loss = 3.4037, acc = 0.0381, time = 0.002s
Epoch 002: loss = 3.3486, acc = 0.0796, time = 0.002s
Epoch 003: loss = 3.2911, acc = 0.1523, time = 0.002s
Epoch 004: loss = 3.2204, acc = 0.1714, time = 0.001s
Epoch 005: loss = 3.1318, acc = 0.2182, time = 0.001s
Epoch 006: loss = 3.0258, acc = 0.2292, time = 0.003s
Epoch 007: loss = 2.9051, acc = 0.2373, time = 0.002s
Epoch 008: loss = 2.7745, acc = 0.2691, time = 0.005s
Epoch 009: loss = 2.6386, acc = 0.2907, time = 0.003s
Epoch 010: loss = 2.5011, acc = 0.3132, time = 0.002s
Epoch 011: loss = 2.3645, acc = 0.3379, time = 0.002s
Epoch 012: loss = 2.2311, acc = 0.3596, time = 0.002s
Epoch 013: loss = 2.1045, acc = 0.3840, time = 0.002s
Epoch 014: loss = 1.9875, acc = 0.4150, time = 0.004s
Epoch 015: loss = 1.8801, acc = 0.4371, time = 0.005s
Epoch 016: loss = 1.7807, acc = 0.4548, time = 0.003s
Epoch 017: loss = 1.6875, acc = 0.4810, time = 0.003s
Epoch 018: loss = 1.6015, acc = 0.4967, time = 0.003s
Epoch 019: loss = 1.5211, ac

In [10]:
from sklearn.preprocessing import label_binarize

# Move the results to NumPy and map class indices back to the original labels.
y_true_idx = Ytest.cpu().numpy()
y_true_decoded = le.inverse_transform(y_true_idx)
y_pred_decoded = le.inverse_transform(y_pred.cpu().numpy())
y_score = pred_proba.cpu().numpy()

# Indicator matrix of the true classes for the ROC AUC computation. The class
# list is derived from the fitted encoder instead of a hard-coded range(30),
# so its columns always line up with the columns of y_score.
y_true_bin = label_binarize(y_true_idx, classes=list(range(len(le.classes_))))


In [11]:
import os

# Metric report for this partition
file_path = os.path.join("results", f"gtd{partition}.txt")

# Make sure the directory exists
os.makedirs("results", exist_ok=True)

with open(file_path, "w") as file:
    file.write(f"Accuracy: {acc:.4f}\n")

    # (averaging mode, suffix printed on the ROCAUC line). The weighted ROCAUC
    # suffix is capitalised in the report format, unlike every other line.
    for avg, auc_suffix in (("weighted", "Weighted"), ("micro", "micro"), ("macro", "macro")):
        file.write(f"Precision {avg}: {precision_score(y_true_decoded, y_pred_decoded, average=avg):.4f}\n")
        file.write(f"Recall {avg}: {recall_score(y_true_decoded, y_pred_decoded, average=avg):.4f}\n")
        file.write(f"F1 Score {avg}: {f1_score(y_true_decoded, y_pred_decoded, average=avg):.4f}\n")
        file.write(f"ROCAUC {auc_suffix}: {roc_auc_score(y_true_bin, y_score, average=avg, multi_class='ovr'):.4f}\n")

# Per-epoch training times of the final model, one value per line
with open(f"results/epoch_logs_gtd{partition}", "w") as f:
    f.write('\n'.join(map(str, epoch_times)))

In [12]:
# Per-class precision, recall, F1 and support on the test set, keyed by the
# decoded (original) labels.
print(classification_report(y_true_decoded, y_pred_decoded))

                                                  precision    recall  f1-score   support

                          Abu Sayyaf Group (ASG)       0.94      0.97      0.96       144
        African National Congress (South Africa)       1.00      1.00      1.00       144
                                Al-Qaida in Iraq       0.82      0.80      0.81       144
        Al-Qaida in the Arabian Peninsula (AQAP)       0.90      0.83      0.86       144
                                      Al-Shabaab       0.98      0.98      0.98       144
             Basque Fatherland and Freedom (ETA)       0.99      0.99      0.99       144
                                      Boko Haram       0.93      0.94      0.93       144
  Communist Party of India - Maoist (CPI-Maoist)       0.88      0.90      0.89       144
       Corsican National Liberation Front (FLNC)       0.99      1.00      1.00       144
                       Donetsk People's Republic       0.99      0.99      0.99       144
Farabundo

In [13]:
# Show the layer structure of the selected model (reveals the chosen hidden
# sizes and activation).
print(best_mlp)

SimpleMLP(
  (model): Sequential(
    (0): Linear(in_features=15, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=50, bias=True)
    (3): ReLU()
    (4): Linear(in_features=50, out_features=30, bias=True)
  )
)


In [14]:
def plot_confusion_matrix(y_true, y_pred, labels):
    """Save a row-normalised confusion-matrix heatmap for the current partition.

    Each row of the confusion matrix is divided by its total, so a cell shows
    the fraction of samples of the true class (row) that were predicted as the
    column class. The figure is written to
    ``results/confusion_matrix_partition_<partition>.png`` and then closed; it
    is not displayed inline.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        labels: Labels to include, in the order used for rows and columns.

    Note:
        Reads the notebook-level ``partition`` variable for the title and the
        output file name.
    """
    import os
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # A label with no true samples has a row total of 0. Leave such rows at
    # 0.0 rather than dividing by zero and filling the heatmap with NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=(18, 16))
    sns.heatmap(cm_normalized,
                annot=True,
                fmt=".2f",
                xticklabels=labels,
                yticklabels=labels,
                cmap="viridis",
                square=True,
                linewidths=0.5,
                cbar_kws={"shrink": 0.8})

    plt.title(f"Normalized Confusion Matrix (Partition {partition})", fontsize=18)
    plt.xlabel("Predicted Label", fontsize=14)
    plt.ylabel("True Label", fontsize=14)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Save the figure; create the output directory first so the function does
    # not depend on another cell having created it.
    save_path = f"results/confusion_matrix_partition_{partition}.png"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    plt.savefig(save_path, dpi=300)
    plt.close()

    print(f"Saved confusion matrix for partition {partition} to {save_path}")


In [15]:

# Collect the distinct labels that occur among the true test labels; they
# define the rows and columns of the confusion matrix.
class_labels = np.unique(y_true_decoded)

plot_confusion_matrix(y_true_decoded, y_pred_decoded, labels=class_labels)



Saved confusion matrix for partition 478 to results/confusion_matrix_partition_478.png
