In [1]:
import warnings
warnings.filterwarnings( 'ignore' )
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
import sys

In [2]:
partition = 200

In [3]:
!{sys.executable} -m pip install pytorch-tabnet --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
trainpath = f'../../../../data/top30groups/noGeographic/train1/train{partition}.csv'
testpath = f'../../../../data/top30groups/noGeographic/test1/test{partition}.csv'

traindata = pd.read_csv(trainpath, encoding='ISO-8859-1')
testdata = pd.read_csv(testpath, encoding='ISO-8859-1')

In [5]:
import torch
print(torch.cuda.is_available())

True


In [6]:
import torch
torch.cuda.empty_cache()


In [7]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

def split_data(dftrain, dftest):
    Xtrain = dftrain.drop(columns=['gname']).values.astype(float)
    Ytrain = dftrain['gname'].values
    Xtest = dftest.drop(columns=['gname']).values.astype(float)
    Ytest = dftest['gname'].values

    le = LabelEncoder()
    Ytrain = le.fit_transform(Ytrain)
    Ytest = le.transform(Ytest)

    #y_pred_decoded = model.label_encoder.inverse_transform(y_pred)
    y_true_decoded = le.inverse_transform(Ytest)

    return Xtrain, Ytrain, Xtest, Ytest, y_true_decoded, le



In [8]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier

class TabNetClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, n_d=8, n_a=8, n_steps=3, gamma=1.3, lambda_sparse=1e-3, optimizer_params=None):
        self.n_d = n_d
        self.n_a = n_a
        self.n_steps = n_steps
        self.gamma = gamma
        self.lambda_sparse = lambda_sparse
        self.optimizer_params = optimizer_params or {'lr': 0.01}
        self.model = None

    def fit(self, X, y):
        self.model = TabNetClassifier(
            n_d=self.n_d,
            n_a=self.n_a,
            n_steps=self.n_steps,
            gamma=self.gamma,
            lambda_sparse=self.lambda_sparse,
            optimizer_params=self.optimizer_params,
            seed=42,
            verbose=0
        )

        self.model.fit(
            X, y,
            eval_set=[(X, y)],
            max_epochs=200,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            eval_metric=['accuracy']
        )

        self.classes_ = np.unique(y)  # ✅ Needed for sklearn compatibility
        return self

    def predict(self, X):
        return self.model.predict(X)

    def score(self, X, y):
        preds = self.predict(X)
        return (preds == y).mean()


In [9]:
#from pytorch_tabnet.sklearn import TabNetClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

def find_best_tabnet(Xtrain, Ytrain, n_iter=20):
    print("Starting TabNet grid search")
    print("CUDA available:", torch.cuda.is_available())

    param_dist = {
        'n_d': [8, 16, 24],
        'n_a': [8, 16, 24],
        'n_steps': [3, 4, 5],
        'gamma': [1.0, 1.3, 1.5],
        'lambda_sparse': [1e-4, 1e-3, 1e-2],
        'optimizer_params': [{'lr': 0.01}]
    }

    random_search = RandomizedSearchCV(
        estimator=TabNetClassifierWrapper(),
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
        random_state=42
    )

    random_search.fit(Xtrain, Ytrain)
    print("Best parameters:", random_search.best_params_)
    print("Best accuracy:", random_search.best_score_)

    return random_search.best_params_



In [10]:
Xtrain, Ytrain, Xtest, Ytest, Ytest_decoded, le = split_data(traindata, testdata)
best_tabnet_params = find_best_tabnet(Xtrain, Ytrain)


# Re-initialize TabNet with best params
final_model = TabNetClassifier(
    **{k: v for k, v in best_tabnet_params.items()},
    verbose=1,
    seed=42
)

# Retrain on full training data
final_model.fit(
    Xtrain, Ytrain,
    max_epochs=100,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128
)



Starting TabNet grid search
CUDA available: True
Fitting 5 folds for each of 20 candidates, totalling 100 fits



Early stopping occurred at epoch 23 with best_epoch = 3 and best_val_0_accuracy = 0.06905





Early stopping occurred at epoch 21 with best_epoch = 1 and best_val_0_accuracy = 0.05387





Early stopping occurred at epoch 31 with best_epoch = 11 and best_val_0_accuracy = 0.08988





Early stopping occurred at epoch 28 with best_epoch = 8 and best_val_0_accuracy = 0.06756





Early stopping occurred at epoch 29 with best_epoch = 9 and best_val_0_accuracy = 0.06369





Early stopping occurred at epoch 47 with best_epoch = 27 and best_val_0_accuracy = 0.10923





Early stopping occurred at epoch 31 with best_epoch = 11 and best_val_0_accuracy = 0.06548




Stop training because you reached max_epochs = 200 with best_epoch = 195 and best_val_0_accuracy = 0.66905





Early stopping occurred at epoch 175 with best_epoch = 155 and best_val_0_accuracy = 0.65923





Early stopping occurred at epoch 181 with best_epoch = 161 and best_val_0_accuracy = 0.67143





Early stopping occurred at epoch 151 with best_epoch = 131 and best_val_0_accuracy = 0.60536




Stop training because you reached max_epochs = 200 with best_epoch = 183 and best_val_0_accuracy = 0.67202




Stop training because you reached max_epochs = 200 with best_epoch = 195 and best_val_0_accuracy = 0.6875




Stop training because you reached max_epochs = 200 with best_epoch = 187 and best_val_0_accuracy = 0.6619




Stop training because you reached max_epochs = 200 with best_epoch = 194 and best_val_0_accuracy = 0.66726




Stop training because you reached max_epochs = 200 with best_epoch = 198 and best_val_0_accuracy = 0.67054





Early stopping occurred at epoch 169 with best_epoch = 149 and best_val_0_accuracy = 0.61637





Early stopping occurred at epoch 169 with best_epoch = 149 and best_val_0_accuracy = 0.58631





Early stopping occurred at epoch 171 with best_epoch = 151 and best_val_0_accuracy = 0.57917





Early stopping occurred at epoch 181 with best_epoch = 161 and best_val_0_accuracy = 0.59762





Early stopping occurred at epoch 185 with best_epoch = 165 and best_val_0_accuracy = 0.60804





Early stopping occurred at epoch 190 with best_epoch = 170 and best_val_0_accuracy = 0.63036





Early stopping occurred at epoch 193 with best_epoch = 173 and best_val_0_accuracy = 0.62708





Early stopping occurred at epoch 198 with best_epoch = 178 and best_val_0_accuracy = 0.69435




Stop training because you reached max_epochs = 200 with best_epoch = 195 and best_val_0_accuracy = 0.62679




Stop training because you reached max_epochs = 200 with best_epoch = 191 and best_val_0_accuracy = 0.6381

Early stopping occurred at epoch 197 with best_epoch = 177 and best_val_0_accuracy = 0.59851




Stop training because you reached max_epochs = 200 with best_epoch = 199 and best_val_0_accuracy = 0.63958
Stop training because you reached max_epochs = 200 with best_epoch = 190 and best_val_0_accuracy = 0.63452





Early stopping occurred at epoch 177 with best_epoch = 157 and best_val_0_accuracy = 0.62649





Early stopping occurred at epoch 153 with best_epoch = 133 and best_val_0_accuracy = 0.58423





Early stopping occurred at epoch 38 with best_epoch = 18 and best_val_0_accuracy = 0.07351




Stop training because you reached max_epochs = 200 with best_epoch = 192 and best_val_0_accuracy = 0.64762

Early stopping occurred at epoch 37 with best_epoch = 17 and best_val_0_accuracy = 0.07946




Stop training because you reached max_epochs = 200 with best_epoch = 199 and best_val_0_accuracy = 0.63988
Stop training because you reached max_epochs = 200 with best_epoch = 194 and best_val_0_accuracy = 0.66726




Stop training because you reached max_epochs = 200 with best_epoch = 185 and best_val_0_accuracy = 0.6631




Stop training because you reached max_epochs = 200 with best_epoch = 192 and best_val_0_accuracy = 0.63125




Stop training because you reached max_epochs = 200 with best_epoch = 196 and best_val_0_accuracy = 0.62887





Early stopping occurred at epoch 29 with best_epoch = 9 and best_val_0_accuracy = 0.06399





Early stopping occurred at epoch 182 with best_epoch = 162 and best_val_0_accuracy = 0.65238
Stop training because you reached max_epochs = 200 with best_epoch = 189 and best_val_0_accuracy = 0.62411





Early stopping occurred at epoch 176 with best_epoch = 156 and best_val_0_accuracy = 0.62887





Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_accuracy = 0.0875





Early stopping occurred at epoch 182 with best_epoch = 162 and best_val_0_accuracy = 0.61101





Early stopping occurred at epoch 189 with best_epoch = 169 and best_val_0_accuracy = 0.63929




Stop training because you reached max_epochs = 200 with best_epoch = 190 and best_val_0_accuracy = 0.62411

Early stopping occurred at epoch 150 with best_epoch = 130 and best_val_0_accuracy = 0.59673




Stop training because you reached max_epochs = 200 with best_epoch = 197 and best_val_0_accuracy = 0.64911




Stop training because you reached max_epochs = 200 with best_epoch = 199 and best_val_0_accuracy = 0.63423




Stop training because you reached max_epochs = 200 with best_epoch = 197 and best_val_0_accuracy = 0.59345





Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_accuracy = 0.06577





Early stopping occurred at epoch 124 with best_epoch = 104 and best_val_0_accuracy = 0.51875




Stop training because you reached max_epochs = 200 with best_epoch = 194 and best_val_0_accuracy = 0.61607




Stop training because you reached max_epochs = 200 with best_epoch = 191 and best_val_0_accuracy = 0.60804





Early stopping occurred at epoch 167 with best_epoch = 147 and best_val_0_accuracy = 0.62083




Stop training because you reached max_epochs = 200 with best_epoch = 198 and best_val_0_accuracy = 0.61607





Early stopping occurred at epoch 135 with best_epoch = 115 and best_val_0_accuracy = 0.48542

Early stopping occurred at epoch 149 with best_epoch = 129 and best_val_0_accuracy = 0.54881




Stop training because you reached max_epochs = 200 with best_epoch = 187 and best_val_0_accuracy = 0.6372





Early stopping occurred at epoch 41 with best_epoch = 21 and best_val_0_accuracy = 0.10298




Stop training because you reached max_epochs = 200 with best_epoch = 198 and best_val_0_accuracy = 0.61071





Early stopping occurred at epoch 145 with best_epoch = 125 and best_val_0_accuracy = 0.51667





Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_accuracy = 0.0753





Early stopping occurred at epoch 171 with best_epoch = 151 and best_val_0_accuracy = 0.63929




Stop training because you reached max_epochs = 200 with best_epoch = 196 and best_val_0_accuracy = 0.5506




Stop training because you reached max_epochs = 200 with best_epoch = 199 and best_val_0_accuracy = 0.54673




Stop training because you reached max_epochs = 200 with best_epoch = 182 and best_val_0_accuracy = 0.54286




Stop training because you reached max_epochs = 200 with best_epoch = 199 and best_val_0_accuracy = 0.55804





Early stopping occurred at epoch 175 with best_epoch = 155 and best_val_0_accuracy = 0.61607




Stop training because you reached max_epochs = 200 with best_epoch = 186 and best_val_0_accuracy = 0.55595
Stop training because you reached max_epochs = 200 with best_epoch = 198 and best_val_0_accuracy = 0.5872




Stop training because you reached max_epochs = 200 with best_epoch = 191 and best_val_0_accuracy = 0.54405




Stop training because you reached max_epochs = 200 with best_epoch = 197 and best_val_0_accuracy = 0.56815
Stop training because you reached max_epochs = 200 with best_epoch = 186 and best_val_0_accuracy = 0.58155




Stop training because you reached max_epochs = 200 with best_epoch = 195 and best_val_0_accuracy = 0.59256




Stop training because you reached max_epochs = 200 with best_epoch = 196 and best_val_0_accuracy = 0.65446




Stop training because you reached max_epochs = 200 with best_epoch = 181 and best_val_0_accuracy = 0.65208




Stop training because you reached max_epochs = 200 with best_epoch = 186 and best_val_0_accuracy = 0.63988




Stop training because you reached max_epochs = 200 with best_epoch = 192 and best_val_0_accuracy = 0.69643





Early stopping occurred at epoch 197 with best_epoch = 177 and best_val_0_accuracy = 0.66994





Early stopping occurred at epoch 197 with best_epoch = 177 and best_val_0_accuracy = 0.67083





Early stopping occurred at epoch 169 with best_epoch = 149 and best_val_0_accuracy = 0.49851





Early stopping occurred at epoch 158 with best_epoch = 138 and best_val_0_accuracy = 0.50774





Early stopping occurred at epoch 163 with best_epoch = 143 and best_val_0_accuracy = 0.61637





Early stopping occurred at epoch 197 with best_epoch = 177 and best_val_0_accuracy = 0.62619




Stop training because you reached max_epochs = 200 with best_epoch = 190 and best_val_0_accuracy = 0.64613




Stop training because you reached max_epochs = 200 with best_epoch = 192 and best_val_0_accuracy = 0.6625




Stop training because you reached max_epochs = 200 with best_epoch = 199 and best_val_0_accuracy = 0.65982





Early stopping occurred at epoch 178 with best_epoch = 158 and best_val_0_accuracy = 0.55




Stop training because you reached max_epochs = 200 with best_epoch = 199 and best_val_0_accuracy = 0.51458




Stop training because you reached max_epochs = 200 with best_epoch = 192 and best_val_0_accuracy = 0.66548
Stop training because you reached max_epochs = 200 with best_epoch = 188 and best_val_0_accuracy = 0.6372




Stop training because you reached max_epochs = 200 with best_epoch = 194 and best_val_0_accuracy = 0.56637




Stop training because you reached max_epochs = 200 with best_epoch = 192 and best_val_0_accuracy = 0.66458




Stop training because you reached max_epochs = 200 with best_epoch = 199 and best_val_0_accuracy = 0.5872




Stop training because you reached max_epochs = 200 with best_epoch = 190 and best_val_0_accuracy = 0.56815




Stop training because you reached max_epochs = 200 with best_epoch = 199 and best_val_0_accuracy = 0.56429




Stop training because you reached max_epochs = 200 with best_epoch = 185 and best_val_0_accuracy = 0.58214




Stop training because you reached max_epochs = 200 with best_epoch = 191 and best_val_0_accuracy = 0.54018




Stop training because you reached max_epochs = 200 with best_epoch = 186 and best_val_0_accuracy = 0.64548
Best parameters: {'optimizer_params': {'lr': 0.01}, 'n_steps': 3, 'n_d': 8, 'n_a': 16, 'lambda_sparse': 0.0001, 'gamma': 1.0}
Best accuracy: 0.575
epoch 0  | loss: 3.95394 |  0:00:00s
epoch 1  | loss: 3.59362 |  0:00:00s
epoch 2  | loss: 3.46213 |  0:00:00s
epoch 3  | loss: 3.40159 |  0:00:00s
epoch 4  | loss: 3.35781 |  0:00:00s
epoch 5  | loss: 3.3163  |  0:00:00s
epoch 6  | loss: 3.28717 |  0:00:00s
epoch 7  | loss: 3.25293 |  0:00:00s
epoch 8  | loss: 3.22941 |  0:00:00s
epoch 9  | loss: 3.19778 |  0:00:00s
epoch 10 | loss: 3.17674 |  0:00:00s
epoch 11 | loss: 3.13605 |  0:00:01s
epoch 12 | loss: 3.11012 |  0:00:01s
epoch 13 | loss: 3.08474 |  0:00:01s
epoch 14 | loss: 3.04378 |  0:00:01s
epoch 15 | loss: 3.01169 |  0:00:01s
epoch 16 | loss: 2.98146 |  0:00:01s
epoch 17 | loss: 2.94835 |  0:00:01s
epoch 18 | loss: 2.91861 |  0:00:01s
epoch 19 | loss: 2.88897 |  0:00:01s
epoch 

In [11]:
_, _, _, _, _, le = split_data(traindata, testdata)


In [12]:
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

file_path = os.path.join("results", f"gtd{partition}.txt")

# Predict class indices for test set
y_pred = final_model.predict(Xtest)
y_pred_decoded = le.inverse_transform(y_pred)
y_true_decoded = le.inverse_transform(Ytest)

# Make sure the directory exists
os.makedirs("results", exist_ok=True)

# Compute accuracy from decoded labels
acc = accuracy_score(y_true_decoded, y_pred_decoded)

# Write metrics to file
with open(file_path, "w") as file:
    file.write(f"Accuracy: {acc:.4f}\n")
    file.write(f"Precision: {precision_score(y_true_decoded, y_pred_decoded, average='weighted'):.4f}\n")
    file.write(f"Recall: {recall_score(y_true_decoded, y_pred_decoded, average='weighted'):.4f}\n")
    file.write(f"F1 Score: {f1_score(y_true_decoded, y_pred_decoded, average='weighted'):.4f}\n")


In [13]:
print(classification_report(Ytest_decoded, y_pred_decoded))

                                                  precision    recall  f1-score   support

                          Abu Sayyaf Group (ASG)       0.16      0.30      0.21        60
        African National Congress (South Africa)       0.48      0.73      0.58        60
                                Al-Qaida in Iraq       0.32      0.43      0.37        60
        Al-Qaida in the Arabian Peninsula (AQAP)       0.20      0.25      0.22        60
                                      Al-Shabaab       0.22      0.22      0.22        60
             Basque Fatherland and Freedom (ETA)       0.44      0.85      0.58        60
                                      Boko Haram       0.32      0.27      0.29        60
  Communist Party of India - Maoist (CPI-Maoist)       0.58      0.37      0.45        60
       Corsican National Liberation Front (FLNC)       0.44      0.68      0.54        60
                       Donetsk People's Republic       0.49      0.45      0.47        60
Farabundo

In [14]:
def plot_confusion_matrix(y_true, y_pred, labels):
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_normalized = cm.astype('float') / cm.sum(axis=1, keepdims=True)

    plt.figure(figsize=(18, 16))
    sns.heatmap(cm_normalized,
                annot=True,
                fmt=".2f",
                xticklabels=labels,
                yticklabels=labels,
                cmap="viridis",
                square=True,
                linewidths=0.5,
                cbar_kws={"shrink": 0.8})

    plt.title(f"Normalized Confusion Matrix", fontsize=18)
    plt.xlabel("Predicted Label", fontsize=14)
    plt.ylabel("True Label", fontsize=14)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Save the figure
    save_path = f"results/confusion_matrix_partition_{partition}.png"
    plt.savefig(save_path, dpi=300)
    plt.close()

    print(f"Saved confusion matrix for partition {partition} to {save_path}")


In [15]:
# Get all unique class labels from the truths
class_labels = np.unique(Ytest_decoded)

plot_confusion_matrix(Ytest_decoded, y_pred_decoded, labels=class_labels)



Saved confusion matrix for partition 200 to results/confusion_matrix_partition_200.png
