In [1]:
import warnings
warnings.filterwarnings( 'ignore' )
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
import sys

In [2]:
# Partition number for this run; it is interpolated into the train/test CSV
# paths and into the names of the result files written further down.
partition = 478

In [3]:
!{sys.executable} -m pip install pytorch-tabnet --upgrade

Collecting pytorch-tabnet
  Using cached pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Using cached pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0

[notice] A new release of pip is available: 24.1.2 -> 25.1
[notice] To update, run: pip install --upgrade pip


In [4]:
# Locations of the train/test CSVs that belong to the selected partition.
trainpath = f'../../../../data/top30groups/DNDF_OneHotLongLatCombined/train/train{partition}.csv'
testpath = f'../../../../data/top30groups/DNDF_OneHotLongLatCombined/test/test{partition}.csv'

# Both files are read the same way, so load them in a single pass.
traindata, testdata = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (trainpath, testpath)
)

In [5]:
import torch
# Report whether PyTorch can see a CUDA device before any training starts.
print(torch.cuda.is_available())

True


In [6]:
import torch
# Ask PyTorch to release the GPU memory it is caching but not currently using.
torch.cuda.empty_cache()


In [7]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

def split_data(dftrain, dftest):
    """Split the train/test frames into feature matrices and encoded labels.

    Every column except ``gname`` is treated as a feature and cast to float.
    ``gname`` is the target; it is integer-encoded with a ``LabelEncoder``
    that is fitted on the training labels only.

    The test features are selected by the training frame's feature column
    names, so both matrices share the same column order even when the two
    CSVs list their columns differently. Columns that exist only in the test
    frame are ignored.

    Parameters
    ----------
    dftrain, dftest : pandas.DataFrame
        Frames holding the feature columns plus a ``gname`` column.

    Returns
    -------
    tuple
        ``(Xtrain, Ytrain, Xtest, Ytest, y_true_decoded, le)``: the float
        feature matrices, the encoded label arrays, the test labels mapped
        back through the encoder, and the fitted encoder itself.

    Raises
    ------
    KeyError
        If ``gname`` is missing from either frame, or if ``dftest`` lacks one
        of the training feature columns.
    """
    train_features = dftrain.drop(columns=['gname'])
    feature_names = list(train_features.columns)

    Xtrain = train_features.values.astype(float)
    Ytrain = dftrain['gname'].values
    # Select by name rather than by position so a reordered test CSV cannot
    # silently feed the model misaligned features.
    Xtest = dftest[feature_names].values.astype(float)
    Ytest = dftest['gname'].values

    # Fit on the training labels only; the test labels reuse that mapping.
    le = LabelEncoder()
    Ytrain = le.fit_transform(Ytrain)
    Ytest = le.transform(Ytest)

    # Round-trip the test labels so callers get them in the encoder's own
    # representation (useful for reports next to decoded predictions).
    y_true_decoded = le.inverse_transform(Ytest)

    return Xtrain, Ytrain, Xtest, Ytest, y_true_decoded, le



In [8]:
#from pytorch_tabnet.sklearn import TabNetClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor


def find_best_tabnet(Xtrain, Ytrain, n_iter=20, fit_params=None, n_jobs=-1):
    """Run a randomized hyperparameter search for a TabNet classifier.

    ``n_iter`` configurations are sampled from a fixed set of TabNet
    hyperparameter values and each one is scored by accuracy with shuffled
    5-fold stratified cross-validation (``random_state=42``).

    Parameters
    ----------
    Xtrain : array-like
        Feature matrix handed to ``RandomizedSearchCV.fit``.
    Ytrain : array-like
        Class labels for ``Xtrain``.
    n_iter : int, default 20
        Number of parameter settings to sample.
    fit_params : dict, optional
        Keyword arguments forwarded to the estimator's ``fit`` for every
        candidate. Entries given here override the built-in defaults
        (``max_epochs=100``, ``batch_size=64``, ``virtual_batch_size=16``,
        ``patience=10``, ``drop_last=False``); omitted entries keep them.
    n_jobs : int, default -1
        Passed to ``RandomizedSearchCV``. NOTE(review): parallel workers
        presumably compete for the same GPU memory, so lowering this may
        help with CUDA OOM -- confirm on the target machine.

    Returns
    -------
    dict
        The ``best_params_`` of the fitted search.
    """
    print("Starting TabNet grid search")
    print("CUDA available:", torch.cuda.is_available())
    model = TabNetClassifier(verbose=0, seed=42)

    param_dist = {
        'n_d': [8, 16, 24],
        'n_a': [8, 16, 24],
        'n_steps': [3, 4, 5],
        'gamma': [1.0, 1.3, 1.5],
        'lambda_sparse': [1e-4, 1e-3, 1e-2],
        'optimizer_params': [{'lr': 0.01}]
    }

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring='accuracy',
        verbose=1,
        n_jobs=n_jobs,
        random_state=42
    )

    # NOTE(review): no eval_set is passed to fit, so `patience` likely has no
    # effect (no metric to stop early on) -- confirm against pytorch-tabnet.
    effective_fit_params = {
        'max_epochs': 100,         # adjust as needed
        'batch_size': 64,          # lower if you get CUDA OOM
        'virtual_batch_size': 16,  # smaller "sub-batches" to reduce memory usage
        'patience': 10,
        'drop_last': False
    }
    if fit_params:
        # Caller-supplied settings win over the defaults above.
        effective_fit_params.update(fit_params)

    random_search.fit(Xtrain, Ytrain, **effective_fit_params)
    print("Best parameters:", random_search.best_params_)
    print("Best accuracy:", random_search.best_score_)

    return random_search.best_params_


In [9]:
# Release PyTorch's cached, unused GPU memory again before the search cell runs.
torch.cuda.empty_cache()

In [10]:
# Build the arrays, then look for the best TabNet hyperparameters.
Xtrain, Ytrain, Xtest, Ytest, Ytest_decoded, le = split_data(traindata, testdata)
best_tabnet_params = find_best_tabnet(Xtrain, Ytrain)

# Fresh TabNet configured with the winning hyperparameters
final_model = TabNetClassifier(verbose=1, seed=42, **best_tabnet_params)

# Training settings for the final fit
fit_params = dict(
    max_epochs=100,         # adjust as needed
    batch_size=64,          # lower if you get CUDA OOM
    virtual_batch_size=16,  # smaller "sub-batches" to reduce memory usage
    patience=10,
    drop_last=False,
)

# Train once more, this time on the complete training set
final_model.fit(Xtrain, Ytrain, **fit_params)



Starting TabNet grid search
CUDA available: True
Fitting 5 folds for each of 20 candidates, totalling 100 fits




Best parameters: {'optimizer_params': {'lr': 0.01}, 'n_steps': 3, 'n_d': 24, 'n_a': 24, 'lambda_sparse': 0.0001, 'gamma': 1.0}
Best accuracy: 0.5263473053892216
epoch 0  | loss: 3.57005 |  0:00:06s
epoch 1  | loss: 3.07908 |  0:00:13s
epoch 2  | loss: 2.84224 |  0:00:19s
epoch 3  | loss: 2.65967 |  0:00:25s
epoch 4  | loss: 2.46656 |  0:00:31s
epoch 5  | loss: 2.28658 |  0:00:36s
epoch 6  | loss: 2.121   |  0:00:43s
epoch 7  | loss: 1.96537 |  0:00:48s
epoch 8  | loss: 1.81837 |  0:00:55s
epoch 9  | loss: 1.69293 |  0:01:01s
epoch 10 | loss: 1.58022 |  0:01:07s
epoch 11 | loss: 1.44674 |  0:01:13s
epoch 12 | loss: 1.31091 |  0:01:19s
epoch 13 | loss: 1.20287 |  0:01:25s
epoch 14 | loss: 1.10642 |  0:01:31s
epoch 15 | loss: 1.02325 |  0:01:37s
epoch 16 | loss: 0.92411 |  0:01:44s
epoch 17 | loss: 0.83722 |  0:01:50s
epoch 18 | loss: 0.79302 |  0:01:56s
epoch 19 | loss: 0.72069 |  0:02:01s
epoch 20 | loss: 0.65682 |  0:02:07s
epoch 21 | loss: 0.60154 |  0:02:12s
epoch 22 | loss: 0.56125 

In [11]:
from sklearn.metrics import accuracy_score
import os

# Encoded class predictions for the test set, plus their decoded labels
y_pred = final_model.predict(Xtest)
y_pred_decoded = le.inverse_transform(y_pred)

# Accuracy is computed on the encoded labels
acc = accuracy_score(Ytest, y_pred)

# Create the output folder if needed, then store the raw accuracy value
os.makedirs("results", exist_ok=True)
file_path = os.path.join("results", f"gtd{partition}.txt")
with open(file_path, "w") as out:
    out.write(str(acc))

In [12]:
import os

# Create the output folder on first use.
# NOTE(review): the previous cell writes to lowercase "results/"; on a
# case-sensitive filesystem that is a different directory -- confirm intended.
os.makedirs("Results", exist_ok=True)

# Store the accuracy as a percentage with two decimals
with open(f'Results/tabnet_{partition}', "w") as report:
    report.write(f"Accuracy: {acc*100:.2f}%\n")

In [13]:
# Per-class precision / recall / F1, computed on the decoded labels
print(classification_report(y_true=Ytest_decoded, y_pred=y_pred_decoded))

                                                  precision    recall  f1-score   support

                          Abu Sayyaf Group (ASG)       0.80      0.42      0.55       144
        African National Congress (South Africa)       0.90      0.72      0.80       144
                                Al-Qaida in Iraq       0.86      0.50      0.63       144
        Al-Qaida in the Arabian Peninsula (AQAP)       0.70      0.61      0.65       144
                                      Al-Shabaab       0.91      0.69      0.78       144
             Basque Fatherland and Freedom (ETA)       0.66      0.66      0.66       144
                                      Boko Haram       0.56      0.43      0.49       144
  Communist Party of India - Maoist (CPI-Maoist)       0.51      0.34      0.41       144
       Corsican National Liberation Front (FLNC)       0.90      0.89      0.89       144
                       Donetsk People's Republic       0.74      0.71      0.72       144
Farabundo

In [14]:
def plot_confusion_matrix(y_true, y_pred, labels, save_path=None):
    """Save a row-normalized confusion-matrix heatmap as a PNG file.

    Each row of the confusion matrix is divided by its total so a cell shows
    the fraction of that true class assigned to each predicted class. Rows
    for labels that never occur in ``y_true`` are left at zero instead of
    producing NaN from a division by zero.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, in the same representation as ``labels``.
    labels : array-like
        Labels that index the matrix; also used as the tick labels.
    save_path : str, optional
        Destination file. Defaults to
        ``Results/confusion_matrix_partition_{partition}.png``, where
        ``partition`` is the notebook-level variable.

    Returns
    -------
    None
        The figure is written to disk and closed; a confirmation is printed.
    """
    import os

    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np

    if save_path is None:
        save_path = f"Results/confusion_matrix_partition_{partition}.png"

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Normalize per true class; skip empty rows to avoid 0/0 -> NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    fig = plt.figure(figsize=(18, 16))
    try:
        sns.heatmap(cm_normalized,
                    annot=True,
                    fmt=".2f",
                    xticklabels=labels,
                    yticklabels=labels,
                    cmap="viridis",
                    square=True,
                    linewidths=0.5,
                    cbar_kws={"shrink": 0.8})

        plt.title("Normalized Confusion Matrix", fontsize=18)
        plt.xlabel("Predicted Label", fontsize=14)
        plt.ylabel("True Label", fontsize=14)
        plt.xticks(rotation=90)
        plt.yticks(rotation=0)
        plt.tight_layout()

        # Do not rely on another cell having created the output directory.
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Save the figure
        fig.savefig(save_path, dpi=300)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

    print(f"Saved confusion matrix for partition {partition} to {save_path}")


In [15]:
# The distinct labels found in the ground truth become the matrix axes
class_labels = np.unique(Ytest_decoded)

plot_confusion_matrix(
    y_true=Ytest_decoded,
    y_pred=y_pred_decoded,
    labels=class_labels,
)



Saved confusion matrix for partition 478 to Results/confusion_matrix_partition_478.png
