In [1]:
import warnings
warnings.filterwarnings( 'ignore' )
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor


In [2]:
trainpath = '../../CleanPartitions/trainp3.csv'
testpath = '../../CleanPartitions/testp3.csv'

traindata = pd.read_csv(trainpath, encoding='ISO-8859-1')
testdata = pd.read_csv(testpath, encoding='ISO-8859-1')

if 'attack_date' in traindata.columns:
    traindata = traindata.drop(columns=['attack_date'])

if 'attack_date' in testdata.columns:
    testdata = testdata.drop(columns=['attack_date'])

    print(f'shape train data: ', traindata.shape)
    print(f'shape test data: ', testdata.shape)

shape train data:  (1061, 25)
shape test data:  (452, 25)


In [3]:
def split_data(dftrain, dftest):
    Ytrain = dftrain['gname']
    Xtrain = dftrain.drop(columns=['gname'])
    Ytest = dftest['gname']
    Xtest = dftest.drop(columns=['gname'])
    return Xtrain, Ytrain, Xtest, Ytest

from itertools import product

def find_best_tabnet(Xtrain, Ytrain):
    param_grid = {
        'n_d': [8, 16, 24],
        'n_a': [8, 16, 24],
        'n_steps': [3, 4, 5],
        'gamma': [1.0, 1.3, 1.5],
        'lambda_sparse': [1e-4, 1e-3],
        'lr': [0.005, 0.01, 0.02],  # default is 0.02
    }


    param_combos = list(product(*param_grid.values()))
    param_names = list(param_grid.keys())

    tscv = TimeSeriesSplit(n_splits=5)
    best_score = 0
    best_model = None
    best_params = None

    for combo in param_combos:
        params = dict(zip(param_names, combo))
        print(f"Trying params: {params}")
        scores = []

        for train_idx, val_idx in tscv.split(Xtrain):
            X_tr, X_val = Xtrain.iloc[train_idx].values, Xtrain.iloc[val_idx].values
            y_tr, y_val = Ytrain.iloc[train_idx].values, Ytrain.iloc[val_idx].values

            model = TabNetClassifier(
                n_d=params['n_d'],
                n_a=params['n_a'],
                n_steps=params['n_steps'],
                gamma=params['gamma'],
                lambda_sparse=params['lambda_sparse'],
                optimizer_params=dict(lr=params['lr']),
                verbose=0,
                seed=42
            )

            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                eval_metric=['accuracy'],
                patience=10,
                max_epochs=100,
                batch_size=1024,
                virtual_batch_size=128
            )

            preds = model.predict(X_val)
            acc = accuracy_score(y_val, preds)
            scores.append(acc)

        avg_score = np.mean(scores)
        print(f"CV accuracy: {avg_score:.4f}")

        if avg_score > best_score:
            best_score = avg_score
            best_model = model
            best_params = params

    print(f"\nBest TabNet params: {best_params}, accuracy: {best_score:.4f}")
    return best_params

In [4]:
#split 70/30
Xtrain, Ytrain, Xtest, Ytest = split_data(traindata, testdata)

#use split further so we get 70/15/15
idx = int(len(Xtest)/2)
Xval = Xtest.iloc[:idx]
Yval = Ytest.iloc[:idx]
Xtest = Xtest.iloc[idx:]
Ytest = Ytest.iloc[idx:]

print(Xtest.shape)
best_params = find_best_tabnet(Xtrain, Ytrain)

# Retrain best model on full training set
best_model = TabNetClassifier(
    n_d=best_params['n_d'],
    n_a=best_params['n_a'],
    n_steps=best_params['n_steps'],
    optimizer_params=dict(lr=best_params['lr']),
    gamma=best_params['gamma'],
    lambda_sparse=best_params['lambda_sparse'],
    verbose=0,
    seed=42
)

best_model.fit(
    Xtrain.values, Ytrain.values,
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

preds = best_model.predict(Xtest.values)
acc = accuracy_score(Ytest, preds)
print(f"Test Accuracy: {acc:.4f}")
#clf = TabNetClassifier()  #TabNetRegressor()
#clf.fit(
#  Xtrain.values, Ytrain.values,
#  eval_set=[(Xval.values, Yval.values)]
#)
#preds = clf.predict(Xtest.values)

#best_mlp = find_best_mlp(Xtrain, Ytrain)
#y_pred_mlp = best_mlp.predict(Xtest)
#accuracy_mlp = accuracy_score(Ytest, y_pred_mlp)
#print(f"Accuracy: {accuracy_mlp * 100:.2f}%")

(226, 24)
Trying params: {'n_d': 8, 'n_a': 8, 'n_steps': 3, 'gamma': 1.0, 'lambda_sparse': 0.0001, 'lr': 0.005}

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_accuracy = 0.49432

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_accuracy = 0.39773

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_accuracy = 0.22727

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_accuracy = 0.40909

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_accuracy = 0.84091
CV accuracy: 0.4739
Trying params: {'n_d': 8, 'n_a': 8, 'n_steps': 3, 'gamma': 1.0, 'lambda_sparse': 0.0001, 'lr': 0.01}

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_accuracy = 0.49432

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_accuracy = 0.39773

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_accuracy = 0.22727

Early stopping occurred at epoch 10 wi

In [5]:
accuracy_mlp = accuracy_score(Ytest, preds)
accuracy_mlp


0.9336283185840708

In [6]:
#print(best_mlp)