In [1]:
import numpy as np
import pandas as pd
import torch
import random
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
from pytorch_tabnet.tab_model import TabNetClassifier
import joblib
import json
import pickle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

def reproducibility_establishment(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

seed_pi = 314159
reproducibility_establishment(seed_value=seed_pi)

df_data_1 = pd.read_parquet(r'C:\Users\hos\Downloads\osiriswatch\dataset\Benign-Monday-no-metadata.parquet', engine='pyarrow')
df_data_2 = pd.read_parquet(r'C:\Users\hos\Downloads\osiriswatch\dataset\Botnet-Friday-no-metadata.parquet', engine='pyarrow')
df_data_3 = pd.read_parquet(r'C:\Users\hos\Downloads\osiriswatch\dataset\Bruteforce-Tuesday-no-metadata.parquet', engine='pyarrow')
df_data_4 = pd.read_parquet(r'C:\Users\hos\Downloads\osiriswatch\dataset\DDoS-Friday-no-metadata.parquet', engine='pyarrow')
df_data_5 = pd.read_parquet(r'C:\Users\hos\Downloads\osiriswatch\dataset\DoS-Wednesday-no-metadata.parquet', engine='pyarrow')
df_data_6 = pd.read_parquet(r'C:\Users\hos\Downloads\osiriswatch\dataset\Infiltration-Thursday-no-metadata.parquet', engine='pyarrow')
df_data_7 = pd.read_parquet(r'C:\Users\hos\Downloads\osiriswatch\dataset\Portscan-Friday-no-metadata.parquet', engine='pyarrow')
df_data_8 = pd.read_parquet(r'C:\Users\hos\Downloads\osiriswatch\dataset\WebAttacks-Thursday-no-metadata.parquet', engine='pyarrow')

df_data = pd.concat([df_data_1, df_data_2, df_data_3, df_data_4, 
                     df_data_5, df_data_6, df_data_7, df_data_8], axis=0, ignore_index=True)

df_data.dropna(inplace=True)
df_data.drop_duplicates(inplace=True)
df_data.reset_index(drop=True, inplace=True)

print("Categorical columns:", df_data.select_dtypes(include=['object']).columns.tolist(), '\n')
print("Shape of Dataframe: ", df_data.shape, '\n')
print('Inspection of Target Feature - Label:\n')
print(df_data['Label'].value_counts())


Device: cpu
Categorical columns: ['Label'] 

Shape of Dataframe:  (2231806, 78) 

Inspection of Target Feature - Label:

Label
Benign                        1895314
DoS Hulk                       172846
DDoS                           128014
DoS GoldenEye                   10286
FTP-Patator                      5931
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
PortScan                         1956
Web Attack � Brute Force         1470
Bot                              1437
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64


In [2]:
label_encoder = LabelEncoder()
df_data['Label'] = df_data['Label'].astype(str)
y = label_encoder.fit_transform(df_data['Label'])
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)

X = df_data.drop('Label', axis=1)

def extractAllSets(X, y, p_train, p_val, p_test, random_state, shuffle=True):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, stratify=y, test_size=(1.0 - p_train), random_state=random_state, shuffle=shuffle)
    fraction = p_test / (p_val + p_test)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, stratify=y_temp, test_size=fraction, random_state=random_state, shuffle=shuffle)
    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = extractAllSets(X, y, 0.75, 0.10, 0.15, seed_pi)

def r_scale(X_train, X_val, X_test):
    scaler = RobustScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    return X_train_scaled, X_val_scaled, X_test_scaled, scaler

X_train_r, X_val_r, X_test_r, scaler = r_scale(X_train, X_val, X_test)
X_train, X_val, X_test = X_train_r, X_val_r, X_test_r

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = {cls: weight for cls, weight in zip(classes, class_weights)}
print("Class Weights:", class_weight_dict)

clf_params = dict(
    n_d=77,
    n_a=77,
    n_steps=5,
    gamma=1.85,
    cat_idxs=[],
    cat_dims=[],
    cat_emb_dim=[],
    n_independent=2,
    n_shared=2,
    epsilon=1e-15,
    momentum=0.02,
    lambda_sparse=0.001,
    seed=314159,
    clip_value=4.5,
    verbose=0,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 0.01},
    scheduler_fn=torch.optim.lr_scheduler.ExponentialLR,
    scheduler_params={'verbose': False, 'gamma': 0.9},
    mask_type='sparsemax',
    input_dim=X_train.shape[1],
    output_dim=len(np.unique(y_train)),
    device_name=device,
    n_shared_decoder=1,
    n_indep_decoder=1,
    grouped_features=[]
)


Label Mapping: {'Benign': np.int64(0), 'Bot': np.int64(1), 'DDoS': np.int64(2), 'DoS GoldenEye': np.int64(3), 'DoS Hulk': np.int64(4), 'DoS Slowhttptest': np.int64(5), 'DoS slowloris': np.int64(6), 'FTP-Patator': np.int64(7), 'Heartbleed': np.int64(8), 'Infiltration': np.int64(9), 'PortScan': np.int64(10), 'SSH-Patator': np.int64(11), 'Web Attack � Brute Force': np.int64(12), 'Web Attack � Sql Injection': np.int64(13), 'Web Attack � XSS': np.int64(14)}
Class Weights: {np.int64(0): np.float64(0.07850259880805402), np.int64(1): np.float64(103.51601731601731), np.int64(2): np.float64(1.1622775405339723), np.int64(3): np.float64(14.464065672931518), np.int64(4): np.float64(0.8608101783996996), np.int64(5): np.float64(28.45964464847403), np.int64(6): np.float64(27.628191796649336), np.int64(7): np.float64(25.087739808153476), np.int64(8): np.float64(13948.783333333333), np.int64(9): np.float64(4132.972839506173), np.int64(10): np.float64(76.06698477618723), np.int64(11): np.float64(46.22629

In [3]:

clf = TabNetClassifier(**clf_params)

fit_params = dict(
    X_train=X_train.values,
    y_train=y_train,
    eval_set=[(X_val.values, y_val)],
    eval_name=['Validation'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=16384,
    virtual_batch_size=1024,
    num_workers=0,
    weights=class_weight_dict,
    drop_last=False,
    pin_memory=True
)

clf.fit(**fit_params)





Early stopping occurred at epoch 22 with best_epoch = 12 and best_Validation_accuracy = 0.90968




In [4]:
clf_file_path = 'tabnet_clf.zip'
torch.save(clf, clf_file_path)
print("TabNet Classifier model 'clf' saved successfully!")


TabNet Classifier model 'clf' saved successfully!


In [13]:
import torch

save_folder = 'saved_model' 
clf_0_file_path = os.path.join(save_folder, 'tabnet_clf.zip')
clf_0 = torch.load(clf_0_file_path)
print("Trained model loaded successfully!")

  clf_0 = torch.load(clf_0_file_path)


Trained model loaded successfully!


In [14]:
input_dim = X_train.shape[1]  
output_dim = len(np.unique(y_train)) 

clf_params['input_dim'] = input_dim
clf_params['output_dim'] = output_dim

In [15]:
from pytorch_tabnet.tab_model import TabNetClassifier

new_clf_0 = TabNetClassifier(**clf_params)
print("New TabNetClassifier instance created with correct parameters.")

New TabNetClassifier instance created with correct parameters.


In [16]:
input_dim = X_train.shape[1] 
output_dim = len(np.unique(y_train))  

clf_params['input_dim'] = input_dim
clf_params['output_dim'] = output_dim

clf_params.pop('device_name', None)

from pytorch_tabnet.tab_model import TabNetClassifier

new_clf_0 = TabNetClassifier(**clf_params)
print("New TabNetClassifier instance created with correct parameters.")

New TabNetClassifier instance created with correct parameters.


In [17]:
new_clf_0._set_network()
print("Network is initialized in new_clf_0.")
if not hasattr(clf, 'network'):
    clf_file_path = os.path.join(save_folder, 'tabnet_clf_0.zip')
    clf = torch.load(clf_file_path)
    print("Trained model loaded successfully.")

if hasattr(clf, 'network'):
    print("Network attribute found in clf_0.")
    network = clf.network
else:
    raise AttributeError("No network attribute found in the trained model.")

new_clf_0.network.load_state_dict(network.state_dict())
print("State dictionary loaded into the new model.")

Network is initialized in new_clf_0.
Network attribute found in clf_0.
State dictionary loaded into the new model.


In [18]:
new_clf_0.preds_mapper = clf.preds_mapper
new_clf_0.classes_ = clf.classes_
print("Copied preds_mapper and classes_ to new_clf_0.")

Copied preds_mapper and classes_ to new_clf_0.


In [19]:
clf_0_file_path_new = os.path.join(save_folder, 'tabnet_clf_0')

os.makedirs(save_folder, exist_ok=True)

new_clf_0.save_model(clf_0_file_path_new)
print("TabNet Classifier model saved successfully!")

Successfully saved model at saved_model\tabnet_clf_0.zip
TabNet Classifier model saved successfully!


In [20]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_data['Label'] = df_data['Label'].astype(str)  
y_encoded = label_encoder.fit_transform(df_data['Label'])


In [21]:
import json
import os

save_folder = 'saved_model' 
os.makedirs(save_folder, exist_ok=True)

label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

label_mapping = {str(k): int(v) for k, v in label_mapping.items()}

label_mapping_file_path = os.path.join(save_folder, 'label_mapping.json')
with open(label_mapping_file_path, 'w') as f:
    json.dump(label_mapping, f)

print("Label mapping saved successfully!")

scaler_file_path = os.path.join(save_folder, 'scaler.joblib')
joblib.dump(scaler, scaler_file_path)
print("Scaler saved successfully!")

feature_names = list(X_train.columns)
if 'Label' in feature_names:
    feature_names.remove('Label')
    
import json

with open('saved_model/feature_names.json', 'w') as f:
    json.dump(feature_names, f)

clf_params_file_path = os.path.join(save_folder, 'clf_params.pkl')
with open(clf_params_file_path, 'wb') as f:
    pickle.dump(clf_params, f)
print("Classifier parameters saved successfully!")


Label mapping saved successfully!
Scaler saved successfully!
Classifier parameters saved successfully!


In [22]:
label_encoder_file_path = os.path.join(save_folder, 'label_encoder.pkl')
with open(label_encoder_file_path, 'wb') as f:
    pickle.dump(label_encoder, f)
print("LabelEncoder saved successfully!")

LabelEncoder saved successfully!


In [24]:
fit_params_file_path = os.path.join(save_folder, 'fit_params.pkl')
with open(fit_params_file_path, 'wb') as f:
    pickle.dump(fit_params, f)
print("Fit parameters saved successfully!")

from sklearn.metrics import classification_report, confusion_matrix

y_pred = clf.predict(X_test.values)

y_pred_labels = label_encoder.inverse_transform(y_pred)
y_test_labels = label_encoder.inverse_transform(y_test)

print("Classification Report:")
print(classification_report(y_test_labels, y_pred_labels))

print("Confusion Matrix:")
cm = confusion_matrix(y_test_labels, y_pred_labels)
print(cm)

Fit parameters saved successfully!
Classification Report:
                            precision    recall  f1-score   support

                    Benign       1.00      0.90      0.95    284298
                       Bot       0.05      0.82      0.10       215
                      DDoS       0.98      0.93      0.95     19203
             DoS GoldenEye       0.72      0.99      0.83      1543
                  DoS Hulk       0.87      0.98      0.93     25927
          DoS Slowhttptest       0.56      0.98      0.71       784
             DoS slowloris       0.51      0.95      0.66       808
               FTP-Patator       0.23      0.99      0.37       890
                Heartbleed       0.50      1.00      0.67         2
              Infiltration       0.01      0.80      0.01         5
                  PortScan       0.09      0.76      0.16       293
               SSH-Patator       0.35      0.93      0.51       483
  Web Attack � Brute Force       0.00      0.17      0.01