In [1]:
''' 
Inspired by the paper Boosting over non-deterministic ZDDs” 
by Takahiro Fujita, Kohei Hatano, and Eiji Takimoto.

Trying to implement the approach in the XGBoost version implemented earlier
'''

' \nInspired by the paper Boosting over non-deterministic ZDDs” \nby Takahiro Fujita, Kohei Hatano, and Eiji Takimoto.\n\nTrying to implement the approach in the XGBoost version implemented earlier\n'

In [3]:
#Importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [5]:
#Load & Preprocess UNSW-NB15_4 Dataset
df = pd.read_csv('C:/Users/Artophilic/Datascience Bootcamp/Network_research/Dataset/UNSW-NB15_4.csv')

In [6]:
df.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [7]:
df = df.drop(columns=["proto","service","state","attack_cat"])

In [8]:
df.head()

Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
0,1,0.121478,6,4,258,172,74.08749,252,254,14158.94238,...,1,1,1,0,0,0,1,1,0,0
1,2,0.649902,14,38,734,42014,78.473372,62,252,8395.112305,...,1,1,2,0,0,0,1,6,0,0
2,3,1.623129,8,16,364,13186,14.170161,62,252,1572.271851,...,1,1,3,0,0,0,2,6,0,0
3,4,1.681642,12,12,628,770,13.677108,62,252,2740.178955,...,1,1,3,1,1,0,2,1,0,0
4,5,0.449454,10,6,534,268,33.373826,254,252,8561.499023,...,2,1,40,0,0,0,2,39,0,0


In [9]:
df.isnull().sum()

id                   0
dur                  0
spkts                0
dpkts                0
sbytes               0
dbytes               0
rate                 0
sttl                 0
dttl                 0
sload                0
dload                0
sloss                0
dloss                0
sinpkt               0
dinpkt               0
sjit                 0
djit                 0
swin                 0
stcpb                0
dtcpb                0
dwin                 0
tcprtt               0
synack               0
ackdat               0
smean                0
dmean                0
trans_depth          0
response_body_len    0
ct_srv_src           0
ct_state_ttl         0
ct_dst_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
is_ftp_login         0
ct_ftp_cmd           0
ct_flw_http_mthd     0
ct_src_ltm           0
ct_srv_dst           0
is_sm_ips_ports      0
label                0
dtype: int64

In [10]:
df.dtypes

id                     int64
dur                  float64
spkts                  int64
dpkts                  int64
sbytes                 int64
dbytes                 int64
rate                 float64
sttl                   int64
dttl                   int64
sload                float64
dload                float64
sloss                  int64
dloss                  int64
sinpkt               float64
dinpkt               float64
sjit                 float64
djit                 float64
swin                   int64
stcpb                  int64
dtcpb                  int64
dwin                   int64
tcprtt               float64
synack               float64
ackdat               float64
smean                  int64
dmean                  int64
trans_depth            int64
response_body_len      int64
ct_srv_src             int64
ct_state_ttl           int64
ct_dst_ltm             int64
ct_src_dport_ltm       int64
ct_dst_sport_ltm       int64
ct_dst_src_ltm         int64
is_ftp_login  

In [11]:
#Splitting the data into independent and dependent data

X = df.drop(columns=['label']).copy()
X.head()


Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,1,0.121478,6,4,258,172,74.08749,252,254,14158.94238,...,1,1,1,1,0,0,0,1,1,0
1,2,0.649902,14,38,734,42014,78.473372,62,252,8395.112305,...,1,1,1,2,0,0,0,1,6,0
2,3,1.623129,8,16,364,13186,14.170161,62,252,1572.271851,...,2,1,1,3,0,0,0,2,6,0
3,4,1.681642,12,12,628,770,13.677108,62,252,2740.178955,...,2,1,1,3,1,1,0,2,1,0
4,5,0.449454,10,6,534,268,33.373826,254,252,8561.499023,...,2,2,1,40,0,0,0,2,39,0


In [12]:
y = df['label'].copy()
y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [13]:
#Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,stratify=y)

In [14]:
# Convert to binary labels: 1 for normal, -1 for anomaly
y = y.replace({0: 1, 1: -1})

In [16]:
#Standardizing the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#NZDD (Non-Deterministic Zero-Suppressed Binary Decision Diagram) structure

In [18]:
''' 
convert numerical features into binary values (e.g., using thresholding or binning) 
to simulate decisions (as ZDDs are boolean-based).
'''
from sklearn.preprocessing import KBinsDiscretizer

# Use KBinsDiscretizer to bin continuous values
binner = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')  # or 'quantile'
X_train_bin = binner.fit_transform(X_train)
X_test_bin = binner.transform(X_test)

X_train_bin = X_train_bin.astype(int)
X_test_bin = X_test_bin.astype(int)


In [None]:
#Building path based structure like NZDD
from collections import defaultdict

def build_nzdd_structure(X_bin, y):
    nzdd = defaultdict(list)
    for i, row in enumerate(X_bin):
        path = tuple(row)  # Simulate path as a tuple of binary features
        nzdd[path].append(i)  # Append sample index to that path
    return nzdd

nzdd_train = build_nzdd_structure(X_train_bin, y_train.values)

In [20]:
nzdd_train

defaultdict(list,
            {(np.int64(1),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(1),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
              np.int64(0),
          

In [24]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import time

# Use decision stumps as weak learners
base_estimator = DecisionTreeClassifier(max_depth=1)

# Initialize AdaBoost
ada_model = AdaBoostClassifier(n_estimators=50, random_state=42,estimator=DecisionTreeClassifier(max_depth=1))

# Time the training
start = time.time()
ada_model.fit(X_train_bin, y_train)
end = time.time()
training_time = end - start

# Predictions
y_pred = ada_model.predict(X_test_bin)

# Evaluation
print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Training Time: {training_time:.2f} seconds")


Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.80      0.84     14000
           1       0.91      0.94      0.93     29836

    accuracy                           0.90     43836
   macro avg       0.89      0.87      0.88     43836
weighted avg       0.90      0.90      0.90     43836

Accuracy: 0.8996
Training Time: 3.26 seconds


In [None]:
#The recall for anomaly is high 0.94