In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [2]:
#Load & Preprocess UNSW-NB15_4 Dataset
df = pd.read_csv('C:/Users/Artophilic/Datascience Bootcamp/Network_research/Dataset/UNSW-NB15_4.csv')

In [3]:
df.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [4]:
df = df.drop(columns=["proto","service","state","attack_cat"])

In [5]:
df.isnull().sum()

id                   0
dur                  0
spkts                0
dpkts                0
sbytes               0
dbytes               0
rate                 0
sttl                 0
dttl                 0
sload                0
dload                0
sloss                0
dloss                0
sinpkt               0
dinpkt               0
sjit                 0
djit                 0
swin                 0
stcpb                0
dtcpb                0
dwin                 0
tcprtt               0
synack               0
ackdat               0
smean                0
dmean                0
trans_depth          0
response_body_len    0
ct_srv_src           0
ct_state_ttl         0
ct_dst_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
is_ftp_login         0
ct_ftp_cmd           0
ct_flw_http_mthd     0
ct_src_ltm           0
ct_srv_dst           0
is_sm_ips_ports      0
label                0
dtype: int64

In [6]:
#Splitting the data into independent and dependent data
X = df.drop(columns=['label']).copy()
X.head()
y = df['label'].copy()
y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [7]:
#Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,stratify=y)

In [8]:
# Convert to binary labels: 1 for normal, -1 for anomaly
y = y.replace({0: 1, 1: -1})

In [9]:
#Standardizing the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
from sklearn.preprocessing import KBinsDiscretizer

# Use KBinsDiscretizer to bin continuous values
binner = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')  # or 'quantile'
X_train_bin = binner.fit_transform(X_train)
X_test_bin = binner.transform(X_test)

X_train_bin = X_train_bin.astype(int)
X_test_bin = X_test_bin.astype(int)

In [12]:
#Defining number of batches to pu forward in partial fitting
n_chunks = 50
chunk_size = len(X_train_bin) // n_chunks 

In [13]:
#Breaking into chunks
X_chunks = [X_train_bin[i:i + chunk_size] for i in range(0, len(X_train_bin), chunk_size)]
y_chunks = [y_train.values[i:i + chunk_size] for i in range(0, len(X_train_bin), chunk_size)]

In [14]:
#Creating model
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [15]:
# Initialize model with a loss for classification
online_model = SGDClassifier(loss='log_loss', max_iter=1, warm_start=True, random_state=42)

In [16]:
#First we need to pass the classes to partial_fit
classes = np.unique(y_train)

In [17]:
#Fitting loop
for i in range(len(X_chunks)):
    if i == 0:
        online_model.partial_fit(X_chunks[i], y_chunks[i], classes=classes)
    else:
        online_model.partial_fit(X_chunks[i], y_chunks[i])

    # tracking accuracy on test set during streaming
    if (i + 1) % 10 == 0:
        y_pred = online_model.predict(X_test_bin)
        acc = accuracy_score(y_test, y_pred)
        print(f"After chunk {i + 1}/{n_chunks} → Accuracy: {acc:.4f}")

After chunk 10/50 → Accuracy: 0.9254
After chunk 20/50 → Accuracy: 0.9023
After chunk 30/50 → Accuracy: 0.9095
After chunk 40/50 → Accuracy: 0.9085
After chunk 50/50 → Accuracy: 0.9083


In [None]:
#Final evaluation
final_pred = online_model.predict(X_test_bin)
print("\nFinal Classification Report:\n", classification_report(y_test, final_pred))