In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

import time



In [2]:
df = pd.read_csv('E:/masters material/thesis/datasets/Edge-IIoTset dataset/Selected dataset for ML and DL/ML-EdgeIIoT-dataset.csv', low_memory=False)

In [3]:
print(df['Attack_type'].value_counts())

Attack_type
Normal                   24301
DDoS_UDP                 14498
DDoS_ICMP                14090
Ransomware               10925
DDoS_HTTP                10561
SQL_injection            10311
Uploading                10269
DDoS_TCP                 10247
Backdoor                 10195
Vulnerability_scanner    10076
Port_Scanning            10071
XSS                      10052
Password                  9989
MITM                      1214
Fingerprinting            1001
Name: count, dtype: int64


In [7]:
print(df.columns)

Index(['frame.time', 'ip.src_host', 'ip.dst_host', 'arp.dst.proto_ipv4',
       'arp.opcode', 'arp.hw.size', 'arp.src.proto_ipv4', 'icmp.checksum',
       'icmp.seq_le', 'icmp.transmit_timestamp', 'icmp.unused',
       'http.file_data', 'http.content_length', 'http.request.uri.query',
       'http.request.method', 'http.referer', 'http.request.full_uri',
       'http.request.version', 'http.response', 'http.tls_port', 'tcp.ack',
       'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin',
       'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack',
       'tcp.dstport', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.options',
       'tcp.payload', 'tcp.seq', 'tcp.srcport', 'udp.port', 'udp.stream',
       'udp.time_delta', 'dns.qry.name', 'dns.qry.name.len', 'dns.qry.qu',
       'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request',
       'dns.retransmit_request_in', 'mqtt.conack.flags',
       'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags', 'mqtt.len

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157800 entries, 0 to 157799
Data columns (total 63 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   frame.time                 157800 non-null  object 
 1   ip.src_host                157800 non-null  object 
 2   ip.dst_host                157800 non-null  object 
 3   arp.dst.proto_ipv4         157800 non-null  object 
 4   arp.opcode                 157800 non-null  float64
 5   arp.hw.size                157800 non-null  float64
 6   arp.src.proto_ipv4         157800 non-null  object 
 7   icmp.checksum              157800 non-null  float64
 8   icmp.seq_le                157800 non-null  float64
 9   icmp.transmit_timestamp    157800 non-null  float64
 10  icmp.unused                157800 non-null  float64
 11  http.file_data             157800 non-null  object 
 12  http.content_length        157800 non-null  float64
 13  http.request.uri.query     15

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
print(df.isnull().sum())

In [None]:
print(df.duplicated().sum())

In [13]:
# Drop rows with missing values
df.dropna(inplace=True)

# Step 1: Drop object columns
X = df.drop(columns=['Attack_label', 'label'])  # Drop target and any duplicates
X = X.select_dtypes(include=['int64', 'float64', 'bool'])  # Keep numeric features only

# Step 2: Set target
y = df['Attack_label']

# Train-val-test split: 70/20/10
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1/3, stratify=y_temp, random_state=42)


In [15]:
print("Train class distribution:\n", y_train.value_counts())
print("Validation class distribution:\n", y_val.value_counts())
print("Test class distribution:\n", y_test.value_counts())

Train class distribution:
 Attack_label
1    93449
0    17011
Name: count, dtype: int64
Validation class distribution:
 Attack_label
1    26700
0     4860
Name: count, dtype: int64
Test class distribution:
 Attack_label
1    13350
0     2430
Name: count, dtype: int64


In [16]:
print("X shape:", X.shape)
print("X columns:", X.columns.tolist())

X shape: (157800, 42)
X columns: ['arp.opcode', 'arp.hw.size', 'icmp.checksum', 'icmp.seq_le', 'icmp.transmit_timestamp', 'icmp.unused', 'http.content_length', 'http.response', 'http.tls_port', 'tcp.ack', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.seq', 'udp.port', 'udp.stream', 'udp.time_delta', 'dns.qry.name', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.topic_len', 'mqtt.ver', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id']


In [18]:



model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,       # Disable the label encoder warning
    objective='binary:logistic',   # Important: binary classification objective
    eval_metric='logloss'          # Evaluation metric
)

# Start timer
start_time = time.time()

model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True  # Optional: shows training log
)

# End timer
end_time = time.time()


[0]	validation_0-logloss:0.36027
[1]	validation_0-logloss:0.30288
[2]	validation_0-logloss:0.26480
[3]	validation_0-logloss:0.23472
[4]	validation_0-logloss:0.20795
[5]	validation_0-logloss:0.18535
[6]	validation_0-logloss:0.16661
[7]	validation_0-logloss:0.15046
[8]	validation_0-logloss:0.13652
[9]	validation_0-logloss:0.12434
[10]	validation_0-logloss:0.11420
[11]	validation_0-logloss:0.10467
[12]	validation_0-logloss:0.09672
[13]	validation_0-logloss:0.08892
[14]	validation_0-logloss:0.08203
[15]	validation_0-logloss:0.07593
[16]	validation_0-logloss:0.07054
[17]	validation_0-logloss:0.06572
[18]	validation_0-logloss:0.06141
[19]	validation_0-logloss:0.05726
[20]	validation_0-logloss:0.05350
[21]	validation_0-logloss:0.05013


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[22]	validation_0-logloss:0.04712
[23]	validation_0-logloss:0.04446
[24]	validation_0-logloss:0.04198
[25]	validation_0-logloss:0.03990
[26]	validation_0-logloss:0.03789
[27]	validation_0-logloss:0.03622
[28]	validation_0-logloss:0.03440
[29]	validation_0-logloss:0.03275
[30]	validation_0-logloss:0.03149
[31]	validation_0-logloss:0.03033
[32]	validation_0-logloss:0.02919
[33]	validation_0-logloss:0.02804
[34]	validation_0-logloss:0.02751
[35]	validation_0-logloss:0.02660
[36]	validation_0-logloss:0.02582
[37]	validation_0-logloss:0.02491
[38]	validation_0-logloss:0.02432
[39]	validation_0-logloss:0.02364
[40]	validation_0-logloss:0.02305
[41]	validation_0-logloss:0.02249
[42]	validation_0-logloss:0.02189
[43]	validation_0-logloss:0.02137
[44]	validation_0-logloss:0.02091
[45]	validation_0-logloss:0.02050
[46]	validation_0-logloss:0.02000
[47]	validation_0-logloss:0.01962
[48]	validation_0-logloss:0.01915
[49]	validation_0-logloss:0.01883
[50]	validation_0-logloss:0.01837
[51]	validatio

In [20]:
y_pred = model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['normal', 'attack']))

# Print duration
training_duration = end_time - start_time
print(f"\n✅ Model trained in {training_duration:.2f} seconds")

Test Accuracy: 0.994169835234474
              precision    recall  f1-score   support

      normal       0.97      0.99      0.98      2430
      attack       1.00      1.00      1.00     13350

    accuracy                           0.99     15780
   macro avg       0.99      0.99      0.99     15780
weighted avg       0.99      0.99      0.99     15780


✅ Model trained in 0.97 seconds
