In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder

from huggingface_hub import login,HfApi, upload_file



from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

import time



In [None]:
df = pd.read_csv('E:/masters material/thesis/datasets/Edge-IIoTset dataset/Selected dataset for ML and DL/ML-EdgeIIoT-dataset.csv', low_memory=False)

In [None]:
print(df['Attack_type'].value_counts())

In [None]:
print(df.columns)

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
print(df.isnull().sum())

In [None]:
print(df.duplicated().sum())

# decision tree classifier

In [None]:
# Drop rows with missing values
df.dropna(inplace=True)

# Step 1: Drop object columns
X = df.drop(columns=['Attack_label'])  # Drop target and any duplicates
X = X.select_dtypes(include=['int64', 'float64', 'bool'])  # Keep numeric features only

# Step 2: Set target
y = df['Attack_label']

# Train-val-test split: 70/20/10
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1/3, stratify=y_temp, random_state=42)


In [None]:
print("Train class distribution:\n", y_train.value_counts())
print("Validation class distribution:\n", y_val.value_counts())
print("Test class distribution:\n", y_test.value_counts())

In [None]:
print("X shape:", X.shape)
print("X columns:", X.columns.tolist())

In [None]:



model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,       # Disable the label encoder warning
    objective='binary:logistic',   # Important: binary classification objective
    eval_metric='logloss'          # Evaluation metric
)

# Start timer
start_time = time.time()

model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True  # Optional: shows training log
)

# End timer
end_time = time.time()

model.save_model("xgb_model.json")

# upload_file(
#     path_or_fileobj="xgb_model.json",  # or "xgb_model.pkl"
#     path_in_repo="xgb_model.json",     # File name in the repo
#     repo_id="ScHemer34/DT_XGBoost",
#     repo_type="model"
# )


In [None]:
y_pred = model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['normal', 'attack']))

# Print duration
training_duration = end_time - start_time
print(f"\n✅ Model trained in {training_duration:.2f} seconds")

# Feed Forward Neural Network

In [2]:
df_fnn = pd.read_csv("E:/masters material/thesis/datasets/Edge-IIoTset dataset/Selected dataset for ML and DL/DNN-EdgeIIoT-dataset.csv")  # adjust path
print(df_fnn.shape)
df_fnn.info()



  df_fnn = pd.read_csv("E:/masters material/thesis/datasets/Edge-IIoTset dataset/Selected dataset for ML and DL/DNN-EdgeIIoT-dataset.csv")  # adjust path


(2219201, 63)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2219201 entries, 0 to 2219200
Data columns (total 63 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   frame.time                 object 
 1   ip.src_host                object 
 2   ip.dst_host                object 
 3   arp.dst.proto_ipv4         object 
 4   arp.opcode                 float64
 5   arp.hw.size                float64
 6   arp.src.proto_ipv4         object 
 7   icmp.checksum              float64
 8   icmp.seq_le                float64
 9   icmp.transmit_timestamp    float64
 10  icmp.unused                float64
 11  http.file_data             object 
 12  http.content_length        float64
 13  http.request.uri.query     object 
 14  http.request.method        object 
 15  http.referer               object 
 16  http.request.full_uri      object 
 17  http.request.version       object 
 18  http.response              float64
 19  http.tls_port              f

In [3]:
import torch
import torch.nn as nn


class LinearNN(nn.Module):
    def __init__(self, input_size):
        super(LinearNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 16)
        self.relu1 = nn.LeakyReLU(negative_slope=0.01)
        self.fc2 = nn.Linear(16, 8)
        self.relu2 = nn.LeakyReLU(negative_slope=0.01)
        self.output = nn.Linear(8, 1)  # Output = 1 for binary classification

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.output(x)  # No sigmoid here if using BCEWithLogitsLoss
        return x

In [4]:
# Drop rows with missing values
df_fnn.dropna(inplace=True)

#encoding important columns
# Initialize encoder
method_encoder = LabelEncoder()

# Fit and transform
df_fnn['http.request.method_encoded'] = method_encoder.fit_transform(df_fnn['http.request.method'].astype(str))
df_fnn['http.request.version_encoded'] = np.where(df_fnn['http.request.version'].astype(str).str.strip() == '0', 0, 1)
df_fnn['mqtt_topic'] = method_encoder.fit_transform(df_fnn['mqtt.topic'].astype(str))
df_fnn['mqtt_protoname'] = method_encoder.fit_transform(df_fnn['mqtt.protoname'].astype(str))
df_fnn['Attack_type'] = np.where(df_fnn['Attack_type'].astype(str).str.strip() == 'normal', 0, 1)


# Step 2: Set target
y_fnn = df_fnn['Attack_label']

# Now drop object and unnecessary columns
X_fnn = df_fnn.drop(columns=[
    'Attack_label', 'http.request.full_uri', 'http.referer', 'http.file_data', 
    'tcp.payload', 'frame.time', 'mqtt.msg', 'tcp.options', 'dns.qry.name', 
    'http.request.method', 'http.request.version', 'mqtt.topic', 'mqtt.protoname','ip.src_host',
    'ip.dst_host','arp.dst.proto_ipv4','arp.src.proto_ipv4','http.request.uri.query','tcp.srcport',
    'dns.qry.name.len','mqtt.conack.flags'
])




# Train-val-test split: 70/20/10
X_train_fnn, X_temp_fnn, y_train_fnn, y_temp_fnn = train_test_split(X_fnn, y_fnn, test_size=0.3, stratify=y_fnn, random_state=42)
X_val_fnn, X_test_fnn, y_val_fnn, y_test_fnn = train_test_split(X_temp_fnn, y_temp_fnn, test_size=1/3, stratify=y_temp_fnn, random_state=42)











In [None]:
print("Train class distribution:\n", y_train_fnn.value_counts())
print("Validation class distribution:\n", y_val_fnn.value_counts())
print("Test class distribution:\n", y_test_fnn.value_counts())

In [None]:
for col in X_fnn.columns:
    if X_fnn[col].apply(type).nunique() > 1:
        print(f"{col}: {X_fnn[col].apply(type).value_counts()}")

In [None]:
X_fnn.info()


In [5]:
#normalize the data
# scaler = StandardScaler()
# X_train_fnn = scaler.fit_transform(X_train_fnn)
# X_val_fnn = scaler.transform(X_val_fnn)
# X_test_fnn = scaler.transform(X_test_fnn)

print(X_fnn.dtypes[X_fnn.dtypes == 'object'])

# convert dataframe to pytorch tensors
X_train_tensor_fnn = torch.tensor(X_train_fnn.to_numpy(), dtype=torch.float32)
y_train_tensor_fnn = torch.tensor(y_train_fnn.to_numpy(), dtype=torch.float32)

X_val_tensor_fnn = torch.tensor(X_val_fnn.to_numpy(), dtype=torch.float32)
y_val_tensor_fnn = torch.tensor(y_val_fnn.to_numpy(), dtype=torch.float32)

Series([], dtype: object)


In [6]:
# Start timer
start_time_fnn = time.time()

model_fnn = LinearNN(input_size=X_train_fnn.shape[1])
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_fnn.parameters(), lr=0.0001)

num_epochs = 30
for epoch in range(num_epochs):
    model_fnn.train()
    
    outputs = model_fnn(X_train_tensor_fnn).squeeze()
    loss = criterion(outputs, y_train_tensor_fnn.float())

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Calculate validation loss
    model_fnn.eval()  # Set model to evaluation mode
    with torch.no_grad():  # No need to track gradients during validation
        val_outputs = model_fnn(X_val_tensor_fnn).squeeze()
        val_loss = criterion(val_outputs, y_val_tensor_fnn.float())

    # Print epoch information
    model_fnn.train()  # Set model back to training mode
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")


# End timer
end_time_fnn = time.time()

torch.save(model_fnn.state_dict(), "linear_nn_model.pth")

Epoch [1/30], Loss: 804007.5625, Val Loss: 759611.0000
Epoch [2/30], Loss: 760345.2500, Val Loss: 716059.6250
Epoch [3/30], Loss: 716740.2500, Val Loss: 672566.6250
Epoch [4/30], Loss: 673193.7500, Val Loss: 629131.5000
Epoch [5/30], Loss: 629705.3125, Val Loss: 585756.5625
Epoch [6/30], Loss: 586277.1250, Val Loss: 542441.3125
Epoch [7/30], Loss: 542908.5625, Val Loss: 499186.9375
Epoch [8/30], Loss: 499600.9062, Val Loss: 455993.2812
Epoch [9/30], Loss: 456354.2188, Val Loss: 412861.1875
Epoch [10/30], Loss: 413169.0312, Val Loss: 369790.0938
Epoch [11/30], Loss: 370044.8125, Val Loss: 326780.6562
Epoch [12/30], Loss: 326982.3125, Val Loss: 283833.3125
Epoch [13/30], Loss: 283982.0312, Val Loss: 240945.5625
Epoch [14/30], Loss: 241041.3281, Val Loss: 198118.7031
Epoch [15/30], Loss: 198161.5469, Val Loss: 155351.0000
Epoch [16/30], Loss: 155340.9844, Val Loss: 112641.6484
Epoch [17/30], Loss: 112578.8359, Val Loss: 70058.0391
Epoch [18/30], Loss: 69943.9531, Val Loss: 27966.6367
Epoc

In [7]:
model_fnn.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test_fnn.values, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test_fnn.values, dtype=torch.float32)

    outputs = model_fnn(X_test_tensor).squeeze()
    probs = torch.sigmoid(outputs)
    preds = (probs > 0.5).float()

    correct = (preds == y_test_tensor).sum().item()
    accuracy = correct / y_test_tensor.shape[0]

print(f"\n🧪 Test Accuracy: {accuracy:.4f}")

# Print duration
training_duration_fnn = end_time_fnn - start_time_fnn
print(f"⏱️ Model trained in {training_duration_fnn:.2f} seconds")


🧪 Test Accuracy: 0.7619
⏱️ Model trained in 12.33 seconds
