In [1]:
import pandas as pd
import os
import math
from datetime import datetime
import numpy as np
from numpy import argmax

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import sys

from imblearn.over_sampling import KMeansSMOTE, ADASYN,SMOTE
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from time import time
from sklearn.metrics import classification_report,matthews_corrcoef
from copy import deepcopy
from sklearn.utils import shuffle

# path to the dataset
# file = 'Shaleeza.Dataset.v1.csv'
file = 'IoT Network Intrusion Dataset.csv'
data = pd.read_csv(file)

In [2]:
def data_preprocessing(targets_others):
    dataset = pd.read_csv(file, error_bad_lines=False, low_memory=False)
    dataset = dataset.drop(['Flow_ID', 'Src_IP', 'Dst_IP', 'Dst_Port', 'Protocol'], axis=1)
    dataset = dataset.drop(['Timestamp'], axis=1)

    # contain only single values
    dataset = dataset.drop(
        ['Fwd_PSH_Flags', 'Fwd_URG_Flags', 'Fwd_Byts/b_Avg', 'Fwd_Pkts/b_Avg', 'Fwd_Blk_Rate_Avg', 'Bwd_Byts/b_Avg',
         'Bwd_Pkts/b_Avg', 'Bwd_Blk_Rate_Avg', 'Init_Fwd_Win_Byts', 'Fwd_Seg_Size_Min'], axis=1)

    dataset['Flow_Byts/s'] = round(dataset['Flow_Byts/s'],2)

    dataset = dataset.drop(targets_others, axis=1)

    dataset = dataset.reset_index()
    dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
    dataset.dropna(inplace=True)

    # correlation
    correlated_features = set()
    correlation_matrix = dataset.corr()

    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) >= 0.7:
                colname = correlation_matrix.columns[i]
                correlated_features.add(colname)

    dataset.drop(labels=correlated_features, axis=1, inplace=True)

    return dataset

def kFoldCV(model, data, n_fold=10):
    diff = int(len(data)/n_fold)
    results = np.zeros((1, 4))
    predictY = deepcopy(data[:,-1])
    for i in range(n_fold):
        begin = diff*i
        end = diff*(i+1)
#         if i == n_fold-1:
#             end = -1
        test = data[begin:end]
        train = deepcopy(data)
        train = np.delete(train, range(begin, end),axis=0)
        X_train, y_train = ADASYN().fit_resample(train[:,:-1], train[:,-1])
        predictY[begin:end] = model.fit(X_train, y_train).predict(test[:,:-1])
    t = classification_report(data[:,-1], predictY)
    print(t)
    print(matthews_corrcoef(data[:,-1], predictY))
        
#         results = results + getResults(model, train[:,:-1], train[:,-1],test[:,:-1],test[:,-1])
#     return results/n_fold

def getResults(model, X_train, y_train,X_test,y):
    predictY = model.fit(X_train, y_train).predict(X_test)
    t = classification_report(y, predictY)#, target_names=['0', '1', '2']
    return t


In [3]:
data = data_preprocessing(['Label', 'Cat'])
data = data.values
data = data[:,1:]
X_train, y_train = data[:,:-1], data[:,-1]
y_train = LabelEncoder().fit_transform(y_train)



  dataset = pd.read_csv(file, error_bad_lines=False, low_memory=False)


In [6]:
t = time()
length = [5]
for i in length:
    model = DecisionTreeClassifier(max_depth=i)
    kFoldCV(model, data)


print(time()-t)

                       precision    recall  f1-score   support

      DoS-Synflooding       1.00      0.99      1.00     59391
    MITM ARP Spoofing       0.39      0.65      0.49     35377
    Mirai-Ackflooding       0.28      0.08      0.13     55124
  Mirai-HTTP Flooding       0.29      0.75      0.41     55818
Mirai-Hostbruteforceg       0.93      0.40      0.56    121178
   Mirai-UDP Flooding       1.00      0.70      0.82    183189
               Normal       0.37      0.98      0.54     40073
        Scan Hostport       0.25      0.53      0.34     22192
         Scan Port OS       0.67      0.15      0.24     53073

             accuracy                           0.58    625415
            macro avg       0.57      0.58      0.50    625415
         weighted avg       0.73      0.58      0.59    625415

0.5389745785336749
688.3944842815399


In [4]:
t = time()
hidden_layer_sizes  = [300]
max_iter = [300]
for i in hidden_layer_sizes:
    for j in max_iter:
        model = MLPClassifier(hidden_layer_sizes=i, max_iter=j)
        kFoldCV(model, data)


print(time()-t)

                       precision    recall  f1-score   support

      DoS-Synflooding       0.98      1.00      0.99     59391
    MITM ARP Spoofing       0.55      0.05      0.10     35377
    Mirai-Ackflooding       0.29      0.01      0.01     55124
  Mirai-HTTP Flooding       0.10      0.10      0.10     55818
Mirai-Hostbruteforceg       0.22      0.41      0.29    121178
   Mirai-UDP Flooding       0.38      0.39      0.38    183189
               Normal       0.77      0.15      0.25     40073
        Scan Hostport       0.10      0.06      0.08     22192
         Scan Port OS       0.13      0.17      0.15     53073

             accuracy                           0.33    625415
            macro avg       0.39      0.26      0.26    625415
         weighted avg       0.38      0.33      0.31    625415

0.17698891475868056
29981.258207321167


In [5]:
n_estimators  = [100]
max_depth = [5]
for i in n_estimators:
    for j in max_depth:
        model = RandomForestClassifier(n_estimators=i, max_depth=j)
        kFoldCV(model, data)

print(time()-t)


                       precision    recall  f1-score   support

      DoS-Synflooding       0.99      1.00      1.00     59391
    MITM ARP Spoofing       0.34      0.56      0.42     35377
    Mirai-Ackflooding       0.32      0.43      0.37     55124
  Mirai-HTTP Flooding       0.30      0.38      0.33     55818
Mirai-Hostbruteforceg       0.84      0.36      0.50    121178
   Mirai-UDP Flooding       0.98      0.71      0.82    183189
               Normal       0.76      0.85      0.80     40073
        Scan Hostport       0.19      0.30      0.23     22192
         Scan Port OS       0.34      0.62      0.44     53073

             accuracy                           0.59    625415
            macro avg       0.56      0.58      0.55    625415
         weighted avg       0.70      0.59      0.61    625415

0.5384163413721943
32778.73363780975


In [7]:
def kFoldCV(model, data, n_fold=10):
    diff = int(len(data)/n_fold)
    results = np.zeros((1, 4))
    predictY = deepcopy(data[:,-1])
    for i in range(n_fold):
        begin = diff*i
        end = diff*(i+1)
#         if i == n_fold-1:
#             end = -1
        test = data[begin:end]
        train = deepcopy(data)
        train = np.delete(train, range(begin, end),axis=0)
        X_train, y_train = ADASYN().fit_resample(train[:,:-1], train[:,-1].astype(int))
#         print(train[:,-1].shape)
#         y_train = y_train.astype('int').reshape(len(X_train),1)
        model.fit(X_train, y_train)
        predictY[begin:end] = model.predict(test[:,:-1])
#         predictY[begin:end] = model.fit(train[:,:-1], train[:,-1].astype("int")).predict(test[:,:-1])
    t = classification_report(data[:,-1].astype("int"), predictY.astype("int"))
    print(t)
    print(matthews_corrcoef(data[:,-1].astype("int"), predictY.astype("int")))

In [9]:

data = np.c_[X_train,y_train]
t = time()
n_estimators  = [200]
max_depth = [5]
for i in n_estimators:
    for j in max_depth:
        model = XGBClassifier(n_estimators=i, max_depth=j,objective='mlogloss')
        kFoldCV(model, data)


print(time()-t)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     59391
           1       0.85      0.99      0.91     35377
           2       0.30      0.42      0.35     55124
           3       0.31      0.46      0.37     55818
           4       0.94      0.86      0.90    121178
           5       1.00      0.70      0.82    183189
           6       0.81      0.97      0.88     40073
           7       0.41      0.70      0.52     22192
           8       0.79      0.56      0.65     53073

    accuracy                           0.74    625415
   macro avg       0.71      0.74      0.71    625415
weighted avg       0.80      0.74      0.76    625415

0.6973697493699919
8312.03058385849


In [13]:
def kFoldCV(model, data, n_fold=10):
    diff = int(len(data)/n_fold)
    results = np.zeros((1, 4))
    predictY = deepcopy(data[:,-1]).reshape(len(data[:,-1]),1)
    for i in range(n_fold):
        begin = diff*i
        end = diff*(i+1)
#         if i == n_fold-1:
#             end = -1
        test = data[begin:end]
        train = deepcopy(data)
        train = np.delete(train, range(begin, end),axis=0)
        X_train, y_train = ADASYN().fit_resample(train[:,:-1], train[:,-1].astype(int))
#         print(train[:,-1].shape)
#         y_train = y_train.astype('int').reshape(len(X_train),1)
        model.fit(X_train, y_train)
        predictY[begin:end] = model.predict(test[:,:-1]).reshape(len(test[:,-1]),1)
#         predictY[begin:end] = model.fit(train[:,:-1], train[:,-1].astype("int")).predict(test[:,:-1])
    t = classification_report(data[:,-1].astype("int"), predictY.astype("int"))
    print(t)
    print(matthews_corrcoef(data[:,-1].astype("int"), predictY.astype("int")))

In [14]:
t = time()
n_estimators  = [10, 100, 200]
max_depth = [3, 4, 5]
n_estimators  = [200]
max_depth = [5]
for i in n_estimators:
    for j in max_depth:
        model = CatBoostClassifier(n_estimators=i, max_depth=j,verbose=False)
        kFoldCV(model, data)

print(time()-t)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     59391
           1       0.73      0.98      0.84     35377
           2       0.32      0.41      0.36     55124
           3       0.32      0.50      0.39     55818
           4       0.92      0.81      0.86    121178
           5       1.00      0.70      0.82    183189
           6       0.79      0.96      0.87     40073
           7       0.39      0.70      0.50     22192
           8       0.77      0.52      0.62     53073

    accuracy                           0.72    625415
   macro avg       0.69      0.73      0.70    625415
weighted avg       0.79      0.72      0.74    625415

0.6848928639126862
3651.2022938728333
