In [9]:
import pandas as pd
import os
import math
from datetime import datetime
import numpy as np
from numpy import argmax

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import sys

from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from time import time
from sklearn.metrics import classification_report,matthews_corrcoef
from copy import deepcopy
from sklearn.utils import shuffle

# path to the dataset
# file = 'Shaleeza.Dataset.v1.csv'
file = 'IoT Network Intrusion Dataset.csv'
data = pd.read_csv(file)

In [10]:
def data_preprocessing(targets_others):
    dataset = pd.read_csv(file, error_bad_lines=False, low_memory=False)
    dataset = dataset.drop(['Flow_ID', 'Src_IP', 'Dst_IP', 'Dst_Port', 'Protocol'], axis=1)
    dataset = dataset.drop(['Timestamp'], axis=1)

    # contain only single values
    dataset = dataset.drop(
        ['Fwd_PSH_Flags', 'Fwd_URG_Flags', 'Fwd_Byts/b_Avg', 'Fwd_Pkts/b_Avg', 'Fwd_Blk_Rate_Avg', 'Bwd_Byts/b_Avg',
         'Bwd_Pkts/b_Avg', 'Bwd_Blk_Rate_Avg', 'Init_Fwd_Win_Byts', 'Fwd_Seg_Size_Min'], axis=1)

    dataset['Flow_Byts/s'] = round(dataset['Flow_Byts/s'],2)

    dataset = dataset.drop(targets_others, axis=1)

    dataset = dataset.reset_index()
    dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
    dataset.dropna(inplace=True)

    # correlation
    correlated_features = set()
    correlation_matrix = dataset.corr()

    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) >= 0.7:
                colname = correlation_matrix.columns[i]
                correlated_features.add(colname)

    dataset.drop(labels=correlated_features, axis=1, inplace=True)

    return dataset

def kFoldCV(model, data, n_fold=10):
    diff = int(len(data)/n_fold)
    results = np.zeros((1, 4))
    predictY = deepcopy(data[:,-1])
    for i in range(n_fold):
        begin = diff*i
        end = diff*(i+1)
#         if i == n_fold-1:
#             end = -1
        test = data[begin:end]
        train = deepcopy(data)
        train = np.delete(train, range(begin, end),axis=0)
        X_train, y_train = RandomUnderSampler().fit_resample(train[:,:-1], train[:,-1])
        predictY[begin:end] = model.fit(X_train, y_train).predict(test[:,:-1])
    t = classification_report(data[:,-1], predictY)
    print(t)
    print(matthews_corrcoef(data[:,-1], predictY))
        
#         results = results + getResults(model, train[:,:-1], train[:,-1],test[:,:-1],test[:,-1])
#     return results/n_fold

def getResults(model, X_train, y_train,X_test,y):
    predictY = model.fit(X_train, y_train).predict(X_test)
    t = classification_report(y, predictY)#, target_names=['0', '1', '2']
    return t


In [23]:
data = data_preprocessing(['Label', 'Cat'])
data = data.values
data = data[:,1:]
X_train, y_train = data[:,:-1], data[:,-1]
y_train = LabelEncoder().fit_transform(y_train)
data = np.c_[X_train,y_train]



  dataset = pd.read_csv(file, error_bad_lines=False, low_memory=False)


In [12]:
t = time()
length = [5]
for i in length:
    model = DecisionTreeClassifier(max_depth=i)
    kFoldCV(model, data)


print(time()-t)

                       precision    recall  f1-score   support

      DoS-Synflooding       1.00      1.00      1.00     59391
    MITM ARP Spoofing       0.30      0.45      0.36     35377
    Mirai-Ackflooding       0.34      0.00      0.00     55124
  Mirai-HTTP Flooding       0.25      0.85      0.39     55818
Mirai-Hostbruteforceg       0.64      0.31      0.42    121178
   Mirai-UDP Flooding       1.00      0.59      0.74    183189
               Normal       0.98      0.63      0.77     40073
        Scan Hostport       0.17      0.06      0.08     22192
         Scan Port OS       0.40      0.93      0.56     53073

             accuracy                           0.55    625415
            macro avg       0.56      0.54      0.48    625415
         weighted avg       0.68      0.55      0.55    625415

0.5099452409831822
58.52631759643555


In [13]:
t = time()
hidden_layer_sizes  = [300]
max_iter = [300]
for i in hidden_layer_sizes:
    for j in max_iter:
        model = MLPClassifier(hidden_layer_sizes=i, max_iter=j)
        kFoldCV(model, data)


print(time()-t)

                       precision    recall  f1-score   support

      DoS-Synflooding       1.00      1.00      1.00     59391
    MITM ARP Spoofing       0.60      0.17      0.26     35377
    Mirai-Ackflooding       0.33      0.02      0.03     55124
  Mirai-HTTP Flooding       0.25      0.00      0.01     55818
Mirai-Hostbruteforceg       0.32      0.01      0.02    121178
   Mirai-UDP Flooding       0.37      0.88      0.52    183189
               Normal       0.79      0.21      0.33     40073
        Scan Hostport       0.11      0.12      0.12     22192
         Scan Port OS       0.17      0.26      0.20     53073

             accuracy                           0.41    625415
            macro avg       0.44      0.30      0.28    625415
         weighted avg       0.42      0.41      0.31    625415

0.27702615960593185
1631.2587668895721


In [None]:
t = time()
Cs = [10, 100, 1000]
gammas = [0.01, 0.1,1]
for i in Cs:
    for j in gammas:
        model = SVC(kernel = 'rbf', C = i, gamma = j)
        kFoldCV(model, data)

print(time()-t)


In [14]:
n_estimators  = [200]
max_depth = [5]
for i in n_estimators:
    for j in max_depth:
        model = RandomForestClassifier(n_estimators=i, max_depth=j)
        kFoldCV(model, data)

print(time()-t)


                       precision    recall  f1-score   support

      DoS-Synflooding       1.00      1.00      1.00     59391
    MITM ARP Spoofing       0.32      0.88      0.47     35377
    Mirai-Ackflooding       0.30      0.36      0.33     55124
  Mirai-HTTP Flooding       0.30      0.49      0.37     55818
Mirai-Hostbruteforceg       0.89      0.21      0.35    121178
   Mirai-UDP Flooding       0.99      0.71      0.82    183189
               Normal       0.81      0.85      0.83     40073
        Scan Hostport       0.17      0.12      0.14     22192
         Scan Port OS       0.46      0.82      0.59     53073

             accuracy                           0.60    625415
            macro avg       0.58      0.60      0.54    625415
         weighted avg       0.73      0.60      0.60    625415

0.552064817027156
1780.5212337970734


In [19]:
def kFoldCV(model, data, n_fold=10):
    diff = int(len(data)/n_fold)
    results = np.zeros((1, 4))
    predictY = deepcopy(data[:,-1])
    for i in range(n_fold):
        begin = diff*i
        end = diff*(i+1)
#         if i == n_fold-1:
#             end = -1
        test = data[begin:end]
        train = deepcopy(data)
        train = np.delete(train, range(begin, end),axis=0)
        X_train, y_train = RandomUnderSampler().fit_resample(train[:,:-1], train[:,-1])
        
        y_train = y_train.astype('int').reshape(len(X_train), 1)
        predictY[begin:end] = model.fit(X_train, y_train).predict(test[:,:-1])
    t = classification_report(data[:,-1], predictY)
    print(t)
    print(matthews_corrcoef(data[:,-1], predictY))

In [51]:

    
def kFoldCV(model, data, n_fold=10):
    diff = int(len(data)/n_fold)
    results = np.zeros((1, 4))
    predictY = deepcopy(data[:,-1]).reshape(len(data[:,-1]),1)
    for i in range(n_fold):
        begin = diff*i
        end = diff*(i+1)
#         if i == n_fold-1:
#             end = -1
        test = data[begin:end]
        train = deepcopy(data)
        train = np.delete(train, range(begin, end),axis=0)
        X_train, y_train = RandomUnderSampler().fit_resample(train[:,:-1], train[:,-1].astype(int))
#         print(train[:,-1].shape)
#         y_train = y_train.astype('int').reshape(len(X_train),1)
        model.fit(X_train, y_train)
        predictY[begin:end] = model.predict(test[:,:-1])
#         predictY[begin:end] = model.fit(train[:,:-1], train[:,-1].astype("int")).predict(test[:,:-1])
    t = classification_report(data[:,-1].astype("int"), predictY.astype("int"))
    print(t)
    print(matthews_corrcoef(data[:,-1].astype("int"), predictY.astype("int")))

In [49]:
t = time()
n_estimators  = [100]
max_depth = [5]

data = np.c_[X_train,y_train]
for i in n_estimators:
    for j in max_depth:
        model = XGBClassifier(n_estimators=i, max_depth=j,objective='mlogloss')
        kFoldCV(model, data)
print(time()-t)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     59391
           1       0.89      0.98      0.93     35377
           2       0.31      0.54      0.39     55124
           3       0.31      0.40      0.35     55818
           4       0.97      0.84      0.90    121178
           5       1.00      0.70      0.83    183189
           6       0.91      0.94      0.92     40073
           7       0.48      0.61      0.54     22192
           8       0.81      0.82      0.81     53073

    accuracy                           0.75    625415
   macro avg       0.74      0.76      0.74    625415
weighted avg       0.82      0.75      0.78    625415

0.7184127474649835
334.30882954597473


In [52]:
t = time()
n_estimators  = [200]
max_depth = [5]
for i in n_estimators:
    for j in max_depth:
        model = CatBoostClassifier(n_estimators=i, max_depth=j,verbose=False)
        kFoldCV(model, data)

print(time()-t)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     59391
           1       0.80      0.97      0.88     35377
           2       0.31      0.53      0.40     55124
           3       0.31      0.42      0.36     55818
           4       0.95      0.81      0.88    121178
           5       1.00      0.70      0.83    183189
           6       0.89      0.93      0.91     40073
           7       0.45      0.60      0.52     22192
           8       0.80      0.79      0.80     53073

    accuracy                           0.75    625415
   macro avg       0.73      0.75      0.73    625415
weighted avg       0.81      0.75      0.77    625415

0.7089554782479809
271.6109025478363
