In [1]:
import pandas as pd
import os
import math
from datetime import datetime
import numpy as np
from numpy import argmax

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import sys

from imblearn.over_sampling import KMeansSMOTE, ADASYN,SMOTE
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from time import time
from sklearn.metrics import classification_report,matthews_corrcoef
from copy import deepcopy
from sklearn.utils import shuffle

# path to the dataset
# file = 'Shaleeza.Dataset.v1.csv'
file = 'IoT Network Intrusion Dataset.csv'
data = pd.read_csv(file)

In [2]:
def data_preprocessing(targets_others):
    dataset = pd.read_csv(file, error_bad_lines=False, low_memory=False)
    dataset = dataset.drop(['Flow_ID', 'Src_IP', 'Dst_IP', 'Dst_Port', 'Protocol'], axis=1)
    dataset = dataset.drop(['Timestamp'], axis=1)

    # contain only single values
    dataset = dataset.drop(
        ['Fwd_PSH_Flags', 'Fwd_URG_Flags', 'Fwd_Byts/b_Avg', 'Fwd_Pkts/b_Avg', 'Fwd_Blk_Rate_Avg', 'Bwd_Byts/b_Avg',
         'Bwd_Pkts/b_Avg', 'Bwd_Blk_Rate_Avg', 'Init_Fwd_Win_Byts', 'Fwd_Seg_Size_Min'], axis=1)

    dataset['Flow_Byts/s'] = round(dataset['Flow_Byts/s'],2)

    dataset = dataset.drop(targets_others, axis=1)

    dataset = dataset.reset_index()
    dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
    dataset.dropna(inplace=True)

    # correlation
    correlated_features = set()
    correlation_matrix = dataset.corr()

    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) >= 0.7:
                colname = correlation_matrix.columns[i]
                correlated_features.add(colname)

    dataset.drop(labels=correlated_features, axis=1, inplace=True)

    return dataset

# def kFoldCV(model, data, n_fold=10):
#     diff = int(len(data)/n_fold)
#     results = np.zeros((1, 4))
#     predictY = deepcopy(data[:,-1])
#     for i in range(n_fold):
#         begin = diff*i
#         end = diff*(i+1)
# #         if i == n_fold-1:
# #             end = -1
#         test = data[begin:end]
#         train = deepcopy(data)
#         train = np.delete(train, range(begin, end),axis=0)
#         X_train, y_train = SMOTE().fit_resample(train[:,:-1], train[:,-1])
#         predictY[begin:end] = model.fit(X_train, y_train).predict(test[:,:-1])
#     t = classification_report(data[:,-1], predictY)
#     print(t)
#     print(matthews_corrcoef(data[:,-1], predictY))
        
# #         results = results + getResults(model, train[:,:-1], train[:,-1],test[:,:-1],test[:,-1])
# #     return results/n_fold
def kFoldCV(model, data, n_fold=10):
    diff = int(len(data)/n_fold)
    results = np.zeros((1, 4))
    predictY = data[:,-1].astype('int')
    targetY = deepcopy(predictY).astype('int')
#     predictY = deepcopy(data[:,-1]).astype('int')
    cv = StratifiedKFold(n_splits=n_fold)
    X, y = data[:,:-1],data[:,-1].astype('int')
    begin = 0
    for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, y_train = X[train_index], y[train_index]
#         print(X_train.shape,y[test_index].shape)
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
        X_test = sc.transform(X[test_index])  
        predictY[begin:begin+len(X_test)] = model.fit(X_train, y_train).predict(X_test)
        targetY[begin:begin+len(X_test)] = y[test_index]
        begin += len(X_test) 
#         targetY[begin:end] = test[:,-1]
#         predictY[begin:end] = model.fit(X_train, y_train).predict(X_test)
    t = classification_report(targetY, predictY)
    print(t)
    print(matthews_corrcoef(targetY, predictY))

def getResults(model, X_train, y_train,X_test,y):
    predictY = model.fit(X_train, y_train).predict(X_test)
    t = classification_report(y, predictY)#, target_names=['0', '1', '2']
    return t


In [3]:
data = data_preprocessing(['Label', 'Cat'])
data = data.values
data = data[:,1:]
X_train, y_train = data[:,:-1], data[:,-1]
y_train = LabelEncoder().fit_transform(y_train)

data = np.c_[X_train,y_train.astype('int')]



  dataset = pd.read_csv(file, error_bad_lines=False, low_memory=False)


In [None]:
# grid search for hyper-parameter tuning, after findin the optimal hyper-parameters, they are used for the final training and prediction
t = time()
length = [3, 4, 5]
for i in length:
    model = DecisionTreeClassifier(max_depth=i)
    kFoldCV(model, data)


print(time()-t)

In [6]:
t = time()
length = [5]
for i in length:
    model = DecisionTreeClassifier(max_depth=i)
    kFoldCV(model, data)


print(time()-t)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     59391
           1       0.28      0.41      0.33     35377
           2       0.00      0.00      0.00     55124
           3       0.25      0.86      0.39     55818
           4       0.61      0.30      0.40    121178
           5       1.00      0.59      0.74    183189
           6       1.00      0.61      0.76     40073
           7       0.00      0.00      0.00     22192
           8       0.38      0.95      0.54     53073

    accuracy                           0.54    625415
   macro avg       0.50      0.52      0.46    625415
weighted avg       0.64      0.54      0.54    625415

0.5036565209743296
129.37770175933838


In [None]:
# grid search for hyper-parameter tuning, after findin the optimal hyper-parameters, they are used for the final training and prediction

t = time()
hidden_layer_sizes  = [100, 200, 300]
max_iter = [100, 200, 300]
for i in hidden_layer_sizes:
    for j in max_iter:
        model = MLPClassifier(hidden_layer_sizes=i, max_iter=j)
        kFoldCV(model, data)


print(time()-t)

In [7]:
t = time()
hidden_layer_sizes  = [100, 200, 300]
max_iter = [100, 200, 300]
hidden_layer_sizes  = [300]
max_iter = [300]
for i in hidden_layer_sizes:
    for j in max_iter:
        model = MLPClassifier(hidden_layer_sizes=i, max_iter=j)
        kFoldCV(model, data)


print(time()-t)



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     59391
           1       0.72      0.91      0.80     35377
           2       0.32      0.49      0.38     55124
           3       0.32      0.47      0.38     55818
           4       0.91      0.73      0.81    121178
           5       1.00      0.70      0.82    183189
           6       0.91      0.91      0.91     40073
           7       0.27      0.47      0.34     22192
           8       0.65      0.62      0.63     53073

    accuracy                           0.71    625415
   macro avg       0.68      0.70      0.68    625415
weighted avg       0.78      0.71      0.73    625415

0.6627325060427195
34490.25239944458


In [None]:
# grid search for hyper-parameter tuning, after findin the optimal hyper-parameters, they are used for the final training and prediction
t = time()
n_estimators  = [10, 100, 200]
max_depth = [3, 4, 5]
for i in n_estimators:
    for j in max_depth:
        model = RandomForestClassifier(n_estimators=i, max_depth=j)
        kFoldCV(model, data)

print(time()-t)

In [4]:
n_estimators  = [10, 100, 200]
max_depth = [3, 4, 5]
n_estimators  = [100]
max_depth = [5]
for i in n_estimators:
    for j in max_depth:
        model = RandomForestClassifier(n_estimators=i, max_depth=j)
        kFoldCV(model, data)

print(time()-t)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     59391
           1       0.32      0.88      0.47     35377
           2       0.30      0.44      0.36     55124
           3       0.31      0.41      0.35     55818
           4       0.89      0.22      0.35    121178
           5       0.98      0.71      0.82    183189
           6       0.80      0.85      0.82     40073
           7       0.18      0.14      0.16     22192
           8       0.47      0.83      0.60     53073

    accuracy                           0.60    625415
   macro avg       0.58      0.61      0.55    625415
weighted avg       0.72      0.60      0.60    625415

0.5528415274379004


NameError: name 't' is not defined

In [None]:
# grid search for hyper-parameter tuning, after findin the optimal hyper-parameters, they are used for the final training and prediction
t = time()
n_estimators  = [10, 100, 200]
max_depth = [3, 4, 5]
for i in n_estimators:
    for j in max_depth:
        model = XGBClassifier(n_estimators=i, max_depth=j,objective='mlogloss')
        kFoldCV(model, data)


print(time()-t)

In [5]:
from time import time
t = time()
n_estimators  = [10, 100, 200]
max_depth = [3, 4, 5]
n_estimators  = [200]
max_depth = [5]
for i in n_estimators:
    for j in max_depth:
        model = XGBClassifier(n_estimators=i, max_depth=j,objective='mlogloss')
        kFoldCV(model, data)


print(time()-t)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     59391
           1       0.92      0.98      0.95     35377
           2       0.29      0.55      0.38     55124
           3       0.28      0.31      0.29     55818
           4       0.97      0.86      0.91    121178
           5       1.00      0.70      0.83    183189
           6       0.91      0.94      0.92     40073
           7       0.49      0.60      0.54     22192
           8       0.80      0.83      0.82     53073

    accuracy                           0.75    625415
   macro avg       0.74      0.75      0.74    625415
weighted avg       0.82      0.75      0.77    625415

0.7163350390485673
3302.937599182129


In [None]:
def kFoldCV(model, data, n_fold=10):
    diff = int(len(data)/n_fold)
    results = np.zeros((1, 4))
    predictY = deepcopy(data[:,-1]).reshape(len(data[:,-1]),1).astype('int')
    targetY = deepcopy(data[:,-1]).reshape(len(data[:,-1]),1).astype('int')
#     predictY = deepcopy(data[:,-1]).astype('int')
    cv = StratifiedKFold(n_splits=n_fold)
    X, y = data[:,:-1],data[:,-1].astype('int')
    begin = 0
    for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
        y = y.astype('int').reshape(len(y),1)
        X_train, y_train = X[train_index], y[train_index]
#         print(X_train.shape,y[test_index].shape)
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
        X_test = sc.transform(X[test_index])  
        predictY[begin:begin+len(X_test)] = model.fit(X_train, y_train).predict(X_test)
        targetY[begin:begin+len(X_test)] = y[test_index]
        begin += len(X_test) 
#         targetY[begin:end] = test[:,-1]
#         predictY[begin:end] = model.fit(X_train, y_train).predict(X_test)
    t = classification_report(targetY, predictY)
    print(t)
    print(matthews_corrcoef(targetY, predictY))

In [None]:
# grid search for hyper-parameter tuning, after findin the optimal hyper-parameters, they are used for the final training and prediction
t = time()
n_estimators  = [10, 100, 200]
max_depth = [3, 4, 5]
for i in n_estimators:
    for j in max_depth:
        model = CatBoostClassifier(n_estimators=i, max_depth=j)
        kFoldCV(model, data)

print(time()-t)

In [None]:
t = time()
n_estimators  = [10, 100, 200]
max_depth = [3, 4, 5]
n_estimators  = [200]
max_depth = [5]
for i in n_estimators:
    for j in max_depth:
        model = CatBoostClassifier(n_estimators=i, max_depth=j,verbose=False)
        kFoldCV(model, data)

print(time()-t)
#               precision    recall  f1-score   support

#            0       1.00      1.00      1.00     59391
#            1       0.83      0.97      0.89     35377
#            2       0.31      0.60      0.41     55124
#            3       0.31      0.33      0.32     55818
#            4       0.95      0.83      0.88    121178
#            5       1.00      0.70      0.83    183189
#            6       0.89      0.93      0.91     40073
#            7       0.47      0.59      0.52     22192
#            8       0.80      0.81      0.81     53073

#     accuracy                           0.75    625415
#    macro avg       0.73      0.75      0.73    625415
# weighted avg       0.82      0.75      0.77    625415

# 0.7121565931346628
# 1555.636743068695