In [132]:
import pandas as pd
import numpy as np
import os
import datetime
from enum import Enum
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.utils import to_categorical
# import tensorflow_decision_forests as tfdf


In [133]:
#Categorize and enumerate all attacks in dataset
ATTACKS = ['DDoS', 'DoS', 'Mirai', 'Recon', 'Spoofing', 'Benign', 'Web', 'BruteForce']
ATTACKS_ENUM = Enum('ATTACKS', ATTACKS, start=0)
dict_7classes = {}
dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-SlowLoris'] = 'DDoS'
dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_7classes['DoS-UDP_Flood'] = 'DoS'
dict_7classes['DoS-SYN_Flood'] = 'DoS'
dict_7classes['DoS-TCP_Flood'] = 'DoS'
dict_7classes['DoS-HTTP_Flood'] = 'DoS'


dict_7classes['Mirai-greeth_flood'] = 'Mirai'
dict_7classes['Mirai-greip_flood'] = 'Mirai'
dict_7classes['Mirai-udpplain'] = 'Mirai'

dict_7classes['Recon-PingSweep'] = 'Recon'
dict_7classes['Recon-OSScan'] = 'Recon'
dict_7classes['Recon-PortScan'] = 'Recon'
dict_7classes['VulnerabilityScan'] = 'Recon'
dict_7classes['Recon-HostDiscovery'] = 'Recon'

dict_7classes['DNS_Spoofing'] = 'Spoofing'
dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'

dict_7classes['BenignTraffic'] = 'Benign'

dict_7classes['BrowserHijacking'] = 'Web'
dict_7classes['Backdoor_Malware'] = 'Web'
dict_7classes['XSS'] = 'Web'
dict_7classes['Uploading_Attack'] = 'Web'
dict_7classes['SqlInjection'] = 'Web'
dict_7classes['CommandInjection'] = 'Web'


dict_7classes['DictionaryBruteForce'] = 'BruteForce'

In [134]:
# =====Split Train / Test data======
# Dataset link-> https://www.unb.ca/cic/datasets/iotdataset-2023.html
#E. C. P. Neto, S. Dadkhah, R. Ferreira, A. Zohourian, R. Lu, A. A. Ghorbani. "CICIoT2023: A real-time dataset and benchmark for large-scale attacks in IoT environment," Sensor (2023) – (submitted to Journal of Sensors).

DATASET_DIRECTORY = 'dataset/'
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')] # all files
#df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('1-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv')] # smaller subset for faster testing, 17 files =  10% of whole dataset
#df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('11-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv')] # 2 files = 1%
df_sets.sort()
training_sets = df_sets[:int(len(df_sets)*.8)]
test_sets = df_sets[int(len(df_sets)*.8):]

In [135]:
#=====Extract Data=====
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
] #columns 0-45
Y_columns = 'label' #column 46

all_columns = X_columns+[Y_columns]

NUMERIC_FEATURE_NAMES = [
      'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
      'Rate', 'Srate', 'Drate', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 
      'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
      'Radius', 'Covariance', 'Variance', 'Weight', 
]
CATEGORICAL_FEATURE_NAMES = [
      'fin_flag_number', 'syn_flag_number',
      'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
      'ece_flag_number', 'cwr_flag_number','HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
      'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
]


In [136]:
#=====Feature Scaling======
# columnsToScale = ['flow_duration', 'Header_Length', 'Duration', 'Rate', 'Srate', 'Drate', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'fin_count']
# scale all
scaler = MinMaxScaler(feature_range=(0,1))
for train_set in tqdm(training_sets):
    df = pd.read_csv(DATASET_DIRECTORY + train_set, index_col=None, header=0, delimiter=',')[X_columns]
    x_train = scaler.fit(df)
    del df

100%|██████████| 135/135 [03:02<00:00,  1.35s/it]


## Define Model layers

In [137]:
class Model:
    def __init__(self, model, name, type):
        self.model = model
        self.name = name
        self.type = type
        #self.batch_size = batch_size

TYPES = {}
TYPES['SK_LR'] = 1
TYPES['SK_RF'] = 2
TYPES['TF'] = 3

verbose, epochs, batch_size = 0, 10, 512
activationFunction='relu'

# def getOtimizedSequentialModel():
#     model = Sequential()
#     model.add(Dense(46, activation=activationFunction))
#     model.add(Dense(30, activation=activationFunction))
#     model.add(Dense(8, activation='softmax'))
#     model.compile(loss=keras.losses.BinaryCrossentropy(),
#                     optimizer=keras.optimizers.Adam(learning_rate=1e-3), 
#                     metrics=[ keras.metrics.BinaryAccuracy(), keras.metrics.FalseNegatives()]
#                     )
#     return model

def getANN():
    model = Sequential()
    model.add(Dense(46, activation=activationFunction))
    model.add(Dense(30, activation=activationFunction))
    model.add(Dense(20, activation=activationFunction))
    model.add(Dense(12, activation=activationFunction))
    model.add(Dense(8, activation='softmax'))
    model.compile(loss=keras.losses.BinaryCrossentropy(),
                    optimizer=keras.optimizers.Adam(learning_rate=1e-3), 
                    metrics=[ keras.metrics.BinaryAccuracy(), keras.metrics.FalseNegatives()]
                    )
    return Model(model, "ANN", TYPES['TF'])


def getCNN1():
    model = Sequential()
    model.add(Conv1D(32, 46, activation=activationFunction))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(64, 46, activation=activationFunction))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(64, 46, activation=activationFunction))
    model.add(Flatten())
    model.add(Dense(64, activation=activationFunction))
    model.add(Dense(8, activation='softmax'))
    model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
    return Model(model, 'CNN', TYPES['TF'])

def getRFModel():
    rf = RandomForestClassifier(
        n_estimators=100,
        criterion = 'gini',
        max_depth=None,
        )
    return Model(rf, "RF", TYPES['SK_RF'])

def getLRModel():
    lr = LogisticRegression()
    return Model(lr, "LR", TYPES['SK_LR'])



ML_Models = [
            getANN(),
            getRFModel(),
            getLRModel()
]


# Train Models

In [138]:
print(f"Last ran on {len(ML_Models)} models, with {len(training_sets)} training sets on date: {datetime.datetime.now()}")
for train_set in tqdm(training_sets):
    df = pd.read_csv(DATASET_DIRECTORY + train_set, index_col=None, header=0, delimiter=',')[all_columns]
    x_train = scaler.transform(df[X_columns])
    y_train = [ATTACKS_ENUM[dict_7classes[k]].value for k in df[Y_columns]]
    y_train_Cat = to_categorical(y_train, num_classes=8)

    for i in range(len(ML_Models)):
            model = ML_Models[i]
            if model.type==TYPES["SK_LR"]:
                model.model.fit(x_train, y_train)  
            
            elif model.type==TYPES["SK_RF"]:
                model.model.fit(x_train, y_train_Cat)  

            elif model.type == TYPES["TF"]:
                model.model.fit(x=x_train, 
                            y=y_train_Cat, 
                            epochs=epochs, 
                            verbose=verbose,
                            batch_size=batch_size)                   
    del df
    del x_train
    del y_train
    del y_train_Cat



Last ran on 3 models, with 135 training sets on date: 2024-06-04 19:51:36.545307


100%|██████████| 135/135 [3:06:34<00:00, 82.93s/it]  


# Train a seperate model to detect each attack

In [139]:
# verbose, epochs, batch_size = 1, 100, 512
# activationFunction='relu'

# def getSequentialModel():
#     model = Sequential()
#     model.add(Dense(128, activation=activationFunction))
#     model.add(Dense(64, activation=activationFunction))
#     model.add(Dense(32, activation=activationFunction))
#     model.add(Dense(16, activation=activationFunction))
#     model.add(Dense(8, activation=activationFunction))
#     model.add(Dense(4, activation=activationFunction))
#     model.add(Dense(2, activation='softmax'))
#     model.compile(loss=keras.losses.BinaryCrossentropy(),
#                     optimizer=keras.optimizers.Adam(learning_rate=1e-3), 
#                     metrics=[ keras.metrics.BinaryAccuracy(), keras.metrics.FalseNegatives()]
#                     )
#     return model

# ML_Models = [
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel()

# ]
# ML_Model_Names = ATTACKS

In [140]:
# print(f"Last ran on {len(ML_Models)} models, with {len(training_sets)} training sets on date: {datetime.datetime.now()}")
# for train_set in tqdm(training_sets):
#     df = pd.read_csv(DATASET_DIRECTORY + train_set, index_col=None, header=0, delimiter=',')[all_columns]
#     x_train = scaler.transform(df[X_columns])

#     for i in range(len(ML_Models)-1):
#             y_train = to_categorical([ATTACKS_ENUM[dict_7classes[k]].value == ATTACKS_ENUM[ATTACKS[i]].value for k in df[Y_columns]], num_classes=2)
#             model = ML_Models[i]
#             model.fit(x=x_train, 
#                         y=y_train, 
#                         epochs=epochs, 
#                         verbose=verbose,
#                         batch_size=batch_size)   
#             del y_train             
#     del df
#     del x_train

In [141]:
# def showResults8Models(test, pred, model_num):
#     print(f"===== {model_num} =====")
#     print(classification_report(test, pred, target_names=["Negative", "Positive"]))
#     accuracy = accuracy_score(test, pred)
#     precision=precision_score(test, pred, average='weighted')
#     f1Score=f1_score(test, pred, average='weighted') 
#     print("Accuracy  : {}".format(accuracy))
#     print("Precision : {}".format(precision))
#     print("f1Score : {}".format(f1Score))
#     cm=confusion_matrix(test, pred)
#     print(cm) 

# print(f"Last ran on {len(ML_Models)} models, with {len(test_sets)} testing sets on date: {datetime.datetime.now()}")
# for i in range(len(ML_Models)):
#     model = ML_Models[i]
#     y_test = []
#     y_predict = []
#     for test_set in tqdm(test_sets):
#         df = pd.read_csv(DATASET_DIRECTORY + test_set, index_col=None, header=0, delimiter=',')[all_columns]
#         x_test = scaler.transform(df[X_columns])
#         for k in df[Y_columns]:
#             y_test.append(ATTACKS_ENUM[dict_7classes[k]].value==ATTACKS[i])
#         y_predict+= list(model.predict(x_test))

#         del df
#         del x_test

#     myarr = np.array([ATTACKS_ENUM[dict_7classes[k]].value == ATTACKS_ENUM[ATTACKS[0]].value for k in ['DDoS-RSTFINFlood','DDoS-PSHACK_Flood','DDoS-SYN_Flood','DoS-SYN_Flood','DoS-TCP_Flood','Mirai-udpplain','Recon-OSScan','DNS_Spoofing','BrowserHijacking','Backdoor_Malware','DictionaryBruteForce']])
#     print(myarr)
#     print(to_categorical(myarr, num_classes=2))
#     y_test=np.array(y_test)
#     print(y_test[0:10])
#     y_test = to_categorical(y_test, num_classes=2)
#     print(y_test[0:10])
#     print("=========")
#     for i in range(10):
#         print(f"{i}: {y_predict[i]} actual {y_test[i]}")    

#     test = np.argmax(y_test, axis=1)
#     predict = np.argmax(y_predict, axis=1)
#     showResults8Models(test, predict, i)

#     del test
#     del predict
#     del y_test
#     del y_predict

# Test the resulting trained models

In [142]:
def showResults(test, pred,model_name):
    output = ''
    output += str(datetime.datetime.now())
    output += f"\n===== {model_name} =====\n"
    output+=classification_report(test, pred, target_names=ATTACKS)
    accuracy = accuracy_score(test, pred)
    precision=precision_score(test, pred, average='weighted')
    f1Score=f1_score(test, pred, average='weighted') 
    output+=f"\nAccuracy  : {accuracy}\n"
    output+=f"Precision : {precision}\n"
    output+=f"f1Score : {f1Score}\n"
    cm=confusion_matrix(test, pred)
    output+=str(cm) 
    
    joblib.dump(output, f"outputs/{model_name}.txt") 

    print(output)

In [143]:
def testModel(model):
    y_test = []
    y_predict = []
    for test_set in tqdm(test_sets):
        df = pd.read_csv(DATASET_DIRECTORY + test_set, index_col=None, header=0, delimiter=',')[all_columns]
        x_test = scaler.transform(df[X_columns])
        for k in df[Y_columns]:
            y_test.append(ATTACKS_ENUM[dict_7classes[k]].value)
        if model.type == TYPES['TF']:
            y_predict+= list(model.model.predict(x_test, verbose=0))
        elif model.type == TYPES['SK_LR'] or model.type == TYPES["SK_RF"]:
            y_predict+= list(model.model.predict(x_test))

        del df
        del x_test

    y_test=np.array(y_test)
    if model.type == TYPES['TF'] or model.type ==TYPES["SK_RF"]:
        y_test = to_categorical(y_test, num_classes=8)
        y_test = np.argmax(y_test, axis=1)
        y_predict = np.argmax(y_predict, axis=1)
    showResults(y_test, y_predict, model.name)


    del y_test
    del y_predict

In [144]:
print(f"Last ran on {len(ML_Models)} models, with {len(test_sets)} testing sets on date: {datetime.datetime.now()}")
for i in range(len(ML_Models)):
    testModel(ML_Models[i])

Last ran on 3 models, with 34 testing sets on date: 2024-06-04 22:58:11.574781


100%|██████████| 34/34 [06:31<00:00, 11.53s/it]


2024-06-04 23:04:48.607024
===== ANN =====
              precision    recall  f1-score   support

        DDoS       0.91      0.96      0.94   7526151
         DoS       0.80      0.61      0.69   1792167
       Mirai       1.00      1.00      1.00    583677
       Recon       0.76      0.61      0.68     78630
    Spoofing       0.75      0.73      0.74    107798
      Benign       0.84      0.92      0.88    243322
         Web       0.87      0.03      0.06      5433
  BruteForce       0.99      0.14      0.25      2983

    accuracy                           0.90  10340161
   macro avg       0.86      0.63      0.65  10340161
weighted avg       0.89      0.90      0.89  10340161

Accuracy  : 0.8973367049120415
Precision : 0.8926709583323498
f1Score : 0.8916564773116894
[[7259520  265074     767     617     140      32       1       0]
 [ 705053 1086073     368     614      28      31       0       0]
 [   1880     685  580892     157      57       6       0       0]
 [    931     

100%|██████████| 34/34 [05:17<00:00,  9.33s/it]


2024-06-04 23:10:31.066860
===== RF =====
              precision    recall  f1-score   support

        DDoS       0.99      1.00      1.00   7526151
         DoS       1.00      1.00      1.00   1792167
       Mirai       1.00      1.00      1.00    583677
       Recon       0.96      0.63      0.76     78630
    Spoofing       0.93      0.75      0.83    107798
      Benign       0.93      0.93      0.93    243322
         Web       0.74      0.02      0.04      5433
  BruteForce       1.00      0.00      0.00      2983

    accuracy                           0.99  10340161
   macro avg       0.94      0.67      0.70  10340161
weighted avg       0.99      0.99      0.99  10340161

Accuracy  : 0.9918694689570114
Precision : 0.991555352342272
f1Score : 0.9910150172375725
[[7525630     394       6      67      46       8       0       0]
 [   1595 1790560       5       1       5       1       0       0]
 [   1528       5  582137       0       7       0       0       0]
 [  19189       

100%|██████████| 34/34 [01:38<00:00,  2.91s/it]


2024-06-04 23:12:33.389399
===== LR =====
              precision    recall  f1-score   support

        DDoS       0.81      0.99      0.89   7526151
         DoS       0.66      0.05      0.09   1792167
       Mirai       0.98      0.99      0.98    583677
       Recon       0.52      0.39      0.45     78630
    Spoofing       0.69      0.32      0.44    107798
      Benign       0.66      0.83      0.74    243322
         Web       0.00      0.00      0.00      5433
  BruteForce       0.00      0.00      0.00      2983

    accuracy                           0.81  10340161
   macro avg       0.54      0.45      0.45  10340161
weighted avg       0.79      0.81      0.75  10340161

Accuracy  : 0.8122512792595783
Precision : 0.7866618237630673
f1Score : 0.7467764438438975
[[7464511   29737    2411    6806    5413   17273       0       0]
 [1694710   90447     121    2412     438    4039       0       0]
 [   1909    2743  575350     436     592    2647       0       0]
 [  12797    39

In [145]:
#====For debug ===
# test_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')] 
#testModel(load_model("SavedModels\\BestANN.keras"), "Best ANN so far")
#testModel(Model(joblib.load("SavedModels/BestRF.pkl"),"debug_DF", TYPES['SK_RF']))

# Save Models

In [146]:
for i in range(len(ML_Models)):
    model = ML_Models[i]
    if model.type==TYPES["TF"]:
        model.model.save(f"SavedModels\\{model.name}.keras",overwrite=True)
    elif model.type==TYPES["SK_RF"] or model.type==TYPES["SK_LR"]:
        joblib.dump(model.model, f"SavedModels/{model.name}.pkl") 

KeyError: 'SK'