In [2]:
import pandas as pd
import numpy as np
import os
import datetime
from enum import Enum
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.utils import to_categorical
# import tensorflow_decision_forests as tfdf


In [3]:
#Categorize and enumerate all attacks in dataset
ATTACKS = ['DDoS', 'DoS', 'Mirai', 'Recon', 'Spoofing', 'Benign', 'Web', 'BruteForce']
ATTACKS_ENUM = Enum('ATTACKS', ATTACKS, start=0)
dict_7classes = {}
dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-SlowLoris'] = 'DDoS'
dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_7classes['DoS-UDP_Flood'] = 'DoS'
dict_7classes['DoS-SYN_Flood'] = 'DoS'
dict_7classes['DoS-TCP_Flood'] = 'DoS'
dict_7classes['DoS-HTTP_Flood'] = 'DoS'


dict_7classes['Mirai-greeth_flood'] = 'Mirai'
dict_7classes['Mirai-greip_flood'] = 'Mirai'
dict_7classes['Mirai-udpplain'] = 'Mirai'

dict_7classes['Recon-PingSweep'] = 'Recon'
dict_7classes['Recon-OSScan'] = 'Recon'
dict_7classes['Recon-PortScan'] = 'Recon'
dict_7classes['VulnerabilityScan'] = 'Recon'
dict_7classes['Recon-HostDiscovery'] = 'Recon'

dict_7classes['DNS_Spoofing'] = 'Spoofing'
dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'

dict_7classes['BenignTraffic'] = 'Benign'

dict_7classes['BrowserHijacking'] = 'Web'
dict_7classes['Backdoor_Malware'] = 'Web'
dict_7classes['XSS'] = 'Web'
dict_7classes['Uploading_Attack'] = 'Web'
dict_7classes['SqlInjection'] = 'Web'
dict_7classes['CommandInjection'] = 'Web'


dict_7classes['DictionaryBruteForce'] = 'BruteForce'

In [4]:
# =====Split Train / Test data======
# Dataset link-> https://www.unb.ca/cic/datasets/iotdataset-2023.html
#E. C. P. Neto, S. Dadkhah, R. Ferreira, A. Zohourian, R. Lu, A. A. Ghorbani. "CICIoT2023: A real-time dataset and benchmark for large-scale attacks in IoT environment," Sensor (2023) – (submitted to Journal of Sensors).

DATASET_DIRECTORY = 'dataset/'
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')] # all files
#df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('1-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv')] # smaller subset for faster testing, 17 files =  10% of whole dataset
#df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('11-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv')] # 2 files = 1%
df_sets.sort()
training_sets = df_sets[:int(len(df_sets)*.8)]
test_sets = df_sets[int(len(df_sets)*.8):]

In [5]:
#=====Extract Data=====
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
] #columns 0-45
Y_columns = 'label' #column 46

all_columns = X_columns+[Y_columns]

NUMERIC_FEATURE_NAMES = [
      'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
      'Rate', 'Srate', 'Drate', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 
      'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
      'Radius', 'Covariance', 'Variance', 'Weight', 
]
CATEGORICAL_FEATURE_NAMES = [
      'fin_flag_number', 'syn_flag_number',
      'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
      'ece_flag_number', 'cwr_flag_number','HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
      'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
]


In [6]:
#=====Feature Scaling======
# columnsToScale = ['flow_duration', 'Header_Length', 'Duration', 'Rate', 'Srate', 'Drate', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'fin_count']
# scale all
scaler = MinMaxScaler(feature_range=(0,1))
for train_set in tqdm(training_sets):
    df = pd.read_csv(DATASET_DIRECTORY + train_set, index_col=None, header=0, delimiter=',')[X_columns]
    x_train = scaler.fit(df)
    del df

100%|██████████| 135/135 [01:54<00:00,  1.18it/s]


## Define Model layers

In [75]:
verbose, epochs, batch_size = 0, 10, 512
activationFunction='relu'

# def getOtimizedSequentialModel():
#     model = Sequential()
#     model.add(Dense(46, activation=activationFunction))
#     model.add(Dense(30, activation=activationFunction))
#     model.add(Dense(8, activation='softmax'))
#     model.compile(loss=keras.losses.BinaryCrossentropy(),
#                     optimizer=keras.optimizers.Adam(learning_rate=1e-3), 
#                     metrics=[ keras.metrics.BinaryAccuracy(), keras.metrics.FalseNegatives()]
#                     )
#     return model

def getManyLayersModel():
    model = Sequential()
    model.add(Dense(46, activation=activationFunction))
    model.add(Dense(30, activation=activationFunction))
    model.add(Dense(20, activation=activationFunction))
    model.add(Dense(12, activation=activationFunction))
    model.add(Dense(8, activation='softmax'))
    model.compile(loss=keras.losses.BinaryCrossentropy(),
                    optimizer=keras.optimizers.Adam(learning_rate=1e-3), 
                    metrics=[ keras.metrics.BinaryAccuracy(), keras.metrics.FalseNegatives()]
                    )
    return model

def getManyLayersModel64():
    batch_size =  64
    model = Sequential()
    model.add(Dense(46, activation=activationFunction))
    model.add(Dense(30, activation=activationFunction))
    model.add(Dense(20, activation=activationFunction))
    model.add(Dense(12, activation=activationFunction))
    model.add(Dense(8, activation='softmax'))
    model.compile(loss=keras.losses.BinaryCrossentropy(),
                    optimizer=keras.optimizers.Adam(learning_rate=1e-3), 
                    metrics=[ keras.metrics.BinaryAccuracy(), keras.metrics.FalseNegatives()]
                    )
    return model

def getManyLayersModel32():
    batch_size =  32
    model = Sequential()
    model.add(Dense(46, activation=activationFunction))
    model.add(Dense(30, activation=activationFunction))
    model.add(Dense(20, activation=activationFunction))
    model.add(Dense(12, activation=activationFunction))
    model.add(Dense(8, activation='softmax'))
    model.compile(loss=keras.losses.BinaryCrossentropy(),
                    optimizer=keras.optimizers.Adam(learning_rate=1e-3), 
                    metrics=[ keras.metrics.BinaryAccuracy(), keras.metrics.FalseNegatives()]
                    )
    return model


ML_Models = [
            getManyLayersModel32(),
            getManyLayersModel64()


]
ML_Model_Names = [
            'ManyLayers32',
            'ManyLayers64',
            'Third',
            'Fourth'

]

# Train Models

In [76]:
print(f"Last ran on {len(ML_Models)} models, with {len(training_sets)} training sets on date: {datetime.datetime.now()}")
for train_set in tqdm(training_sets):
    df = pd.read_csv(DATASET_DIRECTORY + train_set, index_col=None, header=0, delimiter=',')[all_columns]
    x_train = scaler.transform(df[X_columns])
    y_train = to_categorical([ATTACKS_ENUM[dict_7classes[k]].value for k in df[Y_columns]], num_classes=8)

    for model in ML_Models:
            model.fit(x=x_train, 
                        y=y_train, 
                        epochs=epochs, 
                        verbose=verbose,
                        batch_size=batch_size)                   
    del df
    del x_train
    del y_train


Last ran on 3 models, with 135 training sets on date: 2024-05-31 22:54:25.001317


100%|██████████| 135/135 [1:16:27<00:00, 33.98s/it]


# Train a seperate model to detect each attack

In [77]:
# verbose, epochs, batch_size = 1, 100, 512
# activationFunction='relu'

# def getSequentialModel():
#     model = Sequential()
#     model.add(Dense(128, activation=activationFunction))
#     model.add(Dense(64, activation=activationFunction))
#     model.add(Dense(32, activation=activationFunction))
#     model.add(Dense(16, activation=activationFunction))
#     model.add(Dense(8, activation=activationFunction))
#     model.add(Dense(4, activation=activationFunction))
#     model.add(Dense(2, activation='softmax'))
#     model.compile(loss=keras.losses.BinaryCrossentropy(),
#                     optimizer=keras.optimizers.Adam(learning_rate=1e-3), 
#                     metrics=[ keras.metrics.BinaryAccuracy(), keras.metrics.FalseNegatives()]
#                     )
#     return model

# ML_Models = [
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel(),
#             getSequentialModel()

# ]
# ML_Model_Names = ATTACKS

In [78]:
# print(f"Last ran on {len(ML_Models)} models, with {len(training_sets)} training sets on date: {datetime.datetime.now()}")
# for train_set in tqdm(training_sets):
#     df = pd.read_csv(DATASET_DIRECTORY + train_set, index_col=None, header=0, delimiter=',')[all_columns]
#     x_train = scaler.transform(df[X_columns])

#     for i in range(len(ML_Models)-1):
#             y_train = to_categorical([ATTACKS_ENUM[dict_7classes[k]].value == ATTACKS_ENUM[ATTACKS[i]].value for k in df[Y_columns]], num_classes=2)
#             model = ML_Models[i]
#             model.fit(x=x_train, 
#                         y=y_train, 
#                         epochs=epochs, 
#                         verbose=verbose,
#                         batch_size=batch_size)   
#             del y_train             
#     del df
#     del x_train

In [79]:
# def showResults8Models(test, pred, model_num):
#     print(f"===== {model_num} =====")
#     print(classification_report(test, pred, target_names=["Negative", "Positive"]))
#     accuracy = accuracy_score(test, pred)
#     precision=precision_score(test, pred, average='weighted')
#     f1Score=f1_score(test, pred, average='weighted') 
#     print("Accuracy  : {}".format(accuracy))
#     print("Precision : {}".format(precision))
#     print("f1Score : {}".format(f1Score))
#     cm=confusion_matrix(test, pred)
#     print(cm) 

# print(f"Last ran on {len(ML_Models)} models, with {len(test_sets)} testing sets on date: {datetime.datetime.now()}")
# for i in range(len(ML_Models)):
#     model = ML_Models[i]
#     y_test = []
#     y_predict = []
#     for test_set in tqdm(test_sets):
#         df = pd.read_csv(DATASET_DIRECTORY + test_set, index_col=None, header=0, delimiter=',')[all_columns]
#         x_test = scaler.transform(df[X_columns])
#         for k in df[Y_columns]:
#             y_test.append(ATTACKS_ENUM[dict_7classes[k]].value==ATTACKS[i])
#         y_predict+= list(model.predict(x_test))

#         del df
#         del x_test

#     myarr = np.array([ATTACKS_ENUM[dict_7classes[k]].value == ATTACKS_ENUM[ATTACKS[0]].value for k in ['DDoS-RSTFINFlood','DDoS-PSHACK_Flood','DDoS-SYN_Flood','DoS-SYN_Flood','DoS-TCP_Flood','Mirai-udpplain','Recon-OSScan','DNS_Spoofing','BrowserHijacking','Backdoor_Malware','DictionaryBruteForce']])
#     print(myarr)
#     print(to_categorical(myarr, num_classes=2))
#     y_test=np.array(y_test)
#     print(y_test[0:10])
#     y_test = to_categorical(y_test, num_classes=2)
#     print(y_test[0:10])
#     print("=========")
#     for i in range(10):
#         print(f"{i}: {y_predict[i]} actual {y_test[i]}")    

#     test = np.argmax(y_test, axis=1)
#     predict = np.argmax(y_predict, axis=1)
#     showResults8Models(test, predict, i)

#     del test
#     del predict
#     del y_test
#     del y_predict

# Test the resulting trained models

In [80]:
# def saveOutput(output, model_name):
#     savepath = f"\outputs\{model_name}.txt"
#     output=output
#     %store output >>"\outputs\{}.txt".format(model_name)

In [8]:
def showResults(test, pred,model_name):
    output = ''
    output += str(datetime.datetime.now())
    output += f"\n===== {model_name} =====\n"
    output+=classification_report(test, pred, target_names=ATTACKS)
    accuracy = accuracy_score(test, pred)
    precision=precision_score(test, pred, average='weighted')
    f1Score=f1_score(test, pred, average='weighted') 
    output+=f"\nAccuracy  : {accuracy}\n"
    output+=f"Precision : {precision}\n"
    output+=f"f1Score : {f1Score}\n"
    cm=confusion_matrix(test, pred)
    output+=str(cm) 
    # try:
    #     %store output >>"outputs\output.txt" # instead of data from output variable, for whatever reason 'test\n' gets appended instead
    # except Exception as e:
    #     print("error saving to file")
    #     print(e)

    print(output)

In [10]:
def testModel(model, model_name):
    y_test = []
    y_predict = []
    for test_set in tqdm(test_sets):
        df = pd.read_csv(DATASET_DIRECTORY + test_set, index_col=None, header=0, delimiter=',')[all_columns]
        x_test = scaler.transform(df[X_columns])
        for k in df[Y_columns]:
            y_test.append(ATTACKS_ENUM[dict_7classes[k]].value)
        y_predict+= list(model.predict(x_test, verbose=0))

        del df
        del x_test

    y_test=np.array(y_test)
    y_test = to_categorical(y_test, num_classes=8)
    test = np.argmax(y_test, axis=1)
    predict = np.argmax(y_predict, axis=1)
    showResults(test, predict, model_name)

    del test
    del predict
    del y_test
    del y_predict

In [None]:
print(f"Last ran on {len(ML_Models)} models, with {len(test_sets)} testing sets on date: {datetime.datetime.now()}")
for i in range(len(ML_Models)):
    testModel(ML_Models[i],ML_Model_Names[i])

In [11]:
#====For debug ===
# test_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')] 
testModel(load_model("SavedModels\\ManyLayers32.keras"), "ManyLayers batchsize: 32 - from file")

100%|██████████| 34/34 [05:40<00:00, 10.01s/it]


2024-06-01 19:16:11.751535
===== ManyLayers batchsize: 32 - from file =====
              precision    recall  f1-score   support

        DDoS       1.00      0.99      0.99   7526151
         DoS       0.96      0.99      0.98   1792167
       Mirai       1.00      0.99      1.00    583677
       Recon       0.74      0.59      0.66     78630
    Spoofing       0.73      0.73      0.73    107798
      Benign       0.84      0.91      0.87    243322
         Web       0.00      0.00      0.00      5433
  BruteForce       0.98      0.13      0.22      2983

    accuracy                           0.98  10340161
   macro avg       0.78      0.67      0.68  10340161
weighted avg       0.98      0.98      0.98  10340161

Accuracy  : 0.9831268584696118
Precision : 0.9827566180458189
f1Score : 0.9826927922432303
[[7458558   66720     155     644      47      27       0       0]
 [  11747 1779509     270     540      95       6       0       0]
 [   1626    1569  580136     319      22       

# Save Models

In [90]:
for i in range(len(ML_Models)):
    ML_Models[i].save(f"SavedModels\\{ML_Model_Names[i]}.keras",overwrite=True)
    #del ML_Models[i]

# Random Forest Decision Tree

In [None]:
# Tree Visualisation
# from sklearn.tree import export_graphviz
# from IPython.display import Image
# import graphviz

NUM_TREES = 100
criterion = "gini" #“gini”, “entropy”, “log_loss”
MAX_DEPTH = None
MIN_SAMPLES = 2
MIN_SAMPLES_LEAF = 1
MIN_WEIGHT_FRACTION_LEAF=0.0,
MAX_FEATURES='sqrt',
MAX_LEAF_NODES=None,
MIN_IMPURITY_DECREASE=0.0,
BOOTSTRAP=True, 
OOB_SCORE=False, 

verbose = 0
epochs = 100
batch_size = 512


rf = RandomForestClassifier(
    n_estimators=NUM_TREES,
    criterion = criterion,
    max_depth=MAX_DEPTH,
    )

for train_set in tqdm(training_sets):
    df = pd.read_csv(DATASET_DIRECTORY + train_set, index_col=None, header=0, delimiter=',')[all_columns]
    x_train = scaler.transform(df[X_columns])
    y_train = to_categorical([ATTACKS_ENUM[dict_7classes[k]].value for k in df[Y_columns]], num_classes=8)

    rf.fit(x_train, y_train)      

    del df
    del x_train
    del y_train

In [86]:
# def specify_feature_usages():
#     feature_usages = []

#     for feature_name in NUMERIC_FEATURE_NAMES:
#         feature_usage = tfdf.keras.FeatureUsage(
#             name=feature_name, semantic=tfdf.keras.FeatureSemantic.NUMERICAL
#         )
#         feature_usages.append(feature_usage)

#     for feature_name in CATEGORICAL_FEATURE_NAMES:
#         feature_usage = tfdf.keras.FeatureUsage(
#             name=feature_name, semantic=tfdf.keras.FeatureSemantic.CATEGORICAL
#         )
#         feature_usages.append(feature_usage)

#     return feature_usages


# def create_gbt_model():
#     # See all the model parameters in https://www.tensorflow.org/decision_forests/api_docs/python/tfdf/keras/GradientBoostedTreesModel
#     gbt_model = tfdf.keras.GradientBoostedTreesModel(
#         features=specify_feature_usages(),
#         exclude_non_specified_features=True,
#         num_trees=NUM_TREES,
#         max_depth=MAX_DEPTH,
#         min_examples=MIN_EXAMPLES,
#         subsample=SUBSAMPLE,
#         validation_ratio=VALIDATION_RATIO,
#         task=tfdf.keras.Task.CLASSIFICATION,
#     )

#     gbt_model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
#     return gbt_model

In [87]:
# model = tfdf.keras.RandomForestModel()
# for train_set in tqdm(training_sets):
#     tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(train_set,label="species")





In [88]:
# #======== Cross Validation ===========
# skf = StratifiedKFold(n_splits=5, shuffle=True)
# skf.get_n_splits(xTrain, yTrain)
# foldNum=0
# for train_index, val_index in skf.split(xTrain, yTrain):
#     foldNum+=1
#     print("Results for fold",foldNum)
#     X_train, X_val = X[train_index], X[val_index]
#     Y_train, Y_val = Y[train_index], Y[val_index]
    
#     # one hot encode
#     Y_train = to_categorical(Y_train)
#     Y_val = to_categorical(Y_val)
    
#     history = model.fit(X_train, Y_train, 
#                         validation_data = (X_val, Y_val), 
#                         epochs=epochs, 
#                         batch_size=batch_size)  
#     yPredict = model.predict(X_val)

#     #Converting one hot encoded test label to label    
#     pred = np.argmax(yPredict, axis=1)
#     val = np.argmax(Y_val, axis=1)
    
#     showResults(val, pred)

In [89]:
#============Test Phase============
# yPred = model.predict(xTest)
# yTest = to_categorical(yTest)
# pred = np.argmax(yPred, axis=1)
# test = np.argmax(yTest, axis=1)
# showResults(test, pred)