In [1]:
# Move two directories up so the relative paths and the `utils` import used in later cells resolve.
# NOTE(review): the path is relative, so re-running this cell without restarting the
# kernel moves up two more levels each time — run it once per kernel session.
%cd ../..

C:\Users\bram_\home\msc


# Single flow - Multi Layer Perceptron classification of CIDDS-001 OpenStack data
The implementation is based on the paper **Intelligent Cyber Attack Detection and Classification for Network-Based Intrusion Detection Systems**. Unlike in the paper, however, the training data used here is balanced. The data of the first and second week are joined together to obtain the largest possible dataset to train with.

In [2]:
# TensorFlow and tf.keras
import tensorflow as tf

# Helper libraries
import pandas as pd
import numpy as np
from utils import cidds_001 as utils

from sklearn.model_selection import train_test_split

print(tf.__version__)

2.2.0


## Load datasets

In [3]:
# Week 1: read the cleaned flows, shuffle them with a fixed seed and renumber the rows.
week1 = (
    pd.read_feather('saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week1-cleaned.feather')
    .sample(frac=1, random_state=13)
    .reset_index(drop=True)
)

# Week 2: same treatment as week 1.
week2 = (
    pd.read_feather('saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week2-cleaned.feather')
    .sample(frac=1, random_state=13)
    .reset_index(drop=True)
)

## Preprocess datasets
* Both datasets week1 and week2 are used to obtain as much malicious traffic of the underrepresented attack types as possible.
* Split the features and labels of the new dataset and encode the labels with one hot encoding.

In [4]:
# Split week 1 into one frame per traffic class ('---' marks benign flows).
# where() turns every non-matching row into NaN and dropna() then removes those rows.
# NOTE(review): dropna() also removes any row that already contained a NaN, and the
# NaN masking may upcast integer columns — confirm this is intended.
week1_benign, week1_portScan, week1_dos, week1_pingScan, week1_bruteForce = (
    week1.where(week1['attack_type'] == label).dropna()
    for label in ('---', 'portScan', 'dos', 'pingScan', 'bruteForce')
)

In [5]:
# Report how many week 1 flows belong to each class (one line per class).
print(
    f'len(week1_benign) = {len(week1_benign)}',
    f'len(week1_portScan) = {len(week1_portScan)}',
    f'len(week1_dos) = {len(week1_dos)}',
    f'len(week1_pingScan) = {len(week1_pingScan)}',
    f'len(week1_bruteForce) = {len(week1_bruteForce)}',
    sep='\n',
)

len(week1_benign) = 7010897
len(week1_portScan) = 183511
len(week1_dos) = 1252127
len(week1_pingScan) = 3359
len(week1_bruteForce) = 1626


In [6]:
# Split week 2 into one frame per traffic class ('---' marks benign flows).
# where() turns every non-matching row into NaN and dropna() then removes those rows.
# NOTE(review): dropna() also removes any row that already contained a NaN, and the
# NaN masking may upcast integer columns — confirm this is intended.
week2_benign, week2_portScan, week2_dos, week2_pingScan, week2_bruteForce = (
    week2.where(week2['attack_type'] == label).dropna()
    for label in ('---', 'portScan', 'dos', 'pingScan', 'bruteForce')
)

In [7]:
# Report how many week 2 flows belong to each class (one line per class).
print(
    f'len(week2_benign) = {len(week2_benign)}',
    f'len(week2_portScan) = {len(week2_portScan)}',
    f'len(week2_dos) = {len(week2_dos)}',
    f'len(week2_pingScan) = {len(week2_pingScan)}',
    f'len(week2_bruteForce) = {len(week2_bruteForce)}',
    sep='\n',
)

len(week2_benign) = 8515329
len(week2_portScan) = 82407
len(week2_dos) = 1706900
len(week2_pingScan) = 2731
len(week2_bruteForce) = 3366


In [8]:
# Number of flows drawn per week from each well-represented class.
SAMPLES_PER_WEEK = 3000

# Rare classes: keep every pingScan and bruteForce flow of both week1 and week2.
rare_class_frames = [
    week1_pingScan, week2_pingScan,
    week1_bruteForce, week2_bruteForce,
]

# Well-represented classes (benign, dos, portScan): only a small random sample
# per week is used so that the resulting dataset is roughly balanced.
common_class_frames = [
    week1_benign, week2_benign,
    week1_dos, week2_dos,
    week1_portScan, week2_portScan,
]

dataset = (
    pd.concat(
        rare_class_frames
        + [
            frame.sample(frac=1, random_state=13).head(SAMPLES_PER_WEEK)
            for frame in common_class_frames
        ]
    )
    # Shuffle so the classes are interleaved instead of stacked block by block.
    .sample(frac=1, random_state=13)
    # week1 and week2 were each renumbered from 0 after loading, so the concatenated
    # frame carries duplicate index labels; renumber to get a unique index again.
    .reset_index(drop=True)
)

In [9]:
# Sanity check: total number of flows in the combined dataset.
print(f'len(dataset) = {len(dataset)}')

len(dataset) = 29082


In [10]:
# Flows per class in the combined dataset, shown as a one-column table.
dataset.groupby(by='attack_type').size().to_frame(name='count')

Unnamed: 0_level_0,count
attack_type,Unnamed: 1_level_1
---,6000
bruteForce,4992
dos,6000
pingScan,6090
portScan,6000


In [11]:
# Features: everything except the columns listed in utils.columns_to_drop and the label itself.
data_x = dataset.drop(columns=[*utils.columns_to_drop, 'attack_type'])

# Labels: one-hot encoding of the attack type, one column per class.
data_y = pd.get_dummies(dataset.loc[:, 'attack_type'])

## Normalize features

In [12]:
# Z-score normalize the feature columns listed in utils.columns_to_normalize.
# The return value is discarded, so this presumably modifies data_x in place — TODO confirm in utils.
# NOTE(review): this runs before the train/test split, so the normalization statistics
# are likely computed over rows that later end up in the test set (possible leakage) — confirm.
_ = utils.z_score_normalization(data_x, utils.columns_to_normalize)

## Split training and testing datasets

In [13]:
# Hold out 20% of the flows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=13,
)

## Create and compile the MLP model

In [14]:
# Seed TensorFlow's global RNG (same seed as the pandas/sklearn steps) so that weight
# initialisation, dropout and batch shuffling are repeatable across kernel restarts.
# Some GPU kernels may still introduce small run-to-run differences.
tf.random.set_seed(13)

# Derive the layer widths from the data instead of hard-coding them, so the model
# stays valid if the feature set or the number of classes changes.
n_features = x_train.shape[1]
n_classes = y_train.shape[1]

# MLP: two hidden ReLU layers of 100 units, each followed by 20% dropout,
# and a softmax output with one unit per traffic class.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_features,)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(n_classes, activation='softmax'),
])

In [15]:
# Adam optimizer with categorical cross-entropy, which matches the one-hot encoded labels;
# accuracy is tracked as the reporting metric.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

## Fit the MLP model

In [16]:
# Convert the training features and one-hot labels to NumPy arrays for Keras.
x, y = x_train.to_numpy(), y_train.to_numpy()

In [17]:
# Train for 50 epochs in mini-batches of 1024 flows; the returned History object
# is the cell's displayed value.
model.fit(
    x,
    y,
    batch_size=1024,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x22353f1d940>

## Test the fitted model with unseen data

In [18]:
# Convert the held-out features and one-hot labels to NumPy arrays for Keras.
# NOTE(review): this rebinds `x` and `y`, which previously held the training arrays;
# re-running the fit cell after this one would train on the test split.
x, y = x_test.to_numpy(), y_test.to_numpy()

In [19]:
# Score the fitted model on the held-out split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
test_loss, test_acc = model.evaluate(x=x, y=y, verbose=2)
print('\nTest accuracy:', test_acc)

182/182 - 0s - loss: 0.2037 - accuracy: 0.9347

Test accuracy: 0.9346742033958435


## Create a confusion matrix for the predictions on the test data

In [20]:
# Per-class softmax scores for every flow currently held in `x`.
pred_y = model.predict(x)

In [21]:
# Collapse the one-hot truth to integer class indices.
y_lbl = y.argmax(axis=1)
# Pick the highest-scoring class for every prediction.
pred_y_lbl = pred_y.argmax(axis=1)

In [22]:
# Confusion matrix over the 5 classes: rows are the true labels, columns the predicted labels.
tf.math.confusion_matrix(
    labels=y_lbl,
    predictions=pred_y_lbl,
    num_classes=5,
)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[1093,   86,   16,   11,    4],
       [  13,  989,    0,    5,   11],
       [   0,    0, 1158,    0,    0],
       [  28,   32,    0, 1170,    2],
       [  12,  120,    0,   40, 1027]])>

In [23]:
# Show the class order of the one-hot columns, which is also the row/column order of the confusion matrix.
print(f'Columns of data_y (and confusion matrix): {data_y.columns}')

Columns of data_y (and confusion matrix): Index(['---', 'bruteForce', 'dos', 'pingScan', 'portScan'], dtype='object')
