In [1]:
%cd ../..

C:\Users\bram_\home\msc


# Single flow - Multi Layer Perceptron classification of CIDDS-001 OpenStack data
The implementation is based on the paper **Intelligent Cyber Attack Detetion and Classification for Network-Based Intrusion Detection Systems**. Although the accuracy results are very good (more than 99%), the confusion matrix shows a big drawback: the attack classes that are underrepresented in the training dataset, are not well classified. This is not suprising, but this is not mentioned in the paper itself.

In [2]:
# TensorFlow and tf.keras
import tensorflow as tf

# Helper libraries
import pandas as pd
import numpy as np
from utils import cidds_001 as utils
from datetime import datetime

print(tf.__version__)

2.2.0


## Load datasets

In [3]:
week1 = utils.load_internal_week1()
week2 = utils.load_internal_week2()

## Preprocess datasets
* make sure that the type of column `date_first_seen` is a datetime
* only use flows between `2017-03-17 14:18:05` and `2017-03-20 17:42:17` as stated in the paper
* split the features and the labels and encode the labels with one hot encoding

In [4]:
week1['date_first_seen'] = pd.to_datetime(week1['date_first_seen'])
week2['date_first_seen'] = pd.to_datetime(week2['date_first_seen'])

In [5]:
start_date = datetime.strptime('2017-03-17 14:18:05', '%Y-%m-%d %H:%M:%S')
end_date = datetime.strptime('2017-03-20 17:42:17', '%Y-%m-%d %H:%M:%S')

In [6]:
cond = (week1['date_first_seen'] >= start_date) & (week1['date_first_seen'] <= end_date)
week1_selection = week1.where(cond).dropna()

In [7]:
week1_x = week1_selection.drop(columns=utils.columns_to_drop + ['attack_type'])
week1_y = pd.get_dummies(week1_selection['attack_type'])

week2_x = week2.drop(columns=utils.columns_to_drop + ['attack_type'])
week2_y = pd.get_dummies(week2['attack_type'])

## Inspect training data

In [8]:
pd.DataFrame(week1_selection.groupby(by='attack_type').size(), columns=['count'])

Unnamed: 0_level_0,count
attack_type,Unnamed: 1_level_1
---,2092587
bruteForce,1262
dos,390932
pingScan,1068
portScan,50136


## Inspect test data

In [9]:
pd.DataFrame(week2.groupby(by='attack_type').size(), columns=['count'])

Unnamed: 0_level_0,count
attack_type,Unnamed: 1_level_1
---,8515329
bruteForce,3366
dos,1706900
pingScan,2731
portScan,82407


## Normalize features

In [10]:
params = utils.min_max_normalization(week1_x, utils.columns_to_normalize)
utils._min_max_normalization_with_given_params(week2_x, utils.columns_to_normalize, params)

## Create and compile the MLP model

In [11]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(16,)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [12]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

## Transform feature and label DataFrames to numpy arrays

In [13]:
x = week1_x.to_numpy()
y = week1_y.to_numpy()

## Fit the model with the training data

In [14]:
model.fit(x=x, y=y,
          batch_size=1024,
          epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1c100644940>

## Test the fitted model with the data of week2

In [15]:
x = week2_x.to_numpy()
y = week2_y.to_numpy()

In [16]:
test_loss, test_acc = model.evaluate(x,  y, verbose=2)

print('\nTest accuracy:', test_acc)

322211/322211 - 248s - loss: 0.0219 - accuracy: 0.9963

Test accuracy: 0.9963391423225403


## Create a confusion matrix of the predictions on the test data

In [17]:
pred_y = model.predict(x)

In [18]:
y_lbl = np.argmax(y, axis=1)
pred_y_lbl = np.argmax(pred_y, axis=1)

In [19]:
tf.math.confusion_matrix(labels=y_lbl, predictions=pred_y_lbl, num_classes=5)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[8493346,    1730,   12897,       0,    7356],
       [   1586,     484,       0,       0,    1296],
       [   1712,       0, 1705181,       0,       7],
       [    243,       0,       0,       7,    2481],
       [   8392,      46,       0,       0,   73969]])>