In [1]:
# Move the working directory three levels up (the recorded output shows where
# this landed). Presumably this is the project root that the relative
# `saved_dfs/...` paths and the `utils` import used later rely on -- confirm.
# NOTE: not idempotent -- re-running this cell climbs another three levels.
%cd ../../..

c:\Users\bram_\home\msc


# The big test: remove all but one bruteForce flow from week1 and all but one pingScan flow from week2

In [2]:
# from IPython.display import clear_output

# tensorflow_federated_nightly also brings in tf_nightly, which
# can cause a duplicate tensorboard install, leading to errors.
# !pip uninstall --yes tensorboard tb-nightly

# !pip install --upgrade tensorflow-federated-nightly
# !pip install --upgrade nest-asyncio
# !pip install --upgrade tb-nightly

import nest_asyncio
nest_asyncio.apply()

In [3]:
# TensorFlow and tf.keras
import tensorflow as tf
import tensorflow_federated as tff

# Helper libraries
import numpy as np
import pandas as pd
import collections

from sklearn.model_selection import train_test_split

from utils import cidds_001 as utils

print(tf.__version__)

2.2.0


# Prepare the datasets

## Load and shuffle datasets

In [4]:
# Read the cleaned internal-traffic flows of both weeks from disk.
week1 = pd.read_feather(
    'saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week1-cleaned.feather'
)
week2 = pd.read_feather(
    'saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week2-cleaned.feather'
)

# Shuffle every row of each week with a fixed seed (reproducible order) and
# renumber the rows 0..n-1 so the old positions do not linger in the index.
week1_shuffled = week1.sample(frac=1, random_state=13)
week1_shuffled = week1_shuffled.reset_index(drop=True)
week2_shuffled = week2.sample(frac=1, random_state=13)
week2_shuffled = week2_shuffled.reset_index(drop=True)

## Normalize datasets

In [5]:
# Apply the project's min-max normalisation to the columns listed in
# `utils.columns_to_normalize`, once per week.
# The return value is discarded (`_`), yet the shuffled frames are used
# afterwards, so the helper presumably scales them in place -- confirm in utils.
# NOTE(review): the two weeks are normalised by separate calls; if the helper
# derives min/max from the frame it receives, the two clients end up on
# different scales -- verify this is intended.
_ = utils.min_max_normalization(week1_shuffled, columns_to_normalize=utils.columns_to_normalize)
_ = utils.min_max_normalization(week2_shuffled, columns_to_normalize=utils.columns_to_normalize)

## Remove all but one `bruteForce` flow from week1 and all but one `pingScan` flow from week2

In [6]:
def keep_single_flow(flows, attack_type):
    """Return `flows` with every row of `attack_type` removed except one.

    Rows are selected with boolean indexing instead of `.where(...).dropna()`:
    `.where` first builds a full-size copy padded with NaN (upcasting integer
    and boolean columns) only for `dropna` to throw most of it away again.
    The explicit `dropna()` calls keep the original side effect of discarding
    any row that contains a missing value, so the selected rows are unchanged.

    Parameters
    ----------
    flows : pandas.DataFrame
        Flow records with an `attack_type` column.
    attack_type : str
        Label of the attack class to reduce to a single flow.

    Returns
    -------
    (reduced, single) : tuple of pandas.DataFrame
        `reduced` holds all other flows followed by the one retained flow,
        with a fresh 0..n-1 index; `single` is the retained flow on its own.
    """
    is_target = flows['attack_type'] == attack_type
    single = flows[is_target].dropna().head(n=1).reset_index(drop=True)
    remaining = flows[~is_target].dropna()
    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
    # supported replacement. `ignore_index=True` avoids the duplicate index
    # label 0 that appending the single row used to create.
    reduced = pd.concat([remaining, single], ignore_index=True)
    return reduced, single


# Remove all but one of the flows of bruteForce from week1 dataset
week1_excl, week1_one_brute = keep_single_flow(week1_shuffled, 'bruteForce')

# Remove all but one of the flows of pingScan from week2 dataset
week2_excl, week2_one_ping = keep_single_flow(week2_shuffled, 'pingScan')

In [7]:
# Flows per attack type in week1 after the removal (bruteForce should be 1).
week1_excl.groupby('attack_type').size().reset_index(name='count')

Unnamed: 0,attack_type,count
0,---,7010897
1,bruteForce,1
2,dos,1252127
3,pingScan,3359
4,portScan,183511


In [8]:
# Flows per attack type in week2 after the removal (pingScan should be 1).
week2_excl.groupby('attack_type').size().reset_index(name='count')

Unnamed: 0,attack_type,count
0,---,8515329
1,bruteForce,3366
2,dos,1706900
3,pingScan,1
4,portScan,82407


## Create balanced datasets (ignoring the entries of the excluded attack type)

In [9]:
# Build a class-balanced subset of each week with the project helper.
# Judging by the class counts displayed in the cells that follow,
# `idx_min_n_after_argsort=1` makes the helper size every class to the
# second-smallest class count, so the single leftover flow of the excluded
# attack type does not shrink everything to one sample -- confirm in utils.
week1_excl_balanced, week2_excl_balanced = (
    utils.get_balanced_cidds(weekly_flows, idx_min_n_after_argsort=1)
    for weekly_flows in (week1_excl, week2_excl)
)

### Confirm having created a balanced dataset

In [10]:
# Flows per attack type in the balanced week1 subset.
week1_excl_balanced.groupby('attack_type').size().reset_index(name='count')

Unnamed: 0,attack_type,count
0,---,3359
1,bruteForce,1
2,dos,3359
3,pingScan,3359
4,portScan,3359


In [11]:
# Flows per attack type in the balanced week2 subset.
week2_excl_balanced.groupby('attack_type').size().reset_index(name='count')

Unnamed: 0,attack_type,count
0,---,3366
1,bruteForce,3366
2,dos,3366
3,pingScan,1
4,portScan,3366


## Split datasets into training and test sets and one-hot encode the labels

In [12]:
# week 1
# Features: everything except the project's drop-list and the label column.
x_week1_excl = week1_excl_balanced.drop(columns=utils.columns_to_drop + ['attack_type'])
# Labels: one column per attack type.
y_week1_excl = pd.get_dummies(week1_excl_balanced['attack_type'])
# Remember the label column order; index i of a one-hot vector is ohe_columns[i].
ohe_columns = y_week1_excl.columns
# 80/20 split with a fixed seed for reproducibility.
x_train_week1, x_test_week1, y_train_week1, y_test_week1 = train_test_split(
    x_week1_excl, y_week1_excl, test_size=0.2, random_state=13)

# week 2
x_week2_excl = week2_excl_balanced.drop(columns=utils.columns_to_drop + ['attack_type'])
y_week2_excl = pd.get_dummies(week2_excl_balanced['attack_type'])

# Both clients feed the same model and their test sets are concatenated later,
# so the feature columns and the one-hot label columns must line up exactly.
# `get_dummies` only emits the classes it sees, so a class missing from one
# week would silently shift the label indices; fail loudly instead.
assert list(x_week2_excl.columns) == list(x_week1_excl.columns), \
    'week1 and week2 feature columns differ'
assert list(y_week2_excl.columns) == list(ohe_columns), \
    'week1 and week2 one-hot label columns differ'

x_train_week2, x_test_week2, y_train_week2, y_test_week2 = train_test_split(
    x_week2_excl, y_week2_excl, test_size=0.2, random_state=13)

## Convert features and labels to numpy arrays

In [13]:
# Replace each pandas split by its NumPy array, which is what
# `tf.data.Dataset.from_tensor_slices` is fed with further down.
# NOTE: the names are rebound, so this cell cannot be re-run on its own
# (arrays have no `.to_numpy()`); re-run the split cell first.

# week 1
x_train_week1, x_test_week1, y_train_week1, y_test_week1 = (
    split.to_numpy()
    for split in (x_train_week1, x_test_week1, y_train_week1, y_test_week1)
)

# week 2
x_train_week2, x_test_week2, y_train_week2, y_test_week2 = (
    split.to_numpy()
    for split in (x_train_week2, x_test_week2, y_train_week2, y_test_week2)
)

# Preparation for the federated part

## Create the `federated_train_data`

In [14]:
# Settings of the per-client input pipeline built by `preprocess`.
NUM_EPOCHS = 5          # times a client's data is repeated
BATCH_SIZE = 20         # examples per batch
SHUFFLE_BUFFER = 100    # size of the shuffle buffer
PREFETCH_BUFFER = 10    # batches prefetched ahead of the consumer


def preprocess(dataset):
    """Turn a dataset of (features, label) pairs into named training batches.

    The dataset is repeated NUM_EPOCHS times, shuffled through a buffer of
    SHUFFLE_BUFFER elements, grouped into batches of BATCH_SIZE, mapped to an
    OrderedDict with keys 'x' and 'y', and finally prefetched.

    Parameters
    ----------
    dataset : object offering the tf.data.Dataset chaining methods
        (`repeat`, `shuffle`, `batch`, `map`, `prefetch`).

    Returns
    -------
    The dataset returned by the final `prefetch` call.
    """
    def to_named_batch(features, labels):
        # Keys are created in the order 'x', then 'y'.
        return collections.OrderedDict([('x', features), ('y', labels)])

    repeated = dataset.repeat(NUM_EPOCHS)
    shuffled = repeated.shuffle(SHUFFLE_BUFFER)
    batched = shuffled.batch(BATCH_SIZE)
    named = batched.map(to_named_batch)
    return named.prefetch(PREFETCH_BUFFER)

In [15]:
def make_client_data():
    """Wrap the two weekly training sets as a TFF ClientData object.

    Week1's training arrays become client 'client_1' and week2's become
    'client_2'; each client's data is a tf.data.Dataset of
    (features, one-hot label) pairs sliced from the module-level arrays.

    Returns
    -------
    The object produced by `tff.simulation.ClientData.from_clients_and_fn`.
    """
    train_arrays_by_client = {
        'client_1': (x_train_week1, y_train_week1),
        'client_2': (x_train_week2, y_train_week2),
    }
    datasets_by_client = {
        client_id: tf.data.Dataset.from_tensor_slices(arrays)
        for client_id, arrays in train_arrays_by_client.items()
    }

    def dataset_for(client_id):
        # Plain lookup: every client maps to its pre-built dataset.
        return datasets_by_client[client_id]

    return tff.simulation.ClientData.from_clients_and_fn(
        client_ids=list(datasets_by_client),
        create_tf_dataset_for_client_fn=dataset_for,
    )

In [16]:
def make_federated_data(client_data, client_ids):
    """Build one preprocessed dataset per requested client.

    Parameters
    ----------
    client_data : object exposing `create_tf_dataset_for_client(client_id)`.
    client_ids : iterable of client identifiers to include.

    Returns
    -------
    list
        The `preprocess`-ed dataset of every client, in `client_ids` order.
    """
    federated_data = []
    for client_id in client_ids:
        raw_dataset = client_data.create_tf_dataset_for_client(client_id)
        federated_data.append(preprocess(raw_dataset))
    return federated_data

In [17]:
# Wrap both weekly training sets as TFF client data, then preprocess every
# client's dataset; the resulting list (one dataset per client) is what each
# federated training round consumes.
client_data = make_client_data()
client_ids = client_data.client_ids
federated_train_data = make_federated_data(client_data, client_ids)

## Obtain the element_spec of the input that the federated model will receive

In [18]:
# Use the first client's preprocessed dataset as a representative example and
# read its element_spec; `model_fn` hands this to TFF as the model's input_spec.
preprocessed_example_data = federated_train_data[0]
tff_input_element_spec = preprocessed_example_data.element_spec

## Create functions to create the TFF model

In [19]:
def create_keras_model():
    """Create the (uncompiled) classifier used by every client and the server.

    Architecture: 16 input features, two hidden blocks of
    Dense(100, relu) followed by Dropout(0.2), and a 5-way softmax output.

    Returns
    -------
    tf.keras.Sequential
        A freshly initialised model.
    """
    layers = [tf.keras.layers.Input(shape=(16,))]
    # Two identical hidden blocks, created in order.
    for _ in range(2):
        layers.append(tf.keras.layers.Dense(100, activation='relu'))
        layers.append(tf.keras.layers.Dropout(rate=0.2))
    layers.append(tf.keras.layers.Dense(5, activation='softmax'))
    return tf.keras.Sequential(layers)

In [20]:
def model_fn():
    """Build a fresh TFF model around a new Keras model.

    A new Keras model, loss and metric objects are constructed on every call.
    The input spec is the module-level `tff_input_element_spec`.

    Returns
    -------
    The object returned by `tff.learning.from_keras_model`.
    """
    fresh_model = create_keras_model()
    loss_fn = tf.keras.losses.CategoricalCrossentropy()
    tracked_metrics = [
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.CategoricalCrossentropy(),
    ]
    return tff.learning.from_keras_model(
        fresh_model,
        input_spec=tff_input_element_spec,
        loss=loss_fn,
        metrics=tracked_metrics,
    )

## Train the model on federated data

In [21]:
def make_client_optimizer():
    """Return a new SGD optimizer (learning rate 0.02) for client updates."""
    return tf.keras.optimizers.SGD(learning_rate=0.02)


def make_server_optimizer():
    """Return a new SGD optimizer (learning rate 1.0) for the server update."""
    return tf.keras.optimizers.SGD(learning_rate=1.0)


# Federated Averaging process: TFF calls the factories itself, so it receives
# the functions rather than ready-made model/optimizer objects.
iterative_process = tff.learning.build_federated_averaging_process(
    model_fn=model_fn,
    client_optimizer_fn=make_client_optimizer,
    server_optimizer_fn=make_server_optimizer,
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [22]:
# Show the TFF type signature of `initialize`: it takes no arguments and
# returns the server-placed state (model weights, optimizer state, ...).
str(iterative_process.initialize.type_signature)

'( -> <model=<trainable=<float32[16,100],float32[100],float32[100,100],float32[100],float32[100,5],float32[5]>,non_trainable=<>>,optimizer_state=<int64>,delta_aggregate_state=<value_sum_process=<>,weight_sum_process=<>>,model_broadcast_state=<>>@SERVER)'

In [23]:
# Create the initial server state that the training rounds start from.
state = iterative_process.initialize()

In [24]:
# Number of federated rounds to run.
NUM_ROUNDS = 20

# Each round feeds the current server state and all client datasets to the
# process and gets back the updated state plus that round's metrics.
# NOTE: `state` is overwritten, so re-running this cell continues training
# from the current state instead of starting over.
for round_num in range(NUM_ROUNDS):
    state, metrics = iterative_process.next(state, federated_train_data)
    print('round {:2d}, metrics={}'.format(round_num + 1, metrics))

round  1, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.8523117), ('categorical_crossentropy', 0.42880362), ('loss', 0.42880362)]))])
round  2, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.89009804), ('categorical_crossentropy', 0.30181763), ('loss', 0.30181763)]))])
round  3, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.8998095), ('categorical_crossentropy', 0.27551922), ('loss', 0.27551922)]))])
round  4, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.9079411), ('categ

# Test the global model

In [25]:
# Build a fresh Keras model with the training architecture and copy the
# weights held in the final federated server state into it, so the regular
# Keras API can be used for prediction.
model = create_keras_model()
state.model.assign_weights_to(model)

## Concatenate test sets from week1 and week2 to obtain a bigger test set

In [26]:
# Stack week1's held-out rows on top of week2's (row-wise, features and
# labels in the same order so they stay aligned).
test_x = np.concatenate((x_test_week1, x_test_week2), axis=0)
test_y = np.concatenate((y_test_week1, y_test_week2), axis=0)

## Assign the federated trained weights to a model that can be used

## Predict the test set and create a confusion matrix

In [27]:
# Predicted class probabilities (softmax output) for every test row.
pred_y = model.predict(test_x)

In [28]:
# Collapse one-hot labels and predicted probabilities to class indices.
y_lbl = test_y.argmax(axis=1)
pred_y_lbl = pred_y.argmax(axis=1)

In [29]:
# 5x5 confusion matrix on the test set: rows are the true classes, columns the
# predicted classes, indexed in `ohe_columns` order (week1's one-hot column
# order; week2's is assumed to match).
tf.math.confusion_matrix(labels=y_lbl, predictions=pred_y_lbl, num_classes=5)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[1322,    0,   17,    0,   15],
       [  76,    9,    8,    0,  585],
       [   0,    0, 1330,    0,    0],
       [  13,    0,    0,    1,  692],
       [  34,    0,    0,    0, 1279]])>

In [30]:
# Number of rows in the combined test set.
len(y_lbl)

5381

In [31]:
# Class names in one-hot column order (taken from week1's labels); position i
# here is class index i in the confusion matrices.
ohe_columns

Index(['---', 'bruteForce', 'dos', 'pingScan', 'portScan'], dtype='object')

# Test the global model with the training data

In [32]:
# Stack both clients' training rows (features and labels in the same order).
train_x = np.concatenate((x_train_week1, x_train_week2), axis=0)
train_y = np.concatenate((y_train_week1, y_train_week2), axis=0)

In [33]:
# Predicted class probabilities for the training rows.
# NOTE: overwrites the test-set `pred_y` computed earlier.
pred_y = model.predict(train_x)

In [34]:
# Class indices for the training rows.
# NOTE: rebinds `y_lbl` / `pred_y_lbl`, replacing the test-set values.
y_lbl = train_y.argmax(axis=1)
pred_y_lbl = pred_y.argmax(axis=1)

In [35]:
# 5x5 confusion matrix on the training data: rows are the true classes,
# columns the predicted classes, indexed in `ohe_columns` order.
tf.math.confusion_matrix(labels=y_lbl, predictions=pred_y_lbl, num_classes=5)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[5204,    8,   82,    1,   76],
       [ 271,   27,   30,    0, 2361],
       [   2,    0, 5393,    0,    0],
       [  71,    0,    4,    2, 2577],
       [  98,    0,    0,    0, 5314]])>

In [36]:
# Number of rows in the combined training set.
len(y_lbl)

21521