In [1]:
# Change the working directory three levels up; presumably this is the project
# root that the `utils` imports and the relative `saved_dfs/...` paths below
# rely on — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell in the same
# kernel moves up another three levels. Restart the kernel before re-running.
%cd ../../..

c:\Users\bram_\home\msc


# The big test v2: remove all but 25% of the `bruteForce` flows from week1 and all but 25% of the `dos` flows from week2

In [2]:
# from IPython.display import clear_output

# tensorflow_federated_nightly also brings in tf_nightly, which
# can cause a duplicate tensorboard install, leading to errors.
# !pip uninstall --yes tensorboard tb-nightly

# !pip install --upgrade tensorflow-federated-nightly
# !pip install --upgrade nest-asyncio
# !pip install --upgrade tb-nightly

# Apply the nest_asyncio patch before any TFF computation is executed.
# NOTE(review): presumably required so that TFF can run its event loop inside
# the loop Jupyter already has running — confirm against the TFF setup notes.
import nest_asyncio
nest_asyncio.apply()

In [3]:
# TensorFlow and tf.keras
import tensorflow as tf
import tensorflow_federated as tff

# Helper libraries
import numpy as np
import pandas as pd
import collections

from utils import cidds_001 as utils
from utils.tff_test import TffClientDataProvider

# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.2.0


# Prepare the datasets

## Load and shuffle datasets

In [4]:
# Week 1: read the cleaned flow records, then permute every row (frac=1)
# with a fixed seed and renumber the index from 0.
week1 = pd.read_feather(
    'saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week1-cleaned.feather'
)
week1_shuffled = week1.sample(frac=1, random_state=13)
week1_shuffled = week1_shuffled.reset_index(drop=True)

# Week 2: identical treatment, same seed.
week2 = pd.read_feather(
    'saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week2-cleaned.feather'
)
week2_shuffled = week2.sample(frac=1, random_state=13)
week2_shuffled = week2_shuffled.reset_index(drop=True)

## Obtain a ClientData object

In [5]:
# Build the provider that preprocesses both weeks. Per the progress log it
# prints, it normalizes, balances, thins out the target classes, one-hot
# encodes the labels and splits each week into train/test sets.
# The recorded run took roughly 110 s, most of it in the two balancing steps.
client_data_provider = TffClientDataProvider(
    week1=week1_shuffled,
    week2=week2_shuffled,
    drop_target1='bruteForce',  # class thinned out in week1
    drop_target2='dos',  # class thinned out in week2
    alpha_target1=0.25,  # fraction of bruteForce flows kept (log: "Removing 75.0%")
    alpha_target2=0.25,  # fraction of dos flows kept (log: "Removing 75.0%")
    normalization_fn=utils.robust_scaling,
    random_state=13  # presumably seeds the provider's internal sampling — TODO confirm
)

Start preprocessing datasets week1 and week2
0.0s: Normalizing week1 and week2
10.11s: Creating balanced dataset of week1
57.81s: Creating balanced dataset of week2
110.51s: Removing 75.0% of bruteForce flows from week1
110.58s: Removing 75.0% of dos flows from week2
110.65s: Separate week1 features from dataset labels and one hot encode the labels
110.66s: Separate week2 features from dataset labels and one hot encode the labels
110.67s: Split datasets in training and testing datasets
110.68s: Convert features and labels to numpy arrays
110.69s: Finished preprocessing datasets week1 and week2


In [6]:
# Obtain the client-data object; later cells read its `client_ids` and call
# `create_tf_dataset_for_client` on it.
client_data = client_data_provider.make_client_data()

## Confirm that the balanced datasets were created

In [7]:
# Number of flows per attack_type in the balanced week1 dataset, shown as a
# two-column table (attack_type, count).
(
    client_data_provider.week1_balanced
    .groupby(by='attack_type')
    .size()
    .reset_index(name='count')
)

Unnamed: 0,attack_type,count
0,---,3359
1,bruteForce,406
2,dos,3359
3,pingScan,3359
4,portScan,3359


In [8]:
# Number of flows per attack_type in the balanced week2 dataset, shown as a
# two-column table (attack_type, count).
(
    client_data_provider.week2_balanced
    .groupby(by='attack_type')
    .size()
    .reset_index(name='count')
)

Unnamed: 0,attack_type,count
0,---,3366
1,bruteForce,3366
2,dos,841
3,pingScan,2731
4,portScan,3366


# Preparation for the federated part

## Create the `federated_train_data`

In [9]:
# Input-pipeline settings shared by every client dataset.
NUM_EPOCHS = 5        # how many times each client dataset is repeated
BATCH_SIZE = 20       # elements per batch
SHUFFLE_BUFFER = 100  # size of the shuffle buffer
PREFETCH_BUFFER = 10  # number of batches to prefetch


def preprocess(dataset):
    """Turn a client dataset of (x, y) pairs into the batched form used for training.

    The dataset is repeated NUM_EPOCHS times, shuffled with a buffer of
    SHUFFLE_BUFFER elements, grouped into batches of BATCH_SIZE, re-keyed
    into an OrderedDict with entries 'x' and 'y' (in that order), and
    finally prefetched with a buffer of PREFETCH_BUFFER.

    Args:
        dataset: Object exposing repeat/shuffle/batch/map/prefetch
            (presumably a tf.data.Dataset) whose elements are (x, y) pairs.

    Returns:
        The transformed dataset.
    """
    def as_ordered_example(x, y):
        # Key order matters for the resulting element spec: x first, then y.
        return collections.OrderedDict([('x', x), ('y', y)])

    repeated = dataset.repeat(NUM_EPOCHS)
    shuffled = repeated.shuffle(SHUFFLE_BUFFER)
    batched = shuffled.batch(BATCH_SIZE)
    keyed = batched.map(as_ordered_example)
    return keyed.prefetch(PREFETCH_BUFFER)

In [10]:
def make_federated_data(client_data, client_ids):
    """Build one preprocessed dataset per client.

    Args:
        client_data: Object providing `create_tf_dataset_for_client(client_id)`.
        client_ids: Iterable of client identifiers to include.

    Returns:
        A list with, for each id in `client_ids` (same order), the result of
        `preprocess` applied to that client's dataset.
    """
    federated_datasets = []
    for client_id in client_ids:
        client_dataset = client_data.create_tf_dataset_for_client(client_id)
        federated_datasets.append(preprocess(client_dataset))
    return federated_datasets

In [11]:
# Use every client known to `client_data` and build its preprocessed
# training dataset; the resulting list is what each training round consumes.
client_ids = client_data.client_ids
federated_train_data = make_federated_data(client_data, client_ids)

## Obtain the element_spec of the input that the federated model will receive

In [12]:
# Take the first client's preprocessed dataset as a representative example and
# read its element_spec; `model_fn` passes this as the model's input_spec.
preprocessed_example_data = federated_train_data[0]
tff_input_element_spec = preprocessed_example_data.element_spec

## Create functions to create the TFF model

In [13]:
def create_keras_model(num_features=16, num_classes=5):
    """Build the uncompiled feed-forward classifier used in this notebook.

    Architecture: input -> Dense(100, relu) -> Dropout(0.2)
    -> Dense(100, relu) -> Dropout(0.2) -> Dense(num_classes, softmax).

    Args:
        num_features: Length of the input feature vector. Defaults to 16,
            the value that was previously hard-coded.
        num_classes: Number of softmax output units. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        A new `tf.keras.Sequential` instance; every call creates fresh layers
        and therefore fresh weights.
    """
    return tf.keras.Sequential([
        tf.keras.layers.Input(shape=(num_features,)),
        tf.keras.layers.Dense(100, activation='relu'),
        tf.keras.layers.Dropout(rate=0.2),
        tf.keras.layers.Dense(100, activation='relu'),
        tf.keras.layers.Dropout(rate=0.2),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

In [14]:
def model_fn():
    """Create a fresh TFF model wrapping a newly built Keras model.

    A new Keras model, loss object and metric objects are constructed on
    every call; nothing is shared between invocations.

    Returns:
        The result of `tff.learning.from_keras_model` for the new model,
        using `tff_input_element_spec` as the input spec, categorical
        cross-entropy as the loss, and categorical accuracy plus categorical
        cross-entropy as metrics.
    """
    # Build the model first, then the loss and metrics (same order as before).
    fresh_model = create_keras_model()
    loss_fn = tf.keras.losses.CategoricalCrossentropy()
    tracked_metrics = [
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.CategoricalCrossentropy(),
    ]
    return tff.learning.from_keras_model(
        fresh_model,
        input_spec=tff_input_element_spec,
        loss=loss_fn,
        metrics=tracked_metrics,
    )

## Train the model on federated data

In [15]:
# Build the federated averaging process. Both optimizers are passed as
# zero-argument factories (lambdas), so a new optimizer instance is created
# each time a factory is called.
# Client side: SGD with learning rate 0.02; server side: SGD with learning rate 1.0.
iterative_process = tff.learning.build_federated_averaging_process(
    model_fn=model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02),
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0)
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [16]:
# Show the type signature of `initialize`; the recorded output lists the
# trainable weight shapes (16x100, 100x100, 100x5 plus biases) placed @SERVER.
str(iterative_process.initialize.type_signature)

'( -> <model=<trainable=<float32[16,100],float32[100],float32[100,100],float32[100],float32[100,5],float32[5]>,non_trainable=<>>,optimizer_state=<int64>,delta_aggregate_state=<value_sum_process=<>,weight_sum_process=<>>,model_broadcast_state=<>>@SERVER)'

In [17]:
# Create the initial server state. Re-run this cell to reset training before
# re-running the training loop below.
state = iterative_process.initialize()

In [18]:
# Number of federated rounds to run.
NUM_ROUNDS = 20

# NOTE: `state` is read and reassigned on every iteration, so re-running this
# cell continues from the current state instead of starting from scratch.
for round_num in range(NUM_ROUNDS):
    # Every round uses the datasets of all clients in `federated_train_data`.
    state, metrics = iterative_process.next(state, federated_train_data)
    # Rounds are reported 1-based.
    print('round {:2d}, metrics={}'.format(round_num + 1, metrics))

round  1, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.7971466), ('categorical_crossentropy', 0.72074634), ('loss', 0.6393766)]))])
round  2, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.84145576), ('categorical_crossentropy', 0.522783), ('loss', 0.46144953)]))])
round  3, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.865555), ('categorical_crossentropy', 0.37307313), ('loss', 0.37161225)]))])
round  4, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.88172114), ('categori

# Test the global model

In [19]:
# Build a fresh Keras model with the same architecture as the federated one
# and copy the trained global weights from the server state into it.
# The model is not compiled here; it is only used for `predict` below.
model = create_keras_model()
state.model.assign_weights_to(model)

## Concatenate test sets from week1 and week2 to obtain a bigger test set

In [20]:
# Stack the week1 and week2 test splits along the first axis (week1 rows
# first), keeping features and labels aligned.
test_x = np.concatenate(
    (client_data_provider.x_test_week1, client_data_provider.x_test_week2),
    axis=0,
)
test_y = np.concatenate(
    (client_data_provider.y_test_week1, client_data_provider.y_test_week2),
    axis=0,
)

## Assign the federated trained weights to a model that can be used

## Predict the test set and create a confusion matrix

In [21]:
# Predict the combined test set; the model ends in a 5-unit softmax layer, so
# each row of `pred_y` holds 5 class scores.
pred_y = model.predict(test_x)

In [22]:
# Collapse the one-hot test labels and the predicted class scores into class
# indices by taking the position of the largest entry in each row.
y_lbl = test_y.argmax(axis=1)
pred_y_lbl = pred_y.argmax(axis=1)

In [23]:
# Confusion matrix on the test set over the 5 output classes.
# Per the tf.math.confusion_matrix docs, rows are true labels and columns are predictions.
# NOTE(review): class index i presumably corresponds to
# client_data_provider.ohe_columns[i] — confirm against the provider.
tf.math.confusion_matrix(labels=y_lbl, predictions=pred_y_lbl, num_classes=5)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[1277,    7,    1,   37,    2],
       [  35,  158,    0,  126,  427],
       [   0,    0,  840,    0,    0],
       [  38,    4,    0, 1152,    5],
       [  31,   30,    0,   56, 1277]])>

In [24]:
# Number of samples in the combined test set.
len(y_lbl)

5503

In [25]:
# Show the one-hot encoded label columns kept by the provider.
client_data_provider.ohe_columns

Index(['---', 'bruteForce', 'dos', 'pingScan', 'portScan'], dtype='object')

# Test the global model with the training data

In [26]:
# Stack the week1 and week2 training splits along the first axis (week1 rows
# first), keeping features and labels aligned.
train_x = np.concatenate(
    (client_data_provider.x_train_week1, client_data_provider.x_train_week2),
    axis=0,
)
train_y = np.concatenate(
    (client_data_provider.y_train_week1, client_data_provider.y_train_week2),
    axis=0,
)

In [27]:
# Predict the combined training set.
# NOTE(review): this overwrites `pred_y` from the test-set evaluation above;
# re-running the earlier test-set argmax/confusion-matrix cells after this one
# would pair test labels with training predictions.
pred_y = model.predict(train_x)

In [28]:
# Collapse the one-hot training labels and the predicted class scores into
# class indices. Both names are reused from the test-set evaluation and are
# overwritten here.
y_lbl = train_y.argmax(axis=1)
pred_y_lbl = pred_y.argmax(axis=1)

In [29]:
# Confusion matrix on the training set over the 5 output classes.
# Per the tf.math.confusion_matrix docs, rows are true labels and columns are predictions.
tf.math.confusion_matrix(labels=y_lbl, predictions=pred_y_lbl, num_classes=5)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[5165,   46,    2,  171,   17],
       [ 157,  565,    0,  569, 1735],
       [   7,    0, 3352,    1,    0],
       [ 180,   26,    0, 4668,   17],
       [ 133,  169,    0,  251, 4778]])>

In [30]:
# Number of samples in the combined training set.
len(y_lbl)

22009