In [None]:
# Move three directories up from the notebook's location -- presumably to the
# project root, so that the `utils` imports and the relative `saved_dfs/...`
# paths used further down resolve.
# NOTE: the path is relative, so this cell is not idempotent: running it again
# without restarting the kernel moves up three more directories.
%cd ../../..

# The big test v2: remove all but 25% of the `pingScan` flows from week1 and all but 25% of the `portScan` flows from week2

In [1]:
# from IPython.display import clear_output

# tensorflow_federated_nightly also brings in tf_nightly, which
# can cause a duplicate tensorboard install, leading to errors.
# !pip uninstall --yes tensorboard tb-nightly

# !pip install --upgrade tensorflow-federated-nightly
# !pip install --upgrade nest-asyncio
# !pip install --upgrade tb-nightly

# Patch asyncio so that an already-running event loop can be re-entered.
# Presumably needed because the notebook kernel already runs an event loop
# that TensorFlow Federated also has to use -- confirm against the TFF docs.
import nest_asyncio
nest_asyncio.apply()

In [2]:
# TensorFlow and tf.keras
import tensorflow as tf
import tensorflow_federated as tff

# Helper libraries
import numpy as np
import pandas as pd
import collections

from utils import cidds_001 as utils
from utils.tff_test import TffClientDataProvider

# Show the TensorFlow version in use so the recorded outputs can be tied to it.
print(tf.__version__)

2.2.0


# Prepare the datasets

## Load and shuffle datasets

In [3]:
def _shuffle_rows(frame):
    """Return `frame` with its rows in a seeded random order and a fresh 0..n-1 index."""
    permuted = frame.sample(frac=1, random_state=13)
    return permuted.reset_index(drop=True)


# Week 1: read the cleaned flows, then shuffle them reproducibly.
week1 = pd.read_feather('saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week1-cleaned.feather')
week1_shuffled = _shuffle_rows(week1)

# Week 2: same treatment, same seed.
week2 = pd.read_feather('saved_dfs/cidds-001/traffic/OpenStack/CIDDS-001-internal-week2-cleaned.feather')
week2_shuffled = _shuffle_rows(week2)

## Obtain a ClientData object

In [None]:
# Build the provider that preprocesses both weeks for the federated clients.
# Going by the argument names and the notebook title, this should keep only
# 25% (alpha_target*) of the `pingScan` flows in week1 and of the `portScan`
# flows in week2 -- TODO confirm against TffClientDataProvider.
# NOTE(review): the log output and the class counts saved with this notebook
# show `bruteForce` (week1) and `pingScan` (week2) being reduced instead, so
# either the saved outputs are stale or the drop targets are not applied as
# written here. Restart the kernel and re-run to confirm.
client_data_provider = TffClientDataProvider(
    week1=week1_shuffled,
    week2=week2_shuffled,
    drop_target1='pingScan',
    drop_target2='portScan',
    alpha_target1=0.25,
    alpha_target2=0.25,
    normalization_fn=utils.robust_scaling,
    random_state=13
)

Start preprocessing datasets week1 and week2
0.0s: Normalizing week1 and week2
8.02s: Removing 75.0% of bruteForce flows from week1
49.11s: Removing 75.0% of pingScan flows from week2
101.55s: Creating balanced dataset of week1, ignoring the removed attack type
142.2s: Creating balanced dataset of week2, ignoring the removed attack type
190.16s: Separate week1 features from dataset labels and one hot encode the labels
190.19s: Separate week2 features from dataset labels and one hot encode the labels
190.2s: Split datasets in training and testing datasets
190.22s: Convert features and labels to numpy arrays
190.22s: Finished preprocessing datasets week1 and week2


In [None]:
# Obtain the object that is later queried for the client ids and for one
# tf.data dataset per client.
client_data = client_data_provider.make_client_data()

## Confirm having created balanced datasets

In [None]:
# Number of rows per attack_type in the balanced week1 dataset.
(
    client_data_provider.week1_balanced
    .groupby(by='attack_type')
    .size()
    .reset_index(name='count')
)

Unnamed: 0,attack_type,count
0,---,3359
1,bruteForce,406
2,dos,3359
3,pingScan,3359
4,portScan,3359


In [None]:
# Number of rows per attack_type in the balanced week2 dataset.
(
    client_data_provider.week2_balanced
    .groupby(by='attack_type')
    .size()
    .reset_index(name='count')
)

Unnamed: 0,attack_type,count
0,---,3366
1,bruteForce,3366
2,dos,3366
3,pingScan,682
4,portScan,3366


# Preparation for the federated part

## Create the `federated_train_data`

In [None]:
# Settings of the client-side input pipeline built by `preprocess`.
NUM_EPOCHS = 5        # how many times each client dataset is repeated
BATCH_SIZE = 20       # number of examples per batch
SHUFFLE_BUFFER = 100  # size of the shuffle buffer
PREFETCH_BUFFER = 10  # number of batches to prefetch


def preprocess(dataset):
    """Turn one client's dataset into the batched stream used for training.

    The dataset is repeated NUM_EPOCHS times, shuffled with a buffer of
    SHUFFLE_BUFFER elements, grouped into batches of BATCH_SIZE, converted
    into ``OrderedDict(x=..., y=...)`` batches and finally prefetched.

    Args:
        dataset: dataset whose elements are (features, labels) pairs.

    Returns:
        The transformed dataset.
    """
    def to_ordered_dict(features, labels):
        # Features are stored under `x`, labels under `y`.
        return collections.OrderedDict([('x', features), ('y', labels)])

    repeated = dataset.repeat(NUM_EPOCHS)
    shuffled = repeated.shuffle(SHUFFLE_BUFFER)
    batched = shuffled.batch(BATCH_SIZE)
    formatted = batched.map(to_ordered_dict)
    return formatted.prefetch(PREFETCH_BUFFER)

In [None]:
def make_federated_data(client_data, client_ids):
    """Build one preprocessed dataset per requested client.

    Args:
        client_data: object exposing ``create_tf_dataset_for_client(client_id)``.
        client_ids: iterable of the client ids to include.

    Returns:
        A list holding one ``preprocess``-ed dataset per id, in the order of
        ``client_ids``.
    """
    datasets = []
    for client_id in client_ids:
        raw_dataset = client_data.create_tf_dataset_for_client(client_id)
        datasets.append(preprocess(raw_dataset))
    return datasets

In [None]:
# Train on every client known to the ClientData object: one preprocessed
# dataset per client id.
client_ids = client_data.client_ids
federated_train_data = make_federated_data(client_data, client_ids)

## Obtain the element_spec of the input that the federated model will receive

In [None]:
# Use the first client's preprocessed dataset as the example from which the
# model's input spec is read (all clients go through the same `preprocess`
# pipeline, so their element specs are presumably identical).
preprocessed_example_data = federated_train_data[0]
tff_input_element_spec = preprocessed_example_data.element_spec

## Create functions to create the TFF model

In [None]:
def create_keras_model():
    """Build a fresh, uncompiled Keras classifier.

    Architecture: 16 input features -> Dense(100, relu) -> Dropout(0.2)
    -> Dense(100, relu) -> Dropout(0.2) -> Dense(5, softmax).

    NOTE(review): the type signature output saved in this notebook lists the
    first kernel as float32[17,100], i.e. 17 input features rather than the
    16 declared here -- the saved output may be stale; confirm the feature
    count of the prepared data.
    """
    keras_layers = tf.keras.layers

    stack = [keras_layers.Input(shape=(16,))]
    # Two identical hidden blocks: fully connected layer followed by dropout.
    for _ in range(2):
        stack.append(keras_layers.Dense(100, activation='relu'))
        stack.append(keras_layers.Dropout(rate=0.2))
    # Output layer: one softmax unit per class.
    stack.append(keras_layers.Dense(5, activation='softmax'))

    return tf.keras.Sequential(stack)

In [None]:
def model_fn():
    """Create a TFF model around a newly built Keras model.

    Every call constructs a new Keras model, a new loss object and new metric
    objects; nothing is shared between calls.

    Returns:
        The result of ``tff.learning.from_keras_model`` for the new Keras
        model, using ``tff_input_element_spec`` as input spec.
    """
    wrapped_model = create_keras_model()
    loss_fn = tf.keras.losses.CategoricalCrossentropy()
    tracked_metrics = [
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.CategoricalCrossentropy(),
    ]
    return tff.learning.from_keras_model(
        wrapped_model,
        input_spec=tff_input_element_spec,
        loss=loss_fn,
        metrics=tracked_metrics
    )

## Train the model on federated data

In [None]:
# Build the Federated Averaging process. Both optimizers are passed as
# zero-argument factories: SGD with learning rate 0.02 on the clients and
# SGD with learning rate 1.0 on the server.
iterative_process = tff.learning.build_federated_averaging_process(
    model_fn=model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02),
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0)
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [None]:
# Inspect the type signature of the process's `initialize` computation.
str(iterative_process.initialize.type_signature)

'( -> <model=<trainable=<float32[17,100],float32[100],float32[100,100],float32[100],float32[100,5],float32[5]>,non_trainable=<>>,optimizer_state=<int64>,delta_aggregate_state=<value_sum_process=<>,weight_sum_process=<>>,model_broadcast_state=<>>@SERVER)'

In [None]:
# Create the initial state that the training rounds below start from.
state = iterative_process.initialize()

In [None]:
# Number of federated training rounds to run.
NUM_ROUNDS = 20

for round_num in range(NUM_ROUNDS):
    # Each round consumes the current state and returns the next one, so
    # re-running this cell continues training from the current `state`
    # instead of starting over; re-run `initialize()` first for a fresh run.
    state, metrics = iterative_process.next(state, federated_train_data)
    # Rounds are reported 1-based.
    print('round {:2d}, metrics={}'.format(round_num + 1, metrics))

round  1, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.83152443), ('categorical_crossentropy', 0.49519578), ('loss', 0.49519578)]))])
round  2, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.87262493), ('categorical_crossentropy', 0.35491952), ('loss', 0.35491952)]))])
round  3, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.88528293), ('categorical_crossentropy', 0.32444102), ('loss', 0.32444102)]))])
round  4, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('categorical_accuracy', 0.89334047), ('ca

# Test the global model

In [None]:
# Build a plain Keras model with the same architecture and copy the weights
# of the trained federated state into it, so it can be used for prediction.
model = create_keras_model()
state.model.assign_weights_to(model)

## Concatenate test sets from week1 and week2 to obtain a bigger test set

In [None]:
# Append the week2 test split to the week1 test split. Features and labels
# are joined in the same order, so rows stay aligned.
test_x = np.concatenate(
    (client_data_provider.x_test_week1, client_data_provider.x_test_week2),
    axis=0,
)
test_y = np.concatenate(
    (client_data_provider.y_test_week1, client_data_provider.y_test_week2),
    axis=0,
)

## Assign the federated trained weights to a model that can be used

## Predict the test set and create a confusion matrix

In [None]:
# Model output (softmax scores per class) for every row of the combined test set.
pred_y = model.predict(test_x)

In [None]:
# Reduce true labels and predictions to one class index per row
# (position of the largest value along the class axis).
y_lbl = np.argmax(test_y, axis=1)
pred_y_lbl = np.argmax(pred_y, axis=1)

In [None]:
# Confusion matrix on the test set: rows are the true classes, columns the
# predicted classes (per the tf.math.confusion_matrix documentation).
tf.math.confusion_matrix(labels=y_lbl, predictions=pred_y_lbl, num_classes=5)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[1296,   29,   24,    0,    6],
       [  47,  290,    7,    0,  416],
       [   0,    0, 1344,    0,    0],
       [  23,   14,    0,  809,    2],
       [  15,   31,    0,   41, 1205]])>

In [None]:
# Number of rows in the combined test set.
len(y_lbl)

5599

In [None]:
# Names of the one-hot encoded label columns; presumably their order is the
# class-index order used by the confusion matrices -- TODO confirm.
client_data_provider.ohe_columns

Index(['---', 'bruteForce', 'dos', 'pingScan', 'portScan'], dtype='object')

# Test the global model with the training data

In [None]:
# Append the week2 training split to the week1 training split. Features and
# labels are joined in the same order, so rows stay aligned.
train_x = np.concatenate(
    (client_data_provider.x_train_week1, client_data_provider.x_train_week2),
    axis=0,
)
train_y = np.concatenate(
    (client_data_provider.y_train_week1, client_data_provider.y_train_week2),
    axis=0,
)

In [None]:
# Model output for every row of the combined training set.
# NOTE: this reuses the name `pred_y`, overwriting the test-set predictions
# computed earlier in the notebook.
pred_y = model.predict(train_x)

In [None]:
# Reduce true labels and predictions to one class index per row.
# NOTE: `y_lbl` and `pred_y_lbl` now refer to the training data, replacing
# the test-set values assigned earlier in the notebook.
y_lbl = np.argmax(train_y, axis=1)
pred_y_lbl = np.argmax(pred_y, axis=1)

In [None]:
# Confusion matrix on the training data: rows are the true classes, columns
# the predicted classes (per the tf.math.confusion_matrix documentation).
tf.math.confusion_matrix(labels=y_lbl, predictions=pred_y_lbl, num_classes=5)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[5114,  142,   90,    0,   24],
       [ 190, 1047,   54,    0, 1721],
       [   1,    1, 5379,    0,    0],
       [  84,   49,    4, 3047,    9],
       [  67,  123,    0,  185, 5058]])>

In [None]:
# Number of rows in the combined training set.
len(y_lbl)

22389