# Sequence Aggregation Rules for Anomaly Detection in Computer Network Traffic
## Replication File 1 of 3

Benjamin J. Radford, Bartley D. Richardson, and Shawn E. Davis

Paper available: [arXiv:1805.03735v2](https://arxiv.org/abs/1805.03735).

DISTRIBUTION STATEMENT A: Approved for public release. 

This research was developed with funding from the Defense Advanced Research Projects Agency (DARPA). The views, opinions and/or findings expressed are those of the authors and should not be interpreted as representing the official views or policies of the Department of Defense or the U.S. Government.

In [1]:
## Import dependencies
import numpy as np
import pickle
import pandas
import re
import glob
import datetime
import tensorflow as tf
import itertools
import math
import random
from gensim.models.word2vec import Word2Vec
from collections import Counter
from sklearn.metrics import log_loss, auc, roc_curve
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras import backend as K
from keras.layers import *
from keras.engine.topology import Input
from keras.models import Model, Sequential
from keras.utils import np_utils, to_categorical
from keras.optimizers import TFOptimizer, RMSprop

## Set random seeds for reproducibility
np.random.seed(123)
random.seed(123)

Using TensorFlow backend.


In [2]:
##
## Set modeling parameters
##

seq_len = 10
seq_skip = 3

w2v_size = 25
w2v_min_count = 3
w2v_window = 10
w2v_workers = 4

embedding_a_size = 100
lstm_a_size = 25
lstm_b_size = 25
dense_size = 100

validation_split = 0.1
batch_size = 2048
epochs = 10

cicids_training = datetime.datetime.strptime("2017-07-04 00:00:00", "%Y-%m-%d %H:%M:%S")

num_models = 3

In [3]:
##
## Read in CICIDS2017 Dataset
##

## Set a working directory and point to your data
wd = "./"
cicids_files = "data/CICIDS2017/*.csv"

## Load the data
print("Reading CICIDS2017 data...")
files = glob.glob(wd+cicids_files)
cicids_data = []
for ff in files:
    cicids_data.append(pandas.read_csv(ff, encoding="Latin1"))
cicids_data = pandas.concat(cicids_data)

## Set CICIDS2017 internal IPs in case they're required later
cicids_internal = set(["192.168.10.50", "205.174.165.68",
            "192.168.10.51", "205.174.165.66", "192.168.10.19",
            "192.168.10.17", "192.168.10.16",
            "192.168.10.12", "192.168.10.9",
            "192.168.10.5", "192.168.10.8",
            "192.168.10.14", "192.168.10.15",
            "192.168.10.25", "205.174.165.80", 
            "172.16.0.1","192.168.10.3"])

Reading CICIDS2017 data...


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


In [4]:
##
## Define functions required to clean CICIDS2017
##

def cicids_fixdate(x):
    """Fix dates - This is specific to the CICIDS2017 dataset."""
    try:
        d = datetime.datetime.strptime(x,"%d/%m/%Y %H:%M:%S")
    except:
        d = datetime.datetime.strptime(x,"%d/%m/%Y %H:%M")
    return(d)

def cicids_fixattacknames(x):
    """Fix attack names - This is specific to CICIDS2017 dataset."""
    return(re.sub('[^0-9a-zA-Z ]+', '', x))

In [5]:
##
## Clean CICIDS2017
## 

print("Fixing column names...")
cicids_data.columns = [a.lstrip() for a in cicids_data.columns]
cicids_data = cicids_data[["Source Port","Destination Port","Source IP","Destination IP","Label","Timestamp","Protocol","Average Packet Size","Total Fwd Packets","Total Backward Packets"]]
print(cicids_data.Label.value_counts())

print("Dropping incomplete records...")
cicids_data = cicids_data.dropna(axis=0,how="all")

print("Fixing the timestamps...")
cicids_data["Timestamp"] = cicids_data["Timestamp"].apply(lambda x: cicids_fixdate(x))

print("Fixing attack names...")
cicids_data["Label"] = cicids_data["Label"].apply(lambda x: cicids_fixattacknames(x))

print("Generating floor(log2(bytes)) feature...")
cicids_data["logbytes"] = cicids_data[["Average Packet Size","Total Fwd Packets","Total Backward Packets"]].apply(lambda x: math.floor(math.log(float(x[0] * (x[1]+x[2]))+1.,2)),axis=1)

print("Generate unique index values...")
cicids_data["index"] = list(range(0,cicids_data.shape[0]))

print("Removing entries without an internal machine...")
print("Before remove ext-ext: {}".format(str(cicids_data.shape[0])))
cicids_data = cicids_data.loc[(cicids_data["Source IP"].isin(cicids_internal) | cicids_data["Destination IP"].isin(cicids_internal))]
print("After remove ext-ext: {}".format(str(cicids_data.shape[0])))

print("Generate internal and external host fields...")
cicids_data["internal"] = cicids_data[["Source IP","Destination IP"]].apply(lambda x: x[0] if x[0] in cicids_internal else x[1], axis=1)
cicids_data["external"] = cicids_data[["Source IP","Destination IP"]].apply(lambda x: x[1] if x[0] in cicids_internal else x[0], axis=1)

print("Generating dyads...")
cicids_data["dyad"] = cicids_data[["Source IP","Destination IP"]].apply(lambda x: x[0]+":"+x[1], axis=1)

print("Generating hour bins...")
cicids_data["hour"] = cicids_data["Timestamp"].apply(lambda x: str(x)[0:13])

print("Generating protocol:port:floor(log(avg(bytes/packet))) feature (a.k.a. 'token_source' and 'token_destination')...")
cicids_data["protobytes"] = cicids_data[["Protocol","logbytes"]].apply(lambda x: str(x[0])+":"+str(x[1]), axis=1)
cicids_data["port"] = cicids_data[["Source Port","Destination Port"]].apply(lambda x: "port:"+str(min(min(x[0],10000),min(x[1],10000))), axis=1)
cicids_data["label"] = cicids_data["Label"].apply(lambda x: x)
cicids_data["time"] = cicids_data["Timestamp"].apply(lambda x: str(x))

Fixing column names...
BENIGN    713828
DDoS       41835
Name: Label, dtype: int64
Dropping incomplete records...
Fixing the timestamps...
Fixing attack names...
Generating floor(log2(bytes)) feature...
Generate unique index values...
Removing entries without an internal machine...
Before remove ext-ext: 755663
After remove ext-ext: 755384
Generate internal and external host fields...
Generating dyads...
Generating hour bins...
Generating protocol:port:floor(log(avg(bytes/packet))) feature (a.k.a. 'token_source' and 'token_destination')...


In [6]:
print("Generate naive token frequency models...")
protobytes_train = cicids_data[pandas.to_datetime(cicids_data["time"]) < cicids_training].protobytes.tolist()
protobytes_test = cicids_data[pandas.to_datetime(cicids_data["time"]) >= cicids_training].protobytes.tolist()
ports_train = cicids_data[pandas.to_datetime(cicids_data["time"]) < cicids_training].port.tolist()
ports_test = cicids_data[pandas.to_datetime(cicids_data["time"]) >= cicids_training].port.tolist()
labels = cicids_data[pandas.to_datetime(cicids_data["time"]) >= cicids_training].label.tolist()

Generate naive token frequency models...


In [7]:
protobytes_freq = Counter(protobytes_train)
ports_freq = Counter(ports_train)
protobytes_sum = float(sum([a for b,a in protobytes_freq.iteritems()]))
ports_sum = float(sum([a for b,a in ports_freq.iteritems()]))
protobytes_prob = {w:(f/protobytes_sum) for w,f in protobytes_freq.iteritems()}
ports_prob = {w:(f/ports_sum) for w,f in ports_freq.iteritems()}
protobytes_scores = np.asarray([protobytes_prob[a] if a in protobytes_prob else 0. for a in protobytes_test])
ports_scores = np.asarray([ports_prob[a] if a in ports_prob else 0. for a in ports_test])
pickle.dump(np.array(labels),open("results/freq_labels.pickle","wb"))
pickle.dump(ports_scores,open("results/freq_ports.pickle","wb"))
pickle.dump(protobytes_scores,open("results/freq_protobytes.pickle","wb"))

In [8]:
print("Generating forward and reverse dictionaries...")
protobytes_fwd_dict = {str(w):str(i+1) for i, w in enumerate(list(set(cicids_data["protobytes"].unique().tolist())))}
protobytes_fwd_dict["0"] = 0
protobytes_rev_dict = {i:w for w,i in protobytes_fwd_dict.iteritems()}
port_fwd_dict = {str(w):str(i+1) for i, w in enumerate(list(set(cicids_data["port"].unique().tolist())))}
port_fwd_dict["0"] = 0
port_rev_dict = {i:w for w,i in protobytes_fwd_dict.iteritems()}

cicids_data = cicids_data.replace({"protobytes": protobytes_fwd_dict, "port": port_fwd_dict})

Generating forward and reverse dictionaries...


In [9]:
##
## Generate sequences from CICIDS2017
##

## Concat token strings
print("Concat token strings...")
cicids_source_hour = cicids_data.sort_values(["Timestamp"]).groupby(["Source IP","hour"])[["protobytes","port","label","time"]].agg(lambda x: ",".join(x))
cicids_destination_hour = cicids_data.sort_values(["Timestamp"]).groupby(["Destination IP","hour"])[["protobytes","port","label","time"]].agg(lambda x: ",".join(x))
cicids_dyad_hour = cicids_data.sort_values(["Timestamp"]).groupby(["dyad","hour"])[["protobytes","port","label","time"]].agg(lambda x: ",".join(x))
cicids_internal_hour = cicids_data.sort_values(["Timestamp"]).groupby(["internal","hour"])[["protobytes","port","label","time"]].agg(lambda x: ",".join(x))
cicids_external_hour = cicids_data.sort_values(["Timestamp"]).groupby(["external","hour"])[["protobytes","port","label","time"]].agg(lambda x: ",".join(x))

## Generate sequences
print("Generate all sequences...")
cicids_source_hour["protobytes_sequence"] = cicids_source_hour["protobytes"].apply(lambda x: x.split(","))
cicids_destination_hour["protobytes_sequence"] = cicids_destination_hour["protobytes"].apply(lambda x: x.split(","))
cicids_dyad_hour["protobytes_sequence"] = cicids_dyad_hour["protobytes"].apply(lambda x: x.split(","))
cicids_internal_hour["protobytes_sequence"] = cicids_internal_hour["protobytes"].apply(lambda x: x.split(","))
cicids_external_hour["protobytes_sequence"] = cicids_external_hour["protobytes"].apply(lambda x: x.split(","))

cicids_source_hour["port_sequence"] = cicids_source_hour["port"].apply(lambda x: x.split(","))
cicids_destination_hour["port_sequence"] = cicids_destination_hour["port"].apply(lambda x: x.split(","))
cicids_dyad_hour["port_sequence"] = cicids_dyad_hour["port"].apply(lambda x: x.split(","))
cicids_internal_hour["port_sequence"] = cicids_internal_hour["port"].apply(lambda x: x.split(","))
cicids_external_hour["port_sequence"] = cicids_external_hour["port"].apply(lambda x: x.split(","))

cicids_source_hour["label_sequence"] = cicids_source_hour["label"].apply(lambda x: x.split(","))
cicids_destination_hour["label_sequence"] = cicids_destination_hour["label"].apply(lambda x: x.split(","))
cicids_dyad_hour["label_sequence"] = cicids_dyad_hour["label"].apply(lambda x: x.split(","))
cicids_internal_hour["label_sequence"] = cicids_internal_hour["label"].apply(lambda x: x.split(","))
cicids_external_hour["label_sequence"] = cicids_external_hour["label"].apply(lambda x: x.split(","))

cicids_source_hour["time_sequence"] = cicids_source_hour["time"].apply(lambda x: x.split(","))
cicids_destination_hour["time_sequence"] = cicids_destination_hour["time"].apply(lambda x: x.split(","))
cicids_dyad_hour["time_sequence"] = cicids_dyad_hour["time"].apply(lambda x: x.split(","))
cicids_internal_hour["time_sequence"] = cicids_internal_hour["time"].apply(lambda x: x.split(","))
cicids_external_hour["time_sequence"] = cicids_external_hour["time"].apply(lambda x: x.split(","))

## Add training set indicator
print("Add training set indicator...")
cicids_source_hour["training"] = cicids_source_hour["time_sequence"].apply(lambda x: max([datetime.datetime.strptime(a, "%Y-%m-%d %H:%M:%S") for a in x]) < cicids_training)
cicids_destination_hour["training"] = cicids_destination_hour["time_sequence"].apply(lambda x: max([datetime.datetime.strptime(a, "%Y-%m-%d %H:%M:%S") for a in x]) < cicids_training)
cicids_dyad_hour["training"] = cicids_dyad_hour["time_sequence"].apply(lambda x: max([datetime.datetime.strptime(a, "%Y-%m-%d %H:%M:%S") for a in x]) < cicids_training)
cicids_internal_hour["training"] = cicids_internal_hour["time_sequence"].apply(lambda x: max([datetime.datetime.strptime(a, "%Y-%m-%d %H:%M:%S") for a in x]) < cicids_training)
cicids_external_hour["training"] = cicids_external_hour["time_sequence"].apply(lambda x: max([datetime.datetime.strptime(a, "%Y-%m-%d %H:%M:%S") for a in x]) < cicids_training)

## Prepend sequences to correct length
print("Prepend sequences with zero values...")
cicids_source_hour["port_sequence"] = cicids_source_hour["port_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_destination_hour["port_sequence"] = cicids_destination_hour["port_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_dyad_hour["port_sequence"] = cicids_dyad_hour["port_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_internal_hour["port_sequence"] = cicids_internal_hour["port_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_external_hour["port_sequence"] = cicids_external_hour["port_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)

cicids_source_hour["protobytes_sequence"] = cicids_source_hour["protobytes_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_destination_hour["protobytes_sequence"] = cicids_destination_hour["protobytes_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_dyad_hour["protobytes_sequence"] = cicids_dyad_hour["protobytes_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_internal_hour["protobytes_sequence"] = cicids_internal_hour["protobytes_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_external_hour["protobytes_sequence"] = cicids_external_hour["protobytes_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)

cicids_source_hour["label_sequence"] = cicids_source_hour["label_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_destination_hour["label_sequence"] = cicids_destination_hour["label_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_dyad_hour["label_sequence"] = cicids_dyad_hour["label_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_internal_hour["label_sequence"] = cicids_internal_hour["label_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_external_hour["label_sequence"] = cicids_external_hour["label_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)

cicids_source_hour["time_sequence"] = cicids_source_hour["time_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_destination_hour["time_sequence"] = cicids_destination_hour["time_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_dyad_hour["time_sequence"] = cicids_dyad_hour["time_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_internal_hour["time_sequence"] = cicids_internal_hour["time_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)
cicids_external_hour["time_sequence"] = cicids_external_hour["time_sequence"].apply(lambda x: [0]*(seq_len - 1) + x)

## Split into training and test sets
print("Split into training and test sets...")
cicids_source_hour_training = cicids_source_hour[cicids_source_hour["training"]==True]
cicids_destination_hour_training = cicids_destination_hour[cicids_destination_hour["training"]==True]
cicids_dyad_hour_training = cicids_dyad_hour[cicids_dyad_hour["training"]==True]
cicids_internal_hour_training = cicids_internal_hour[cicids_internal_hour["training"]==True]
cicids_external_hour_training = cicids_external_hour[cicids_external_hour["training"]==True]

cicids_source_hour_testing = cicids_source_hour[cicids_source_hour["training"]==False]
cicids_destination_hour_testing = cicids_destination_hour[cicids_destination_hour["training"]==False]
cicids_dyad_hour_testing = cicids_dyad_hour[cicids_dyad_hour["training"]==False]
cicids_internal_hour_testing = cicids_internal_hour[cicids_internal_hour["training"]==False]
cicids_external_hour_testing = cicids_external_hour[cicids_external_hour["training"]==False]

# def subToken(corpus, count):
#     counts = Counter([a for sublist in corpus for a in sublist])
#     above_count = set([key for key,val in counts.iteritems() if val >= count])
#     return [map(lambda x: x if x in above_count else "UNK", a) for a in corpus]

# ## Word2Vec models for sequence tokens
# w2v_source_hour = Word2Vec(subToken(cicids_source_hour_training["sequence"].tolist(), w2v_min_count), min_count=w2v_min_count, size=w2v_size, window=w2v_window, workers=w2v_workers)
# w2v_destination_hour = Word2Vec(subToken(cicids_destination_hour_training["sequence"].tolist(), w2v_min_count), min_count=w2v_min_count, size=w2v_size, window=w2v_window, workers=w2v_workers)
# w2v_dyad_hour = Word2Vec(subToken(cicids_dyad_hour_training["sequence"].tolist(), w2v_min_count), min_count=w2v_min_count, size=w2v_size, window=w2v_window, workers=w2v_workers)
# w2v_internal_hour = Word2Vec(subToken(cicids_internal_hour_training["sequence"].tolist(), w2v_min_count), min_count=w2v_min_count, size=w2v_size, window=w2v_window, workers=w2v_workers)
# w2v_external_hour = Word2Vec(subToken(cicids_external_hour_training["sequence"].tolist(), w2v_min_count), min_count=w2v_min_count, size=w2v_size, window=w2v_window, workers=w2v_workers)


Concat token strings...
Generate all sequences...
Add training set indicator...
Prepend sequences with zero values...
Split into training and test sets...


In [10]:
print("Save stuff...")
pickle.dump(port_fwd_dict,open("data/port_fwd_dict.pickle","wb"))
pickle.dump(port_rev_dict,open("data/port_rev_dict.pickle","wb"))
pickle.dump(protobytes_fwd_dict,open("data/protobytes_fwd_dict.pickle","wb"))
pickle.dump(protobytes_rev_dict,open("data/protobytes_rev_dict.pickle","wb"))
cicids_source_hour_training.to_pickle("data/cicids_source_hour_training.pickle")
cicids_destination_hour_training.to_pickle("data/cicids_destination_hour_training.pickle")
cicids_dyad_hour_training.to_pickle("data/cicids_dyad_hour_training.pickle")
cicids_internal_hour_training.to_pickle("data/cicids_internal_hour_training.pickle")
cicids_external_hour_training.to_pickle("data/cicids_external_hour_training.pickle")
cicids_source_hour_testing.to_pickle("data/cicids_source_hour_testing.pickle")
cicids_destination_hour_testing.to_pickle("data/cicids_destination_hour_testing.pickle")
cicids_dyad_hour_testing.to_pickle("data/cicids_dyad_hour_testing.pickle")
cicids_internal_hour_testing.to_pickle("data/cicids_internal_hour_testing.pickle")
cicids_external_hour_testing.to_pickle("data/cicids_external_hour_testing.pickle")

Save stuff...


In [13]:
def get_label_mode(X):
    X = filter(lambda x: x != "X", X)
    if len(set(X)) == 1 and list(set(X))[0] == "BENIGN":
        return("BENIGN")
    else:
        X = [a for a in X if a != "BENIGN"]
        return(max(set(X), key=X.count))
    
def cicids_processing(sequences, labels, dict_size, seq_len, seq_skip, resample=False):
    X = []
    Y = []
    L = []
    for ii, token_seq in enumerate(sequences):
        label_seq = labels[ii]
        for jj in range(0, len(token_seq)-seq_len, seq_skip):
            X.append(token_seq[jj:(jj+seq_len)])
            Y.append(to_categorical(int(token_seq[jj+seq_len])-1, dict_size))
            L.append(label_seq[jj+seq_len])
            
    if resample==True:
        indices = np.random.choice(np.arange(len(X)),size=len(X),replace=True)
    else:
        indices = np.arange(len(X))
    return((np.array(X)[indices], np.array(Y)[indices], np.array(L)[indices]))

In [14]:
# cicids_source_hour_testing = pickle.load(open("data/cicids_source_hour_testing.pickle","rb"))
X_test, Y_test, L_test = cicids_processing(cicids_source_hour_testing["port_sequence"].tolist(),
                         cicids_source_hour_testing["label_sequence"].tolist(),
                         len(port_fwd_dict)-1, seq_len, seq_skip, False)

pickle.dump([X_test, Y_test, L_test],open("results/source_ports_truth.pickle","wb"))

for ii in range(num_models):
    
    X_train, Y_train, L_test = cicids_processing(cicids_source_hour_training["port_sequence"].tolist(),
                         cicids_source_hour_training["label_sequence"].tolist(),
                         len(port_fwd_dict)-1, seq_len, seq_skip, True)
    
    model_input = Input(shape=(seq_len, 1))
    embedding_a = Embedding(len(port_fwd_dict), 100, mask_zero=True)(model_input)
    lstm_a = Bidirectional(LSTM(lstm_a_size, return_sequences=True), merge_mode="concat")(embedding_a)
    dropout_a = Dropout(0.2)(lstm_a)
    lstm_b = Bidirectional(LSTM(lstm_b_size, return_sequences=False, activation="relu"), merge_mode="concat")(lstm_a)
    dropout_b = Dropout(0.2)(lstm_b)
    dense_layer = Dense(dense_size, activation="linear")(dropout_b)
    dropout_c = Dropout(0.2)(dense_layer)
    lstm_c = Bidirectional(LSTM(lstm_a_size, return_sequences=True), merge_mode="concat")(dropout_c)
    dropout_c = Dropout(0.2)(lstm_c)
    model_output = Dense(len(port_fwd_dict)-1)(dropout_c)

    model = Model(inputs=model_input, outputs=model_output)
    model.compile(optimizer=TFOptimizer(tf.contrib.opt.LazyAdamOptimizer()), loss='categorical_crossentropy')
    print(model.summary())
    
#     model.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)

#     l2_in = Lambda(lambda x: K.l2_normalize(x, axis=2))(model_input)
#     l2_out = Lambda(lambda x: K.l2_normalize(x, axis=2))(lstm_d)
#     l2_multiply = Multiply()([l2_in,l2_out])
#     dot_product = Lambda(lambda x: -K.sum(x,axis=2))(l2_multiply)
#     output_max = Lambda(lambda x: K.max(x, axis=1, keepdims=True))(dot_product)
#     output_avg = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(dot_product)
    
#     flat_in = Flatten()(model_input)
#     flat_out = Flatten()(lstm_d)
#     output_dot = Dot(axes=(1,1), normalize=True)([flat_in, flat_out])

#     max_pred_model = Model(inputs=model_input, outputs=output_max)
#     avg_pred_model = Model(inputs=model_input, outputs=output_avg)
#     dot_pred_model = Model(inputs=model_input, outputs=output_dot)
    
#     source_max_preds = max_pred_model.predict(X_test, batch_size=batch_size)
#     source_avg_preds = avg_pred_model.predict(X_test, batch_size=batch_size)
#     source_dot_preds = dot_pred_model.predict(X_test, batch_size=batch_size)
    
#     pickle.dump([source_max_preds, source_avg_preds, source_dot_preds],
#                 open("asa_sdss_final/source_"+str(ii)+".pickle","wb"))
    
#     print(str(ii)+" complete.")

ValueError: Input 0 is incompatible with layer bidirectional_2: expected ndim=3, found ndim=4

In [None]:
destination_max_preds_list = []
destination_avg_preds_list = []
destination_dot_preds_list = []

X_test, Y_test = cicids_processing(cicids_destination_hour_testing["sequence"].tolist(),
                         cicids_destination_hour_testing["label_sequence"].tolist(),
                         w2v_destination_hour, w2v_size, seq_len, seq_skip, False)
pickle.dump([X_test, Y_test],open("asa_sdss_final/destination_truth.pickle","wb"))

for ii in range(num_models):
    
    X_train, Y_train = cicids_processing(cicids_destination_hour_training["sequence"].tolist(),
                         cicids_destination_hour_training["label_sequence"].tolist(),
                         w2v_destination_hour, w2v_size, seq_len, seq_skip, True)
    
    model_input = Input(shape=(seq_len, w2v_size))
    lstm_a = Bidirectional(LSTM(lstm_a_size, return_sequences=True), merge_mode="concat")(model_input)
    lstm_b = Bidirectional(LSTM(lstm_b_size, return_sequences=False), merge_mode="concat")(lstm_a)
    dropout_a = Dropout(0.2)(lstm_b)
    dense_layer = Dense(dense_size, activation="tanh")(dropout_a)
    repeat_layer = RepeatVector(seq_len)(dense_layer)
    dropout_b = Dropout(0.2)(repeat_layer)
    lstm_c = Bidirectional(LSTM(lstm_a_size, return_sequences=True), merge_mode="concat")(dropout_b)
    dropout_c = Dropout(0.2)(lstm_c)
    lstm_d = LSTM(w2v_size, return_sequences=True)(dropout_c)

    model = Model(inputs=model_input, outputs=lstm_d)
    model.compile(optimizer="RMSprop", loss='cosine_proximity')
    model.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)

    l2_in = Lambda(lambda x: K.l2_normalize(x, axis=2))(model_input)
    l2_out = Lambda(lambda x: K.l2_normalize(x, axis=2))(lstm_d)
    l2_multiply = Multiply()([l2_in,l2_out])
    dot_product = Lambda(lambda x: -K.sum(x,axis=2))(l2_multiply)
    output_max = Lambda(lambda x: K.max(x, axis=1, keepdims=True))(dot_product)
    output_avg = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(dot_product)
    
    flat_in = Flatten()(model_input)
    flat_out = Flatten()(lstm_d)
    output_dot = Dot(axes=(1,1), normalize=True)([flat_in, flat_out])

    max_pred_model = Model(inputs=model_input, outputs=output_max)
    avg_pred_model = Model(inputs=model_input, outputs=output_avg)
    dot_pred_model = Model(inputs=model_input, outputs=output_dot)
    
    destination_max_preds = max_pred_model.predict(X_test, batch_size=batch_size)
    destination_avg_preds = avg_pred_model.predict(X_test, batch_size=batch_size)
    destination_dot_preds = dot_pred_model.predict(X_test, batch_size=batch_size)
    
    pickle.dump([destination_max_preds, destination_avg_preds, destination_dot_preds],
                open("asa_sdss_final/destination_"+str(ii)+".pickle","wb"))
    
    print(str(ii)+" complete.")

In [None]:
dyad_max_preds_list = []
dyad_avg_preds_list = []
dyad_dot_preds_list = []

X_test, Y_test = cicids_processing(cicids_dyad_hour_testing["sequence"].tolist(),
                         cicids_dyad_hour_testing["label_sequence"].tolist(),
                         w2v_dyad_hour, w2v_size, seq_len, seq_skip, False)
pickle.dump([X_test, Y_test],open("asa_sdss_final/dyad_truth.pickle","wb"))

for ii in range(num_models):
    
    X_train, Y_train = cicids_processing(cicids_dyad_hour_training["sequence"].tolist(),
                         cicids_dyad_hour_training["label_sequence"].tolist(),
                         w2v_dyad_hour, w2v_size, seq_len, seq_skip, True)
    
    model_input = Input(shape=(seq_len, w2v_size))
    lstm_a = Bidirectional(LSTM(lstm_a_size, return_sequences=True), merge_mode="concat")(model_input)
    lstm_b = Bidirectional(LSTM(lstm_b_size, return_sequences=False), merge_mode="concat")(lstm_a)
    dropout_a = Dropout(0.2)(lstm_b)
    dense_layer = Dense(dense_size, activation="tanh")(dropout_a)
    repeat_layer = RepeatVector(seq_len)(dense_layer)
    dropout_b = Dropout(0.2)(repeat_layer)
    lstm_c = Bidirectional(LSTM(lstm_a_size, return_sequences=True), merge_mode="concat")(dropout_b)
    dropout_c = Dropout(0.2)(lstm_c)
    lstm_d = LSTM(w2v_size, return_sequences=True)(dropout_c)

    model = Model(inputs=model_input, outputs=lstm_d)
    model.compile(optimizer="RMSprop", loss='cosine_proximity')
    model.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)

    l2_in = Lambda(lambda x: K.l2_normalize(x, axis=2))(model_input)
    l2_out = Lambda(lambda x: K.l2_normalize(x, axis=2))(lstm_d)
    l2_multiply = Multiply()([l2_in,l2_out])
    dot_product = Lambda(lambda x: -K.sum(x,axis=2))(l2_multiply)
    output_max = Lambda(lambda x: K.max(x, axis=1, keepdims=True))(dot_product)
    output_avg = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(dot_product)
    
    flat_in = Flatten()(model_input)
    flat_out = Flatten()(lstm_d)
    output_dot = Dot(axes=(1,1), normalize=True)([flat_in, flat_out])

    max_pred_model = Model(inputs=model_input, outputs=output_max)
    avg_pred_model = Model(inputs=model_input, outputs=output_avg)
    dot_pred_model = Model(inputs=model_input, outputs=output_dot)
    
    dyad_max_preds = max_pred_model.predict(X_test, batch_size=batch_size)
    dyad_avg_preds = avg_pred_model.predict(X_test, batch_size=batch_size)
    dyad_dot_preds = dot_pred_model.predict(X_test, batch_size=batch_size)
    
    pickle.dump([dyad_max_preds, dyad_avg_preds, dyad_dot_preds],
                open("asa_sdss_final/dyad_"+str(ii)+".pickle","wb"))
    
    print(str(ii)+" complete.")

In [None]:
internal_max_preds_list = []
internal_avg_preds_list = []
internal_dot_preds_list = []

X_test, Y_test = cicids_processing(cicids_internal_hour_testing["sequence"].tolist(),
                         cicids_internal_hour_testing["label_sequence"].tolist(),
                         w2v_internal_hour, w2v_size, seq_len, seq_skip, False)
pickle.dump([X_test, Y_test],open("asa_sdss_final/internal_truth.pickle","wb"))

for ii in range(num_models):
    
    X_train, Y_train = cicids_processing(cicids_internal_hour_training["sequence"].tolist(),
                         cicids_internal_hour_training["label_sequence"].tolist(),
                         w2v_internal_hour, w2v_size, seq_len, seq_skip, True)
    
    model_input = Input(shape=(seq_len, w2v_size))
    lstm_a = Bidirectional(LSTM(lstm_a_size, return_sequences=True), merge_mode="concat")(model_input)
    lstm_b = Bidirectional(LSTM(lstm_b_size, return_sequences=False), merge_mode="concat")(lstm_a)
    dropout_a = Dropout(0.2)(lstm_b)
    dense_layer = Dense(dense_size, activation="tanh")(dropout_a)
    repeat_layer = RepeatVector(seq_len)(dense_layer)
    dropout_b = Dropout(0.2)(repeat_layer)
    lstm_c = Bidirectional(LSTM(lstm_a_size, return_sequences=True), merge_mode="concat")(dropout_b)
    dropout_c = Dropout(0.2)(lstm_c)
    lstm_d = LSTM(w2v_size, return_sequences=True)(dropout_c)

    model = Model(inputs=model_input, outputs=lstm_d)
    model.compile(optimizer="RMSprop", loss='cosine_proximity')
    model.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)

    l2_in = Lambda(lambda x: K.l2_normalize(x, axis=2))(model_input)
    l2_out = Lambda(lambda x: K.l2_normalize(x, axis=2))(lstm_d)
    l2_multiply = Multiply()([l2_in,l2_out])
    dot_product = Lambda(lambda x: -K.sum(x,axis=2))(l2_multiply)
    output_max = Lambda(lambda x: K.max(x, axis=1, keepdims=True))(dot_product)
    output_avg = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(dot_product)
    
    flat_in = Flatten()(model_input)
    flat_out = Flatten()(lstm_d)
    output_dot = Dot(axes=(1,1), normalize=True)([flat_in, flat_out])

    max_pred_model = Model(inputs=model_input, outputs=output_max)
    avg_pred_model = Model(inputs=model_input, outputs=output_avg)
    dot_pred_model = Model(inputs=model_input, outputs=output_dot)
    
    internal_max_preds = max_pred_model.predict(X_test, batch_size=batch_size)
    internal_avg_preds = avg_pred_model.predict(X_test, batch_size=batch_size)
    internal_dot_preds = dot_pred_model.predict(X_test, batch_size=batch_size)
    
    pickle.dump([internal_max_preds, internal_avg_preds, internal_dot_preds],
                open("asa_sdss_final/internal_"+str(ii)+".pickle","wb"))
    
    print(str(ii)+" complete.")

In [None]:
external_max_preds_list = []
external_avg_preds_list = []
external_dot_preds_list = []

X_test, Y_test = cicids_processing(cicids_external_hour_testing["sequence"].tolist(),
                         cicids_external_hour_testing["label_sequence"].tolist(),
                         w2v_external_hour, w2v_size, seq_len, seq_skip, False)
pickle.dump([X_test, Y_test],open("asa_sdss_final/external_truth.pickle","wb"))

for ii in range(num_models):
    
    X_train, Y_train = cicids_processing(cicids_external_hour_training["sequence"].tolist(),
                         cicids_external_hour_training["label_sequence"].tolist(),
                         w2v_external_hour, w2v_size, seq_len, seq_skip, True)
    
    model_input = Input(shape=(seq_len, w2v_size))
    lstm_a = Bidirectional(LSTM(lstm_a_size, return_sequences=True), merge_mode="concat")(model_input)
    lstm_b = Bidirectional(LSTM(lstm_b_size, return_sequences=False), merge_mode="concat")(lstm_a)
    dropout_a = Dropout(0.2)(lstm_b)
    dense_layer = Dense(dense_size, activation="tanh")(dropout_a)
    repeat_layer = RepeatVector(seq_len)(dense_layer)
    dropout_b = Dropout(0.2)(repeat_layer)
    lstm_c = Bidirectional(LSTM(lstm_a_size, return_sequences=True), merge_mode="concat")(dropout_b)
    dropout_c = Dropout(0.2)(lstm_c)
    lstm_d = LSTM(w2v_size, return_sequences=True)(dropout_c)

    model = Model(inputs=model_input, outputs=lstm_d)
    model.compile(optimizer="RMSprop", loss='cosine_proximity')
    model.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)

    l2_in = Lambda(lambda x: K.l2_normalize(x, axis=2))(model_input)
    l2_out = Lambda(lambda x: K.l2_normalize(x, axis=2))(lstm_d)
    l2_multiply = Multiply()([l2_in,l2_out])
    dot_product = Lambda(lambda x: -K.sum(x,axis=2))(l2_multiply)
    output_max = Lambda(lambda x: K.max(x, axis=1, keepdims=True))(dot_product)
    output_avg = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(dot_product)
    
    flat_in = Flatten()(model_input)
    flat_out = Flatten()(lstm_d)
    output_dot = Dot(axes=(1,1), normalize=True)([flat_in, flat_out])

    max_pred_model = Model(inputs=model_input, outputs=output_max)
    avg_pred_model = Model(inputs=model_input, outputs=output_avg)
    dot_pred_model = Model(inputs=model_input, outputs=output_dot)
    
    external_max_preds = max_pred_model.predict(X_test, batch_size=batch_size)
    external_avg_preds = avg_pred_model.predict(X_test, batch_size=batch_size)
    external_dot_preds = dot_pred_model.predict(X_test, batch_size=batch_size)
    
    pickle.dump([external_max_preds, external_avg_preds, external_dot_preds],
                open("asa_sdss_final/external_"+str(ii)+".pickle","wb"))
    
    print(str(ii)+" complete.")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

def evaluate_results(Y_test, score_arrays):
    models = {}
    for val, key in enumerate(np.unique(Y_test).tolist()):
        models[key] = {"fpr":[],"tpr":[],"threshold":[],"auc":[]}
        for preds in score_arrays:
            results_tuple = roc_curve(Y_test == key, preds if key!="BENIGN" else -preds, pos_label=1)
            models[key]["fpr"].append(results_tuple[0])
            models[key]["tpr"].append(results_tuple[1])
            models[key]["threshold"].append(results_tuple[2])
            models[key]["auc"].append(auc(results_tuple[0], results_tuple[1]))
    return models

def make_roc_plots(result_set, attack_names, title_postfix="", file_prefix=""):
    for key, value in attack_names:
        
        fpr = {i:v for i,v in enumerate(result_set[key]["fpr"])}
        tpr = {i:v for i,v in enumerate(result_set[key]["tpr"])}
        roc_auc = {i:v for i,v in enumerate(result_set[key]["auc"])}
        n_classes = len(fpr)
        
        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

        # Then interpolate all ROC curves at this points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])

        # Finally average it and compute AUC
        mean_tpr /= n_classes

        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        # Plot all ROC curves
        plt.figure()

        plt.plot(fpr["macro"], tpr["macro"],
                 label='Average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["macro"]),
                 color='black', linewidth=4)

        colors = cycle(['gray'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i], tpr[i], color=color, lw=1)

        plt.plot([0, 1], [0, 1], 'k--', lw=1)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(value + " ROC Curve"+title_postfix)
        plt.legend(loc="lower right")
        plt.savefig('asa_sdss_final_figs/'+file_prefix+key+"_dot.pdf")
        plt.show()

In [None]:
attacks = [("BENIGN","All Attacks"),
           ("Bot","Bot"),
           ("DDoS","DDoS"),
           ("DoS GoldenEye","DoS GoldenEye"),
           ("DoS Hulk","DoS Hulk"),
           ("DoS Slowhttptest","DoS Slow HTTP Test"),
           ("DoS slowloris","DoS Slow Loris"),
           ("FTPPatator","FTPPatator"),
           ("SSHPatator","SSHPatator"),
           ("Heartbleed","Heartbleed"),
           ("Infiltration","Infiltration"),
           ("PortScan","Port Scan"),
           ("Web Attack  Brute Force","Web Attack Brute Force"),
           ("Web Attack  Sql Injection","Web Attack SQL Injection"),
           ("Web Attack  XSS","Web Attack XSS")]

In [None]:
agg_rules = ["source","destination","dyad","internal","external"]
for agg in agg_rules:
    dot_0 = pickle.load(open("asa_sdss_final/"+agg+"_0.pickle","rb"))[2]
    dot_1 = pickle.load(open("asa_sdss_final/"+agg+"_1.pickle","rb"))[2]
    dot_2 = pickle.load(open("asa_sdss_final/"+agg+"_2.pickle","rb"))[2]
    Y_test = pickle.load(open("asa_sdss_final/"+agg+"_truth.pickle","rb"))[1]

    models_dot = evaluate_results(Y_test, [dot_0, dot_1, dot_2])
    make_roc_plots(models_dot, attacks, " - "+agg+" aggregation", "agg_")