# Load indexes and normalisations

In [1]:
# where the data lives:

path = "~/data/LANL/"

In [19]:
# load box/port info from json:

import json
import re

data = json.load(open("boxport_data.json", 'r'))

def jsonIntKeys(x):
    if isinstance(x, dict):
            return {int(k):str(v) for k,v in x.items()}
    return x

def jsonIntVals(x):
    if isinstance(x, dict):
            return {str(k):int(v) for k,v in x.items()}
    return x

box_index = jsonIntVals( data['box_indices'] )
index_box = jsonIntKeys( data['indices_box'] )
port_index = jsonIntVals( data['port_indices'] )
index_port = jsonIntKeys( data['indices_port'] )

bbox = set([re.sub('[\"\n]+', "", s) for s in box_indices.keys()])

In [20]:
# define netflow RDD - filter to boxes in 'bbox' (busy computers):

flowfile = path + "flows.csv"
flows = sc.textFile(flowfile).map(lambda line: [str(x) for x in line.split(',')])

def filt(x):
    return [int(x[0]), int(x[1]), x[2], x[3], x[4], x[5], int(x[6]), int(x[7]), int(x[8])]

subflows = flows.filter(lambda f: f[2] in bbox and f[4] in bbox).map(filt)

In [23]:
# define normalisation mapping to be readable in Keras:

from math import log, floor

protocols = [1,6,17,41]
proto_index = dict((c, i) for i, c in enumerate(protocols))
index_proto = dict((i, c) for i, c in enumerate(protocols))

def logbin(x):
    return int(floor(log(x, 2)))

def normal_port(x):
    if re.match('^N', x)!=None: 
        return 'N' 
    else: 
        return x

def normalise(x):
    return [x[0], 
            logbin(x[1]+1),
            box_indices[x[2]],
            port_indices[normal_port(x[3])],
            box_indices[x[4]],
            port_indices[normal_port(x[5])],
            proto_index[x[6]], 
            logbin(x[7]), 
            logbin(x[8])]

# check:
subflows.map(normalise).take(10)

[[1, 0, 9501, 138, 2205, 122, 1, 3, 12],
 [1, 0, 2244, 122, 8986, 122, 1, 2, 9],
 [1, 0, 2244, 122, 8986, 122, 1, 2, 9],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 8986, 122, 2244, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2244, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2408, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2408, 122, 1, 2, 8]]

Note that the schema is:

<ul>
<li> timestamp
<li> duration (log)
<li> source computer
<li> source port
<li> destination computer
<li> destination port
<li> protocol
<li> nr packets (log)
<li> nr bytes (log)
</ul>

In [142]:
# set input vector sizes:

lg_duration = 7
lg_packets = 22
lg_bytes = 32
nr_proto = 4
nr_box = len(indices_box)   # 10109
nr_port = len(indices_port) #   326

# RNN architecture

In [255]:
from keras.layers import Input, Embedding, LSTM, Dense, Merge
from keras.models import Model
from keras.utils.visualize_util import plot

# hyperparameters:
nhidden = [512, 512]
unroll = 32
embed_dim = 256

# netflow inputs:
src_input = Input(shape=(unroll,), dtype='int32', name='src_input')
dst_input = Input(shape=(unroll,), dtype='int32', name='dst_input')
src_pt_input = Input(shape=(unroll,), dtype='int32', name='src_pt')
dst_pt_input = Input(shape=(unroll,), dtype='int32', name='dst_pt')
proto_input = Input(shape=(unroll,), dtype='int32', name='proto')
duration_input = Input(shape=(unroll,), dtype='int32', name='duration')
packets_input = Input(shape=(unroll,), dtype='int32', name='packets')
bytes_input = Input(shape=(unroll,), dtype='int32', name='bytes')

# embed:
src = Embedding(output_dim=embed_dim, input_dim=nr_box)(src_input)
dst = Embedding(output_dim=embed_dim, input_dim=nr_box)(dst_input)
src_pt = Embedding(output_dim=128, input_dim=nr_port)(src_pt_input)
dst_pt = Embedding(output_dim=128, input_dim=nr_port)(dst_pt_input)
proto = Embedding(output_dim=2, input_dim=nr_proto)(proto_input)
duration = Embedding(output_dim=4, input_dim=lg_duration)(duration_input)
packets = Embedding(output_dim=8, input_dim=lg_packets)(packets_input)
bytes = Embedding(output_dim=16, input_dim=lg_bytes)(bytes_input)

# merge:
data_merged = merge([src, dst, src_pt, dst_pt, proto, duration, packets, bytes], mode='concat')

# add src computer for next time-step, as a query stream to train on:
next_src = Input(shape=(unroll,), dtype='int32', name='next_src')
query = Embedding(output_dim=embed_dim, input_dim=nr_box)(next_src)

# pass data and query to RNN layers:
inner = merge([data_merged, query], mode='concat')
for i in range(len(nhidden)-1):
    inner = LSTM(nhidden[i], return_sequences=True)(inner)
inner = LSTM(nhidden[-1], return_sequences=False)(inner)

# add softmax outputs:
proto_output = Dense(4, activation='softmax', name='proto_output')(inner)
duration_output = Dense(lg_duration, activation='softmax', name='duration_output')(inner)
packets_output = Dense(lg_packets, activation='softmax', name='packets_output')(inner)
bytes_output = Dense(lg_bytes, activation='softmax', name='bytes_output')(inner)
src_port_output = Dense(nr_port, activation='softmax', name='src_port_output')(inner)
dst_port_output = Dense(nr_port, activation='softmax', name='dst_port_output')(inner)

# add dst computer output:
next_dst = Dense(embed_dim, activation='relu', name='next_dst')(inner)

# put it all together:
model = Model(input=[src_input,
                     dst_input,
                     src_pt_input,
                     dst_pt_input,
                     proto_input,
                     duration_input,
                     packets_input, 
                     bytes_input,
                     next_src], 
              output=[proto_output,
                      duration_output,
                      packets_output,
                      bytes_output,
                      src_port_output,
                      dst_port_output,
                      next_dst])
model.compile(optimizer='rmsprop', 
              loss=['categorical_crossentropy' for i in range(6)] + ['mse'],
              loss_weights=[1., 1., 1., 1., 1., 1., 1.])

# ...and summarise:
model.summary()
plot(model, to_file='model.png')

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
bytes (InputLayer)                 (None, 32)          0                                            
____________________________________________________________________________________________________
dst_input (InputLayer)             (None, 32)          0                                            
____________________________________________________________________________________________________
dst_pt (InputLayer)                (None, 32)          0                                            
____________________________________________________________________________________________________
duration (InputLayer)              (None, 32)          0                                            
___________________________________________________________________________________________

# Testing the architecture

In [301]:
import numpy as np

N = 100
x = np.array( subflows.map(normalise).take(N) )

In [312]:
def make_training_vectors(z, mx, unroll=32, step=1):
    windows = []
    n = len(z)
    for i in range(0, n - unroll, step):
        windows.append(z[i: i + unroll])
    X = np.zeros((len(windows), unroll, mx), dtype='int32')
    for i, win in enumerate(windows):
        for t, w in enumerate(win):
            X[i, t, w] = 1
    return X

src = make_training_vectors(x[range(N-1), 2], nr_box)
dst = make_training_vectors(x[range(N-1), 4], nr_box)
src_pt = make_training_vectors(x[range(N-1), 3], nr_port)
dst_pt = make_training_vectors(x[range(N-1), 5], nr_port)
proto = make_training_vectors(x[range(N-1), 6], nr_proto)
duration = make_training_vectors(x[range(N-1), 1], lg_duration)
packets = make_training_vectors(x[range(N-1), 7], lg_packets)
bytes = make_training_vectors(x[range(N-1),8], lg_bytes)
next_src = make_training_vectors(x[range(1,N),2], nr_box)

In [319]:
next_src.shape

(67, 32, 10109)

In [314]:
model([
        src,
        dst,
        src_pt,
        dst_pt,
        proto,
        duration,
        packets,
        bytes,
        next_src   
      ])

ValueError: Dimensions Dimension(10109) and Dimension(326) are not compatible

In [293]:
?model