# Load indexes and normalisations

In [1]:
# where the data lives:
#
# The directory can be overridden by setting the LANL_DATA_DIR environment
# variable, so the notebook is not tied to a single machine. When the variable
# is unset the original location is used. os.path.join(..., "") guarantees a
# trailing separator, because the value is later concatenated with a file name.

import os

path = os.path.join(os.environ.get("LANL_DATA_DIR", "/Users/wmoxbury/data/LANL/"), "")

In [2]:
# load box/port info from json:

import json
import re

# Parse the box/port lookup tables. A context manager is used so the file
# handle is closed as soon as the JSON has been read.
with open("boxport_data.json", 'r') as boxport_file:
    data = json.load(boxport_file)

def jsonIntKeys(x):
    """Return a new dict built from `x` with int keys and str values.

    If `x` is not a dict it is returned unchanged.
    """
    if not isinstance(x, dict):
        return x
    converted = {}
    for key, value in x.items():
        converted[int(key)] = str(value)
    return converted

def jsonIntVals(x):
    """Return a new dict built from `x` with str keys and int values.

    If `x` is not a dict it is returned unchanged.
    """
    if not isinstance(x, dict):
        return x
    converted = {}
    for key, value in x.items():
        converted[str(key)] = int(value)
    return converted

# name -> index lookups (values coerced to int):
box_index = jsonIntVals(data['box_indices'])
port_index = jsonIntVals(data['port_indices'])

# index -> name lookups (keys coerced to int):
index_box = jsonIntKeys(data['indices_box'])
index_port = jsonIntKeys(data['indices_port'])

# set of box names with any double-quote / newline characters removed:
bbox = {re.sub('[\"\n]+', "", name) for name in box_index}

In [3]:
# define netflow RDD - filter to boxes in 'bbox' (busy computers):

flowfile = path + "flows.csv"

def _split_fields(line):
    """Split one text line on commas and coerce every field to `str`."""
    return [str(field) for field in line.split(',')]

# one list of string fields per line of the flow file
flows = sc.textFile(flowfile).map(_split_fields)

def filt(x):
    """Cast the numeric columns of a raw flow record to int.

    Columns 0, 1, 6, 7 and 8 are converted with `int`; columns 2 to 5 are
    passed through untouched. The returned list always has nine entries.
    """
    leading = [int(x[0]), int(x[1])]
    endpoints = [x[2], x[3], x[4], x[5]]
    trailing = [int(x[6]), int(x[7]), int(x[8])]
    return leading + endpoints + trailing

def _both_in_bbox(f):
    """True when both column 2 and column 4 of the record are members of `bbox`."""
    return f[2] in bbox and f[4] in bbox

# keep only flows between two boxes in `bbox`, then cast numeric columns
subflows = flows.filter(_both_in_bbox).map(filt)

In [4]:
# define normalisation mapping to be readable in Keras:

from math import log, floor

# protocol numbers seen in the flows, and lookups in both directions
protocols = [1, 6, 17, 41]
proto_index = {code: position for position, code in enumerate(protocols)}
index_proto = dict(enumerate(protocols))

def logbin(x):
    """Return floor(log2(x)), the index of the power-of-two bin containing `x`.

    For integer inputs the result is computed exactly from the bit length.
    The floating-point route `floor(log(x, 2))` first converts the integer to
    a double, so a value such as 2**60 - 1 is rounded up to 2**60 and lands in
    the wrong bin; it is also sensitive to rounding at exact powers of two.
    Inputs without a `bit_length` method (e.g. floats) still use the
    logarithm.

    Parameters
    ----------
    x : positive number

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If `x` is zero or negative (the logarithm is undefined).
    """
    if x <= 0:
        raise ValueError("logbin requires a positive value, got %r" % (x,))
    bit_length = getattr(x, 'bit_length', None)
    if bit_length is not None:
        # exact for arbitrarily large Python integers
        return bit_length() - 1
    return int(floor(log(x, 2)))

def normal_port(x):
    """Collapse any port label that begins with 'N' to the single token 'N'.

    Labels that do not begin with 'N' are returned unchanged.
    """
    starts_with_n = re.match('^N', x) is not None
    return 'N' if starts_with_n else x

def normalise(x):
    """Convert one filtered flow record into a list of integer codes.

    The output keeps the input column order: column 0 is passed through,
    column 1 is log-binned after adding 1, columns 2 and 4 are looked up in
    `box_index`, columns 3 and 5 are passed through `normal_port` and looked
    up in `port_index`, column 6 is looked up in `proto_index`, and columns 7
    and 8 are log-binned.
    """
    timestamp = x[0]
    duration_bin = logbin(x[1]+1)
    src_box = box_index[x[2]]
    src_port = port_index[normal_port(x[3])]
    dst_box = box_index[x[4]]
    dst_port = port_index[normal_port(x[5])]
    protocol = proto_index[x[6]]
    packets_bin = logbin(x[7])
    bytes_bin = logbin(x[8])
    return [timestamp, duration_bin, src_box, src_port, dst_box, dst_port,
            protocol, packets_bin, bytes_bin]

# check: normalise the filtered flows and display the first ten records
normalised_preview = subflows.map(normalise).take(10)
normalised_preview

[[1, 0, 9501, 138, 2205, 122, 1, 3, 12],
 [1, 0, 2244, 122, 8986, 122, 1, 2, 9],
 [1, 0, 2244, 122, 8986, 122, 1, 2, 9],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 8986, 122, 2244, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2244, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2408, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2408, 122, 1, 2, 8]]

Note that each normalised record has the following fields, in this order:

<ul>
<li> timestamp
<li> duration (log)
<li> source computer
<li> source port
<li> destination computer
<li> destination port
<li> protocol
<li> nr packets (log)
<li> nr bytes (log)
</ul>

In [5]:
# set input vector sizes:

# vocabulary sizes taken from the loaded index tables
nr_box = len(index_box)   # 10109
nr_port = len(index_port) #   326

# fixed sizes for the protocol field and the log-binned fields
nr_proto = 4
lg_duration, lg_packets, lg_bytes = 7, 22, 32

# RNN architecture

In [6]:
from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.models import Model
from keras.engine.topology import merge
from keras.utils.visualize_util import plot

# hyperparameters:
nhidden = [512] * 2   # units per LSTM layer
unroll = 32           # length of each input window
embed_dim = 256       # width of the shared computer embedding
dropout_W = dropout_U = 0.2  # W: input gates, U: recurrent connections

# netflow inputs: one int32 tensor of length `unroll` per field of a flow record.
# NOTE(review): other cells read the shared embedding positionally via
# get_weights()[6]; that index presumably depends on the order in which the
# layers below are created, so avoid reordering these statements -- confirm.
src_input = Input(shape=(unroll,), dtype='int32', name='src_input')
dst_input = Input(shape=(unroll,), dtype='int32', name='dst_input')
src_pt_input = Input(shape=(unroll,), dtype='int32', name='src_pt')
dst_pt_input = Input(shape=(unroll,), dtype='int32', name='dst_pt')
proto_input = Input(shape=(unroll,), dtype='int32', name='proto')
duration_input = Input(shape=(unroll,), dtype='int32', name='duration')
packets_input = Input(shape=(unroll,), dtype='int32', name='packets')
bytes_input = Input(shape=(unroll,), dtype='int32', name='bytes')

# shared embedding for computer feeds: the same layer instance is applied to
# the source input, the destination input and (further down) the next-source query.
comp_encoding = Embedding(output_dim=embed_dim, input_dim=nr_box, input_length=unroll)
src = comp_encoding(src_input)
dst = comp_encoding(dst_input)

# other embeddings: each remaining field gets its own embedding layer
# (the two port fields use separate layers of identical size).
src_pt = Embedding(output_dim=128, input_dim=nr_port, input_length=unroll)(src_pt_input)
dst_pt = Embedding(output_dim=128, input_dim=nr_port, input_length=unroll)(dst_pt_input)
proto = Embedding(output_dim=2, input_dim=nr_proto, input_length=unroll)(proto_input)
duration = Embedding(output_dim=4, input_dim=lg_duration, input_length=unroll)(duration_input)
packets = Embedding(output_dim=8, input_dim=lg_packets, input_length=unroll)(packets_input)
# NOTE(review): the name `bytes` shadows the Python builtin from here onwards.
bytes = Embedding(output_dim=16, input_dim=lg_bytes, input_length=unroll)(bytes_input)

# merge: concatenate the non-computer embeddings into one tensor
data_merged = merge([src_pt, dst_pt, proto, duration, packets, bytes], mode='concat')

# add src computer for next time-step, as a query stream to train on
# (encoded with the shared computer embedding):
next_src = Input(shape=(unroll,), dtype='int32', name='next_src')
query = comp_encoding(next_src)

# pass data and query to RNN layers:
inner = merge([data_merged, src, dst, query], mode='concat')
# every LSTM layer except the last returns the full sequence for the next layer
for i in range(len(nhidden)-1):
    inner = LSTM(nhidden[i], return_sequences=True, dropout_U=dropout_U, dropout_W=dropout_W)(inner)
# the last LSTM layer is built with return_sequences=False
inner = LSTM(nhidden[-1], return_sequences=False, dropout_U=dropout_U, dropout_W=dropout_W)(inner)
# the dropout_W rate is reused for the dropout applied to the LSTM output
inner = Dropout(dropout_W)(inner)

# add softmax outputs: one classification head per categorical field, each
# sized by the same constant used for that field's input embedding.
# The protocol head uses `nr_proto` instead of a literal 4, so it cannot drift
# out of step with the protocol embedding if the protocol list changes.
proto_output = Dense(nr_proto, activation='softmax', name='proto_output')(inner)
duration_output = Dense(lg_duration, activation='softmax', name='duration_output')(inner)
packets_output = Dense(lg_packets, activation='softmax', name='packets_output')(inner)
bytes_output = Dense(lg_bytes, activation='softmax', name='bytes_output')(inner)
src_port_output = Dense(nr_port, activation='softmax', name='src_port_output')(inner)
dst_port_output = Dense(nr_port, activation='softmax', name='dst_port_output')(inner)

# add dst computer output: a vector of width `embed_dim`, i.e. the same width
# as the shared computer embedding.
next_dst = Dense(embed_dim, activation='relu', name='next_dst')(inner)

# put it all together:
model_inputs = [src_input,
                dst_input,
                src_pt_input,
                dst_pt_input,
                proto_input,
                duration_input,
                packets_input,
                bytes_input,
                next_src]
model_outputs = [proto_output,
                 duration_output,
                 packets_output,
                 bytes_output,
                 src_port_output,
                 dst_port_output,
                 next_dst]
model = Model(input=model_inputs, output=model_outputs)

# six softmax heads use cross-entropy; the final (next_dst) head uses mse
output_losses = ['categorical_crossentropy'] * 6 + ['mse']
output_loss_weights = [0.5, 1., 1., 1., 2., 2., 4.]
model.compile(optimizer='rmsprop',
              loss=output_losses,
              loss_weights=output_loss_weights)

# ...and summarise:
model.summary()
plot(model, to_file='model.png')

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
bytes (InputLayer)                 (None, 32)          0                                            
____________________________________________________________________________________________________
dst_pt (InputLayer)                (None, 32)          0                                            
____________________________________________________________________________________________________
duration (InputLayer)              (None, 32)          0                                            
____________________________________________________________________________________________________
packets (InputLayer)               (None, 32)          0                                            
___________________________________________________________________________________________

Using TensorFlow backend.


# Testing the architecture

In [20]:
import numpy as np

N = 100
# take N+1 records: rows 0..N-1 feed the inputs, rows 1..N feed the shifted series
normalised_head = subflows.map(normalise).take(N+1)
train = np.array(normalised_head)

def make_input_vectors(z, mx, unroll=unroll):
    """Stack overlapping windows of `z` into a 2-D float32 array.

    Row i of the result is z[i : i + unroll]; there are len(z) - unroll rows.

    Parameters
    ----------
    z : 1-D sequence
    mx : accepted but not used by this function
    unroll : window length (defaults to the module-level `unroll` at definition time)
    """
    n_windows = len(z) - unroll
    X = np.zeros((n_windows, unroll), dtype='float32')
    start = 0
    while start < n_windows:
        X[start, :] = z[start: start + unroll]
        start += 1
    return X

# row selections: the first N flows, and the same span shifted one flow ahead
current_rows = list(range(N))
next_rows = list(range(1, N+1))

def _windows_for(rows, column, size):
    """Window one column of `train`, restricted to `rows`, via make_input_vectors."""
    return make_input_vectors(train[rows, column], size)

src_in = _windows_for(current_rows, 2, nr_box)
dst_in = _windows_for(current_rows, 4, nr_box)
src_pt_in = _windows_for(current_rows, 3, nr_port)
dst_pt_in = _windows_for(current_rows, 5, nr_port)
proto_in = _windows_for(current_rows, 6, nr_proto)
duration_in = _windows_for(current_rows, 1, lg_duration)
packets_in = _windows_for(current_rows, 7, lg_packets)
bytes_in = _windows_for(current_rows, 8, lg_bytes)
next_src_in = _windows_for(next_rows, 2, nr_box)

# same order as the `input=` list given to Model(...)
input = [src_in, dst_in,
         src_pt_in, dst_pt_in,
         proto_in, duration_in, packets_in, bytes_in,
         next_src_in]

In [23]:
# the following works in theano but gives an error with tensorflow...

#input
predict_batch_size = 1
output = model.predict(input, batch_size=predict_batch_size)
#wts = model.get_weights()

# optional inspection, left disabled:
#print([y.shape for y in output])
#print(output[0][1:10])
#print([w.shape for w in wts])

UnimplementedError: Cast uint8 to bool is not supported
	 [[Node: Cast = Cast[DstT=DT_BOOL, SrcT=DT_UINT8, _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_keras_learning_phase_0)]]
Caused by op u'Cast', defined at:
  File "/Developer/anaconda/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/Developer/anaconda/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/Developer/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Developer/anaconda/lib/python2.7/site-packages/traitlets/config/application.py", line 592, in launch_instance
    app.start()
  File "/Developer/anaconda/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 403, in start
    ioloop.IOLoop.instance().start()
  File "/Developer/anaconda/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/Developer/anaconda/lib/python2.7/site-packages/tornado/ioloop.py", line 883, in start
    handler_func(fd_obj, events)
  File "/Developer/anaconda/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Developer/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Developer/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Developer/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Developer/anaconda/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Developer/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 260, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Developer/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 212, in dispatch_shell
    handler(stream, idents, msg)
  File "/Developer/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 370, in execute_request
    user_expressions, allow_stdin)
  File "/Developer/anaconda/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 175, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Developer/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2902, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Developer/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 3006, in run_ast_nodes
    if self.run_code(code, result):
  File "/Developer/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 3066, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-6f9ec0ed86b7>", line 46, in <module>
    inner = LSTM(nhidden[i], return_sequences=True, dropout_U=dropout_U, dropout_W=dropout_W)(inner)
  File "/Developer/anaconda/lib/python2.7/site-packages/keras/engine/topology.py", line 485, in __call__
    self.add_inbound_node(inbound_layers, node_indices, tensor_indices)
  File "/Developer/anaconda/lib/python2.7/site-packages/keras/engine/topology.py", line 543, in add_inbound_node
    Node.create_node(self, inbound_layers, node_indices, tensor_indices)
  File "/Developer/anaconda/lib/python2.7/site-packages/keras/engine/topology.py", line 148, in create_node
    output_tensors = to_list(outbound_layer.call(input_tensors[0], mask=input_masks[0]))
  File "/Developer/anaconda/lib/python2.7/site-packages/keras/layers/recurrent.py", line 218, in call
    constants = self.get_constants(x)
  File "/Developer/anaconda/lib/python2.7/site-packages/keras/layers/recurrent.py", line 770, in get_constants
    B_U = [K.in_train_phase(K.dropout(ones, self.dropout_U), ones) for _ in range(4)]
  File "/Developer/anaconda/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py", line 802, in in_train_phase
    x = tf.python.control_flow_ops.cond(tf.cast(_LEARNING_PHASE, 'bool'),
  File "/Developer/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/math_ops.py", line 300, in cast
    return gen_math_ops.cast(x, dtype, name=name)
  File "/Developer/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/gen_math_ops.py", line 193, in cast
    return _op_def_lib.apply_op("Cast", x=x, DstT=DstT, name=name)
  File "/Developer/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 633, in apply_op
    op_def=op_def)
  File "/Developer/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1710, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Developer/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 988, in __init__
    self._traceback = _extract_stack()


In [13]:
# of particular interest is the shared computer encoding, which will also be needed in the loss function:
# Fetch the weights here explicitly: `wts` is otherwise only assigned in
# commented-out code, so on a fresh kernel this cell would raise NameError.
wts = model.get_weights()
# position 6 is the same index the training loop reads via model.get_weights()[6]
cmp_code = wts[6]

print(cmp_code.shape)

(10109, 256)


# Training the model

In [15]:
def make_onehot(x, n):
    """One-hot encode the leading entries of `x` into an (N, n) array.

    N is len(x) - unroll, where `unroll` is the module-level window length.
    Row i has a single 1 in column x[i].

    Every row from 0 to N-1 is filled. Starting the loop at 1 would leave
    row 0 all zeros, which is not a valid one-hot target and is inconsistent
    with `make_nextdst`, which fills rows starting at 0.

    Parameters
    ----------
    x : 1-D sequence of integer class ids, each in [0, n)
    n : number of classes (columns of the result)

    Returns
    -------
    numpy.ndarray of shape (len(x) - unroll, n)
    """
    # NOTE(review): row i is taken from x[i], i.e. aligned with the start of
    # window i; confirm whether x[i + unroll - 1] was intended.
    N = len(x)-unroll
    out = np.zeros([N,n])
    for i in range(N):
        out[i, x[i]] = 1
    return out

# targets come from the series shifted one flow ahead (rows 1..N of `train`)
shifted_rows = list(range(1, N+1))

def _onehot_for(column, size):
    """One-hot encode one column of the shifted `train` rows via make_onehot."""
    return make_onehot(train[shifted_rows, column], size)

proto_tgt = _onehot_for(6, nr_proto)
duration_tgt = _onehot_for(1, lg_duration)
packets_tgt = _onehot_for(7, lg_packets)
bytes_tgt = _onehot_for(8, lg_bytes)
src_pt_tgt = _onehot_for(3, nr_port)
dst_pt_tgt = _onehot_for(5, nr_port)

def make_nextdst(x, cmp_code):
    """Look up one row of `cmp_code` per leading entry of `x`.

    Returns an array of shape (len(x) - unroll, embed_dim) whose row i is
    cmp_code[x[i], :]. `unroll` and `embed_dim` are read from module scope.
    """
    n_rows = len(x) - unroll
    out = np.zeros([n_rows, embed_dim])
    row = 0
    while row < n_rows:
        box_id = x[row]
        out[row, :] = cmp_code[box_id, :]
        row += 1
    return out

# destination-computer target: embedding rows for column 4 of train rows 1..N
next_dst_ids = train[range(1,N+1), 4]
dst_tgt = make_nextdst(next_dst_ids, cmp_code)

# same order as the `output=` list given to Model(...)
target = [proto_tgt,
          duration_tgt,
          packets_tgt,
          bytes_tgt,
          src_pt_tgt,
          dst_pt_tgt,
          dst_tgt]

In [17]:
# model fitting -- before each single-epoch fit, rebuild the dst target from
# the current computer embedding matrix, so the target tracks the weights:

nr_epochs = 5
dst_target_slot = 6   # position of the next_dst target in `target`

for epoch in range(nr_epochs):
    current_weights = model.get_weights()
    cmp_code = current_weights[6]
    target[dst_target_slot] = make_nextdst(train[range(1,N+1), 4], cmp_code)
    model.fit(input, target, nb_epoch=1, verbose=1)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


# Model evaluation