In [1]:
import csv

infile = "flows_for_rnn_model_100000.csv"

# Parse every CSV row into a list of ints.
# The `with` block closes the file even when int() raises on a malformed field.
# csv.reader yields an empty list for a blank line; those rows are skipped so
# that every record keeps the same number of columns.
with open(infile, 'r') as fp:
    flowdata = [[int(x) for x in line] for line in csv.reader(fp) if line]

# Show the first ten records as a sanity check.
flowdata[:10]

[[1, 0, 9501, 138, 2205, 122, 1, 3, 12],
 [1, 0, 2244, 122, 8986, 122, 1, 2, 9],
 [1, 0, 2244, 122, 8986, 122, 1, 2, 9],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 8986, 122, 2244, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2244, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2408, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2408, 122, 1, 2, 8]]

Note that the schema (CSV column order, indices 0–8) is:

<ul>
<li> timestamp
<li> duration (log)
<li> source computer
<li> source port
<li> destination computer
<li> destination port
<li> protocol
<li> nr packets (log)
<li> nr bytes (log)
</ul>

In [2]:
# load box/port info from json:

import json
import re

# Read the box/port index tables; the context manager closes the file handle
# (the bare open() call previously left it to the garbage collector).
with open("boxport_data.json", 'r') as boxfile:
    boxdata = json.load(boxfile)

def jsonIntKeys(x):
    """Return a copy of dict `x` with keys cast to int and values cast to str.

    Anything that is not a dict is returned unchanged.
    """
    if not isinstance(x, dict):
        return x
    converted = {}
    for key, value in x.items():
        converted[int(key)] = str(value)
    return converted

def jsonIntVals(x):
    """Return a copy of dict `x` with keys cast to str and values cast to int.

    Anything that is not a dict is returned unchanged.
    """
    if not isinstance(x, dict):
        return x
    converted = {}
    for key, value in x.items():
        converted[str(key)] = int(value)
    return converted

# The '*_indices' tables become str -> int maps, the 'indices_*' tables int -> str maps.
box_index = jsonIntVals(boxdata['box_indices'])
index_box = jsonIntKeys(boxdata['indices_box'])
port_index = jsonIntVals(boxdata['port_indices'])
index_port = jsonIntKeys(boxdata['indices_port'])

# Set of box names with every run of double quotes / newlines removed.
bbox = {re.sub('[\"\n]+', "", name) for name in box_index}

In [3]:
# set input vector sizes:

# Category counts: each value is used below both as an Embedding input_dim and
# as the width of the matching softmax output (the protocol output hard-codes 4).
# NOTE(review): the four literals are presumably the number of distinct values in the
# corresponding CSV columns — confirm against the data, since an id >= the size
# declared here cannot be looked up in the embedding.
lg_duration = 7
lg_packets = 22
lg_bytes = 32
nr_proto = 4
# Sizes of the computer and port vocabularies, taken from the index -> name maps.
nr_box = len(index_box)   # 10109
nr_port = len(index_port) #   326

In [4]:
import sys
sys.setrecursionlimit(10000) # needed to prevent query-time bug...

from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.models import Model
from keras.engine.topology import merge
from keras.utils.visualize_util import plot

# hyperparameters:
nhidden = [512, 512]  # units of each stacked LSTM layer, in order
unroll = 32           # number of time steps in every input window
embed_dim = 256       # width of the shared computer embedding (and of the next_dst output)
dropout_W = 0.2 # input gates
dropout_U = 0.2 # recurrent connections

# netflow inputs: every input is a window of `unroll` integer ids, one per time step
src_input = Input(shape=(unroll,), dtype='int32', name='src_input')
dst_input = Input(shape=(unroll,), dtype='int32', name='dst_input')
src_pt_input = Input(shape=(unroll,), dtype='int32', name='src_pt')
dst_pt_input = Input(shape=(unroll,), dtype='int32', name='dst_pt')
proto_input = Input(shape=(unroll,), dtype='int32', name='proto')
duration_input = Input(shape=(unroll,), dtype='int32', name='duration')
packets_input = Input(shape=(unroll,), dtype='int32', name='packets')
bytes_input = Input(shape=(unroll,), dtype='int32', name='bytes')

# shared embedding for computer feeds (source, destination and the next-source
# query below all go through this one layer):
comp_encoding = Embedding(output_dim=embed_dim, input_dim=nr_box, input_length=unroll)
src = comp_encoding(src_input)
dst = comp_encoding(dst_input)

# other embeddings:
src_pt = Embedding(output_dim=128, input_dim=nr_port, input_length=unroll)(src_pt_input)
dst_pt = Embedding(output_dim=128, input_dim=nr_port, input_length=unroll)(dst_pt_input)
proto = Embedding(output_dim=2, input_dim=nr_proto, input_length=unroll)(proto_input)
duration = Embedding(output_dim=4, input_dim=lg_duration, input_length=unroll)(duration_input)
packets = Embedding(output_dim=8, input_dim=lg_packets, input_length=unroll)(packets_input)
# called bytes_emb (not `bytes`) so the builtin type is not shadowed for the rest of the notebook
bytes_emb = Embedding(output_dim=16, input_dim=lg_bytes, input_length=unroll)(bytes_input)

# merge:
data_merged = merge([src_pt, dst_pt, proto, duration, packets, bytes_emb], mode='concat')

# add src computer for next time-step, as a query stream to train on:
next_src = Input(shape=(unroll,), dtype='int32', name='next_src')
query = comp_encoding(next_src)

# pass data and query to RNN layers:
# every LSTM except the last returns the full sequence so the next one can consume it;
# the last returns only its final state, which feeds the output heads.
inner = merge([data_merged, src, dst, query], mode='concat')
for units in nhidden[:-1]:
    inner = LSTM(units, return_sequences=True, dropout_U=dropout_U, dropout_W=dropout_W)(inner)
inner = LSTM(nhidden[-1], return_sequences=False, dropout_U=dropout_U, dropout_W=dropout_W)(inner)
inner = Dropout(dropout_W)(inner)

# add softmax outputs (widths come from the same constants as the embeddings;
# the protocol head uses nr_proto instead of a second hard-coded 4):
proto_output = Dense(nr_proto, activation='softmax', name='proto_output')(inner)
duration_output = Dense(lg_duration, activation='softmax', name='duration_output')(inner)
packets_output = Dense(lg_packets, activation='softmax', name='packets_output')(inner)
bytes_output = Dense(lg_bytes, activation='softmax', name='bytes_output')(inner)
src_port_output = Dense(nr_port, activation='softmax', name='src_port_output')(inner)
dst_port_output = Dense(nr_port, activation='softmax', name='dst_port_output')(inner)

# add dst computer output: a vector of size embed_dim, trained with 'mse' below
next_dst = Dense(embed_dim, activation='relu', name='next_dst')(inner)

# put it all together:
model = Model(input=[src_input,
                     dst_input,
                     src_pt_input,
                     dst_pt_input,
                     proto_input,
                     duration_input,
                     packets_input,
                     bytes_input,
                     next_src],
              output=[proto_output,
                      duration_output,
                      packets_output,
                      bytes_output,
                      src_port_output,
                      dst_port_output,
                      next_dst])
# losses and loss weights are listed in the same order as `output` above:
# six softmax heads use cross-entropy, the next_dst vector uses mean squared error.
model.compile(optimizer='rmsprop',
              loss=['categorical_crossentropy'] * 6 + ['mse'],
              loss_weights=[0.5, 1., 1., 1., 2., 2., 4.])

# ...and summarise:
model.summary()
plot(model, to_file='model.png')

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
bytes (InputLayer)                 (None, 32)          0                                            
____________________________________________________________________________________________________
dst_pt (InputLayer)                (None, 32)          0                                            
____________________________________________________________________________________________________
duration (InputLayer)              (None, 32)          0                                            
____________________________________________________________________________________________________
packets (InputLayer)               (None, 32)          0                                            
___________________________________________________________________________________________

Using Theano backend.


# Testing the architecture

In [14]:
import numpy as np

# All flow records as one array; N is the index of the last record, so
# rows 0..N-1 serve as inputs and rows 1..N as their one-step-shifted counterparts.
train = np.array(flowdata)
N = len(flowdata) - 1

def make_input_vectors(z, mx, unroll=unroll):
    """Cut a 1-D sequence into overlapping windows of `unroll` consecutive values.

    Row i of the result holds z[i : i + unroll]. The result has
    len(z) - unroll rows and is stored as int32.
    `mx` is accepted but not used by this function.
    """
    n_windows = len(z) - unroll
    windows = np.zeros((n_windows, unroll), dtype='int32')
    # Iterating a 2-D array yields row views, so writing into `row` fills `windows`.
    for start, row in enumerate(windows):
        row[:] = z[start: start + unroll]
    return windows

# One window matrix per feature column, built from flows 0..N-1.
src_in = make_input_vectors(train[:N, 2], nr_box)
dst_in = make_input_vectors(train[:N, 4], nr_box)
src_pt_in = make_input_vectors(train[:N, 3], nr_port)
dst_pt_in = make_input_vectors(train[:N, 5], nr_port)
proto_in = make_input_vectors(train[:N, 6], nr_proto)
duration_in = make_input_vectors(train[:N, 1], lg_duration)
packets_in = make_input_vectors(train[:N, 7], lg_packets)
bytes_in = make_input_vectors(train[:N, 8], lg_bytes)
# The query stream is the source-computer column shifted forward by one flow (rows 1..N).
next_src_in = make_input_vectors(train[1:N+1, 2], nr_box)

# Same order as the `input=[...]` list given to Model(...).
# NOTE: `input` shadows the builtin of the same name; the name is kept because
# later cells refer to it.
input = [
    src_in,
    dst_in,
    src_pt_in,
    dst_pt_in,
    proto_in,
    duration_in,
    packets_in,
    bytes_in,
    next_src_in,
]

In [19]:
# theano:

# Run the network on the first 100 windows of every input stream.
test = [stream[:100] for stream in input]
output = model.predict(test)
wts = model.get_weights()

# Shape of each prediction array, then rows 1..9 of the first output (proto_output).
print([pred.shape for pred in output])
print(output[0][1:10])

[(100, 4), (100, 7), (100, 22), (100, 32), (100, 326), (100, 326), (100, 256)]
[[  1.95558313e-02   9.03169990e-01   7.71048889e-02   1.69265884e-04]
 [  1.96206402e-02   9.03165579e-01   7.70446807e-02   1.69080493e-04]
 [  1.96515322e-02   9.03333843e-01   7.68462494e-02   1.68348633e-04]
 [  1.96394380e-02   9.03406620e-01   7.67855272e-02   1.68427214e-04]
 [  1.96213331e-02   9.03596759e-01   7.66136646e-02   1.68210478e-04]
 [  1.96027644e-02   9.03900743e-01   7.63285533e-02   1.67958424e-04]
 [  1.95593331e-02   9.04235005e-01   7.60390535e-02   1.66587284e-04]
 [  1.96104385e-02   9.04052198e-01   7.61699080e-02   1.67455364e-04]
 [  1.95637364e-02   9.04038429e-01   7.62307942e-02   1.67035265e-04]]


In [23]:
# tensorflow:

# Call the model object directly on the input list instead of using predict().
# This rebinds `output`; the value displayed below is a list of seven
# tensorflow Tensor objects rather than numeric prediction arrays.
output = model(input)
output

[<tensorflow.python.framework.ops.Tensor at 0x12b2e2490>,
 <tensorflow.python.framework.ops.Tensor at 0x12b2d0910>,
 <tensorflow.python.framework.ops.Tensor at 0x12b2e2190>,
 <tensorflow.python.framework.ops.Tensor at 0x12b2c11d0>,
 <tensorflow.python.framework.ops.Tensor at 0x12b2e2ad0>,
 <tensorflow.python.framework.ops.Tensor at 0x12b2c1350>,
 <tensorflow.python.framework.ops.Tensor at 0x12b2d0850>]

In [20]:
# of particular interest is the shared computer encoding, which will also be needed in the loss function:
wts = model.get_weights()
# Print the shape of every weight array, in the order get_weights() returns them.
for w in wts:
    print(w.shape)

(326, 128)
(326, 128)
(4, 2)
(7, 4)
(22, 8)
(32, 16)
(10109, 256)
(1054, 512)
(512, 512)
(512,)
(1054, 512)
(512, 512)
(512,)
(1054, 512)
(512, 512)
(512,)
(1054, 512)
(512, 512)
(512,)
(512, 512)
(512, 512)
(512,)
(512, 512)
(512, 512)
(512,)
(512, 512)
(512, 512)
(512,)
(512, 512)
(512, 512)
(512,)
(512, 32)
(32,)
(512, 326)
(326,)
(512, 7)
(7,)
(512, 256)
(256,)
(512, 22)
(22,)
(512, 4)
(4,)
(512, 326)
(326,)


In [21]:
# Read the shared computer-embedding matrix straight from its layer.
# The previous `wts[6]` relied on the matrix sitting at position 6 of
# model.get_weights() and on `wts` still being current.
cmp_code = comp_encoding.get_weights()[0]
print(cmp_code.shape)

(10109, 256)


# Training the model

In [22]:
def make_onehot(x, n):
    """One-hot encode the first len(x) - unroll entries of `x`.

    Parameters:
        x: sequence of integer class ids.
        n: number of classes (width of each one-hot row).

    Returns:
        Float array of shape (len(x) - unroll, n) where row i has a single 1
        in column x[i].

    Every row is encoded, including row 0. The loop used to start at 1, which
    left the first target as an all-zero vector.

    NOTE(review): row i is taken from x[i]. With windows built by
    make_input_vectors and an LSTM that only returns its final state, the
    element following window i looks like x[i + unroll - 1] — confirm the
    intended alignment.
    """
    N = len(x) - unroll
    out = np.zeros([N, n])
    for i in range(N):
        out[i, x[i]] = 1
    return out

# One-hot targets for the six categorical outputs, read from flows 1..N
# (the same one-flow shift used for the next-source query stream).
proto_tgt = make_onehot(train[1:N+1, 6], nr_proto)
duration_tgt = make_onehot(train[1:N+1, 1], lg_duration)
packets_tgt = make_onehot(train[1:N+1, 7], lg_packets)
bytes_tgt = make_onehot(train[1:N+1, 8], lg_bytes)
src_pt_tgt = make_onehot(train[1:N+1, 3], nr_port)
dst_pt_tgt = make_onehot(train[1:N+1, 5], nr_port)

def make_nextdst(x, cmp_code):
    """Look up the embedding row of each of the first len(x) - unroll ids in `x`.

    Returns an array of shape (len(x) - unroll, embed_dim) whose row i is
    cmp_code[x[i], :].

    NOTE(review): row i is taken from x[i]; whether the target for window i
    should instead be x[i + unroll - 1] needs confirming.
    """
    n_rows = len(x) - unroll
    targets = np.zeros([n_rows, embed_dim])
    # zip stops after n_rows rows, so only the first n_rows ids of x are used.
    for row, box_id in zip(targets, x):
        row[:] = cmp_code[box_id, :]
    return targets

# Destination-computer target: embedding rows for the destination column of flows 1..N.
dst_tgt = make_nextdst(train[1:N+1, 4], cmp_code)

# Same order as the `output=[...]` list given to Model(...).
target = [
    proto_tgt,
    duration_tgt,
    packets_tgt,
    bytes_tgt,
    src_pt_tgt,
    dst_pt_tgt,
    dst_tgt,
]

In [23]:
# model fitting -- after each epoch, reset the target dst using the updated computer embedding matrix:
# for 10^5 records, takes 50 mins/epoch

nr_epochs = 5

for epoch in range(nr_epochs):
    # Fetch the current computer embedding from its own layer rather than from
    # position 6 of model.get_weights().
    cmp_code = comp_encoding.get_weights()[0]
    # The destination target is the last entry of `target`; rebuild it from the
    # updated embedding before each single-epoch fit.
    # NOTE(review): this regression target comes from weights the same fit call is
    # training — confirm that nothing lets both drift toward a trivial solution.
    target[-1] = make_nextdst(train[range(1,N+1), 4], cmp_code)
    model.fit(input, target, nb_epoch=1, verbose=1)

Epoch 1/1
 1600/99967 [..............................] - ETA: 3007s - loss: 11.0297 - proto_output_loss: 0.5988 - duration_output_loss: 1.3936 - packets_output_loss: 1.6543 - bytes_output_loss: 2.2250 - src_port_output_loss: 1.2881 - dst_port_output_loss: 1.4357 - next_dst_loss: 0.0024

KeyboardInterrupt: 