# Load indexes and normalisations

In [1]:
# where the data lives:
# This prefix is concatenated with a file name ("flows.csv") further down and
# handed to sc.textFile.
# NOTE(review): the leading "~" is a literal character here; nothing in this
# notebook expands it to the home directory — confirm the reader resolves it.

path = "~/data/LANL/"

In [19]:
# load box/port info from json:

import json
import re

# Parse the whole JSON document; the `with` block closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection.
with open("boxport_data.json", 'r') as boxport_file:
    data = json.load(boxport_file)

def jsonIntKeys(x):
    """Return a new dict built from ``x`` with int keys and str values.

    Any argument that is not a dict is handed back unchanged.
    """
    if not isinstance(x, dict):
        return x
    return dict((int(key), str(value)) for key, value in x.items())

def jsonIntVals(x):
    """Return a new dict built from ``x`` with str keys and int values.

    Any argument that is not a dict is handed back unchanged.
    """
    if not isinstance(x, dict):
        return x
    return dict((str(key), int(value)) for key, value in x.items())

# Name -> index tables get int values; index -> name tables get int keys
# (JSON object keys always arrive as strings, so they are converted here).
box_index = jsonIntVals( data['box_indices'] )
index_box = jsonIntKeys( data['indices_box'] )
port_index = jsonIntVals( data['port_indices'] )
index_port = jsonIntKeys( data['indices_port'] )

# Set of computer names taken from the keys of `box_index`, with any double
# quotes or newlines stripped. It must read `box_index` (the name defined
# above): no `box_indices` variable exists after a fresh kernel start.
bbox = {re.sub('[\"\n]+', "", s) for s in box_index}

In [20]:
# define netflow RDD - filter to boxes in 'bbox' (busy computers):

# Full location of the netflow CSV, then an RDD holding one list of string
# fields per input line.
flowfile = path + "flows.csv"
flows = sc.textFile(flowfile).map(
    lambda row: list(map(str, row.split(',')))
)

def filt(x):
    """Return the first nine fields of ``x`` with positions 0, 1, 6, 7 and 8 cast to int.

    Positions 2-5 are passed through untouched. Fields are read left to right,
    so a short record raises IndexError and a non-numeric value raises
    ValueError at the first offending position.
    """
    int_positions = (0, 1, 6, 7, 8)
    return [int(x[pos]) if pos in int_positions else x[pos] for pos in range(9)]

def _both_endpoints_in_bbox(rec):
    """True when fields 2 and 4 of the raw record are both members of `bbox`."""
    return rec[2] in bbox and rec[4] in bbox

# Keep only flows between two computers in `bbox`, then type-cast the fields.
subflows = flows.filter(_both_endpoints_in_bbox).map(filt)

In [23]:
# define normalisation mapping to be readable in Keras:

from math import log, floor

# Protocol codes seen in the flow records and their dense 0-based positions.
# NOTE(review): presumably IP protocol numbers — confirm against the data set.
protocols = [1, 6, 17, 41]
proto_index = {code: pos for pos, code in enumerate(protocols)}  # code -> position
index_proto = {pos: code for pos, code in enumerate(protocols)}  # position -> code

def logbin(x):
    """Return floor(log base 2 of ``x``) as an int.

    ``x`` must be positive; ``math.log`` raises ValueError otherwise.
    """
    exponent = log(x, 2)
    return int(floor(exponent))

def normal_port(x):
    """Collapse every port label that starts with 'N' to the single token 'N'.

    Labels that do not start with 'N' are returned unchanged.
    """
    return 'N' if re.match('^N', x) is not None else x

def normalise(x):
    """Map one type-cast flow record (the output of `filt`) to a list of nine ints.

    Output positions:
      0  x[0] unchanged
      1  logbin(x[1] + 1)   (the +1 keeps a zero value inside log's domain)
      2  box_index of x[2]
      3  port_index of x[3] after `normal_port`
      4  box_index of x[4]
      5  port_index of x[5] after `normal_port`
      6  proto_index of x[6]
      7  logbin(x[7])
      8  logbin(x[8])

    Raises KeyError when a computer, port or protocol is missing from its
    lookup table, and ValueError when x[7] or x[8] is not positive.

    The lookups use `box_index` / `port_index`, the names the loading cell
    actually defines; `box_indices` / `port_indices` do not exist on a fresh
    kernel.
    """
    return [x[0],
            logbin(x[1]+1),
            box_index[x[2]],
            port_index[normal_port(x[3])],
            box_index[x[4]],
            port_index[normal_port(x[5])],
            proto_index[x[6]],
            logbin(x[7]),
            logbin(x[8])]

# check: normalise the filtered flows and display the first 10 results
subflows.map(normalise).take(10)

[[1, 0, 9501, 138, 2205, 122, 1, 3, 12],
 [1, 0, 2244, 122, 8986, 122, 1, 2, 9],
 [1, 0, 2244, 122, 8986, 122, 1, 2, 9],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 5354, 122, 9484, 0, 1, 0, 5],
 [1, 0, 8986, 122, 2244, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2244, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2408, 122, 1, 2, 8],
 [1, 0, 8986, 122, 2408, 122, 1, 2, 8]]

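Note that the schema of each normalised record (the output of `normalise`) is:

<ul>
<li> timestamp</li>
<li> duration (log)</li>
<li> source computer (index)</li>
<li> source port (index)</li>
<li> destination computer (index)</li>
<li> destination port (index)</li>
<li> protocol (index)</li>
<li> number of packets (log)</li>
<li> number of bytes (log)</li>
</ul>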

In [10]:
# set input vector sizes:
# NOTE(review): the lg_* values look like the number of log2 bins allowed for
# each logged field — confirm against the observed ranges.

lg_duration = 7
lg_packets = 22
lg_bytes = 32
nr_proto = 4
# Sizes come from the index -> name tables built when the JSON was loaded.
# They are called `index_box` / `index_port` there; `indices_box` /
# `indices_port` are undefined on a fresh kernel.
nr_box = len(index_box)   # 10109
nr_port = len(index_port) #   326

# RNN architecture

In [15]:
from keras.layers import Input, Embedding, LSTM, Dense, merge
from keras.models import Model

# Two-input, two-output network built with the Keras functional API.
# NOTE(review): every size below (100, 10000, 512, 32, 5, 64) is a literal and
# none of the sizes computed earlier in the notebook is used — confirm whether
# this is still a placeholder architecture.

# integer sequence input of length 100, embedded into 512-d vectors and
# summarised by an LSTM with 32 units
main_input = Input(shape=(100,), dtype='int32', name='main_input')
x = Embedding(output_dim=512, input_dim=10000, input_length=100)(main_input)
lstm_out = LSTM(32)(x)
# auxiliary sigmoid output attached directly to the LSTM summary
auxiliary_loss = Dense(1, activation='sigmoid', name='aux_output')(lstm_out)

# second input of width 5, concatenated with the LSTM summary
auxiliary_input = Input(shape=(5,), name='aux_input')
x = merge([lstm_out, auxiliary_input], mode='concat')

# we stack a deep fully-connected network on top
x = Dense(64, activation='relu')(x)

# and finally we add the main logistic regression layer
main_loss = Dense(1, activation='sigmoid', name='main_output')(x)
model = Model(input=[main_input, auxiliary_input], output=[main_loss, auxiliary_loss])
# both outputs use binary cross-entropy; the auxiliary loss is weighted 0.2
# against 1.0 for the main loss
model.compile(optimizer='rmsprop', loss='binary_crossentropy',
              loss_weights=[1., 0.2])
model.summary()

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
main_input (InputLayer)            (None, 100)         0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)            (None, 100, 512)    5120000     main_input[0][0]                 
____________________________________________________________________________________________________
aux_input (InputLayer)             (None, 5)           0                                            
____________________________________________________________________________________________________
lstm_2 (LSTM)                      (None, 32)          69760       embedding_2[0][0]                
___________________________________________________________________________________________

In [16]:
from keras.utils.visualize_util import plot

# draw the model graph; the target file name is passed via `to_file`
plot(model, to_file='model.png')

In [14]:
# (interactive `?plot` help lookup removed: it is IPython-only syntax, was run
# before the cell that imports `plot`, and adds nothing to a saved notebook)