# Assignment 07
## Robert Knox

## Data Processing

In [15]:
import sys
import os
import json
import pandas
import numpy
import optparse

from keras.callbacks import TensorBoard
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict

In [16]:
dataframe = pandas.read_csv("dev-access.csv", engine='python', quotechar='|', header=None)

In [17]:
dataset = dataframe.values

In [18]:
dataset.shape

(26773, 2)

In [19]:
dataframe.describe()

Unnamed: 0,1
count,26773.0
mean,0.49901
std,0.500008
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [20]:
print(dataframe[0][10])

{"timestamp":1502738402858,"method":"post","query":{},"path":"/login","statusCode":401,"source":{"remoteAddress":"135.83.221.190"},"route":"/login","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6","content-type":"application/json","content-length":"45"},"requestPayload":{"username":"33Michele","password":"lelgoec"},"responsePayload":{"statusCode":401,"error":"Unauthorized","message":"Invalid Login"}}


In [21]:
X = dataset[:,0]

In [22]:
Y = dataset[:,1]

In [23]:
for index, item in enumerate(X):
    # Quick hack to space out json elements
    reqJson = json.loads(item, object_pairs_hook=OrderedDict)
    del reqJson['timestamp']
    del reqJson['headers']
    del reqJson['source']
    del reqJson['route']
    del reqJson['responsePayload']
    X[index] = json.dumps(reqJson, separators=(',', ':'))

In [24]:
print(X[0])
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X)

# we will need this later
num_words = len(tokenizer.word_index)+1
X = tokenizer.texts_to_sequences(X)

{"method":"post","query":{},"path":"/login","statusCode":401,"requestPayload":{"username":"Carl2","password":"bo"}}


X is now broken down into the relevant portions of the log which is then tokenized to be the char value for each character.

In [11]:
max_log_length = 1024
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

In [26]:
X[0]

[18,
 1,
 20,
 2,
 3,
 14,
 7,
 11,
 1,
 4,
 1,
 13,
 7,
 6,
 3,
 1,
 10,
 1,
 16,
 8,
 2,
 9,
 15,
 1,
 4,
 18,
 19,
 10,
 1,
 13,
 5,
 3,
 14,
 1,
 4,
 1,
 25,
 12,
 7,
 26,
 24,
 21,
 1,
 10,
 1,
 6,
 3,
 5,
 3,
 8,
 6,
 17,
 7,
 11,
 2,
 1,
 4,
 23,
 22,
 29,
 10,
 1,
 9,
 2,
 16,
 8,
 2,
 6,
 3,
 13,
 5,
 15,
 12,
 7,
 5,
 11,
 1,
 4,
 18,
 1,
 8,
 6,
 2,
 9,
 21,
 5,
 20,
 2,
 1,
 4,
 1,
 17,
 5,
 9,
 12,
 28,
 1,
 10,
 1,
 13,
 5,
 6,
 6,
 32,
 7,
 9,
 11,
 1,
 4,
 1,
 40,
 7,
 1,
 19,
 19]

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_processed, Y, test_size=0.25, random_state=42)

## Model 1 - RNN

embedding layer, LSTM layer, and Dense layer

In [30]:
m1 = Sequential()
m1.add(Embedding(input_dim=num_words, output_dim = 32, input_length  = max_log_length))
m1.add(LSTM(units = 64, recurrent_dropout=0.5))
m1.add(Dense(units = 1, activation = 'relu'))

m1.compile(loss = 'binary_crossentropy',
          optimizer = 'adam',
          metrics = ['accuracy'])

m1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________


In [31]:
#Use a validation split of 0.25, epochs=3 and batch size = 128
m1.fit(X_train,y_train,batch_size=128,epochs = 3, validation_split=0.25)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x23db0e409b0>

In [32]:
m1.evaluate(X_test,y_test)



[0.44348750613609783, 0.5751419181356439]

In [96]:
yhat = m1.predict(X_test)

In [103]:
yhat = m1.predict(X_test)
yhat = [0 if i==0 else 1 for i in yhat]

In [104]:
yhat = numpy.array(yhat)

In [105]:
y_test_int = y_test.astype(int)
y_test_int

array([0, 0, 1, ..., 0, 0, 1])

In [106]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

cr = classification_report(y_test_int,yhat)
print(cr)

              precision    recall  f1-score   support

           0       1.00      0.03      0.05      3423
           1       0.50      1.00      0.66      3271

   micro avg       0.50      0.50      0.50      6694
   macro avg       0.75      0.51      0.36      6694
weighted avg       0.75      0.50      0.35      6694



## Model 2 

In [33]:
m2 = Sequential()
m2.add(Embedding(input_dim=num_words, output_dim = 32, input_length  = max_log_length))
m2.add(Dropout(rate = 0.5))                 
m2.add(LSTM(units = 64, recurrent_dropout=0.5))
m2.add(Dropout(rate = 0.5))
m2.add(Dense(units = 1, activation = 'sigmoid'))

m2.compile(loss = 'binary_crossentropy',
          optimizer = 'adam',
          metrics = ['accuracy'])

m2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024, 32)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________


In [34]:
#Use a validation split of 0.25, epochs=3 and batch size = 128
m2.fit(X_train,y_train,batch_size=128,epochs = 3, validation_split=0.25)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x23dea280e48>

In [35]:
m2.evaluate(X_test,y_test)



[0.16965968555052635, 0.9530923214819241]

In [87]:
yhat2 = m2.predict(X_test)
yhat2 = [1 if i>=0.5 else 0 for i in yhat2]
yhat2 = numpy.array(yhat2)

In [88]:
cr2 = classification_report(y_test_int,yhat2)
print(cr2)

              precision    recall  f1-score   support

           0       0.92      0.99      0.96      3423
           1       0.99      0.91      0.95      3271

   micro avg       0.95      0.95      0.95      6694
   macro avg       0.96      0.95      0.95      6694
weighted avg       0.96      0.95      0.95      6694



## Model 3

In [91]:
m3 = Sequential()
m3.add(Embedding(input_dim=num_words, output_dim = 32, input_length  = max_log_length))
m3.add(Dropout(rate = 0.5))                 
m3.add(LSTM(units = 64, recurrent_dropout=0.5))
m3.add(Dropout(rate = 0.5))
m3.add(Dense(units = 128, activation = 'relu'))
m3.add(Dropout(rate = 0.5))
m3.add(Dense(units = 1, activation = 'sigmoid'))

m3.compile(loss = 'binary_crossentropy',
          optimizer = 'Adadelta',
          metrics = ['accuracy'])

m3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
dropout_12 (Dropout)         (None, 1024, 32)          0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 64)                24832     
_________________________________________________________________
dropout_13 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_14 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 129       
Total para

In [92]:
m3.fit(X_train,y_train,batch_size=128,epochs = 3, validation_split=0.25)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x23dbab5cda0>

In [93]:
m3.evaluate(X_test,y_test)



[0.32928738306984107, 0.9141021810576636]

In [94]:
yhat3 = m3.predict(X_test)
yhat3 = [1 if i>=0.5 else 0 for i in yhat3]
yhat3 = numpy.array(yhat3)

In [95]:
cr3 = classification_report(y_test_int,yhat3)
print(cr3)

              precision    recall  f1-score   support

           0       0.86      0.99      0.92      3423
           1       0.99      0.84      0.90      3271

   micro avg       0.91      0.91      0.91      6694
   macro avg       0.92      0.91      0.91      6694
weighted avg       0.92      0.91      0.91      6694



###  Explain the difference between the relu activation function and the sigmoid activation function.

Activation functions have several properties that must be considered for their use in a neural network. These include:

* Differentiablity - If the activation function is differentiable for all real numbers it simplifies the process of back propagation

* Computational Efficiency - The activation function must be calculated for each neuron in each epoch so it is desirable to  have a computationallly efficient function for the sake of expediency.

* Symmetry around the origin - Functions that are symmetric around zero tend to perform better as the neuron has a less limited output.

* Vanishing & Exploding Gradients - As more derivatives are calculated the value of gradient tends to either diminish to zero producing a result that does not get close to a local minimum of the cost function or the gradient may explode and move away from the local minimum.

The relu activation function is non-differentiable at zero and non-symmetric around the origin but is preferred in deep neural networks because it is computationally efficient & helps prevent vanishing gradients.

The sigmoid activation function is differentiable for all $\mathbb{R}$ and is easily differentiable. However, since it only produces results between (0,1) it reduces the input space drastically. This produces a vanishing gradient making it unsuitable for deep neural networks.

### In regards to question 5, which of these activation functions performed the best (they were used in Model 1 & Model 2) ? Why do you think that is?

The sigmoid activation function performed better than relu in this assignment. I believe it worked better as the final goal of the model was to predict a 0 or 1 for each record. The sigmoid function is ideally suited for this as its output range is (0,1) whereas relu produces outputs (0,$\infty$).

### Explain how dropout works (you can look at the keras code) for (a) training, and (b) test data sets.

In training, the dropout creates a noise shape that is a binary mask the input tensor. The probability that a neuron will be masked is based on the rate. This mask prevents the neuron from being considered for the feed forward calculation as well as preventing updates in the backpropagation step.

In test, there is no dropout of neurons; the dropout just affects the neurons in training to make the model have a more robust generalization error. In testing we simply let the network make its decision based on the training.

### Explain why problems such as this are better modeled with RNNs than CNNs.

RNNs allow us to retain memory from previous states in the data are not IID. A CNN would let us take a highly dimensional feature space and reduce it down to smaller elements but the assumption is that the data are IID.

### Explain what RNN problem is solved using LSTM and briefly describe how.

RNNs suffer from problems when the relevant information is not from a state that directly precedes the current state. LSTM solves this issue by retaining memory across all states to account for long term dependencies.