# DL Model for Binary Classification
---
- tensorboard include {scalars, histogram and embedding}

## Reference
- [basic tensorboard usage](https://github.com/mamcgrath/TensorBoard-TF-Dev-Summit-Tutorial/blob/master/mnist.py)
- [Great example with multiple embedding](https://github.com/tensorflow/tensorflow/issues/6322)
        

In [None]:
import numpy as np 
from sklearn.model_selection import train_test_split  

import tensorflow as tf
import matplotlib.pyplot as plt
import os
%matplotlib inline
print(tf.__version__)

# feature extract 
---
- Input feature is the unigram model extrract by ATSE

In [1]:

train_malicious = np.genfromtxt("./JSF20170324/malicious.vlog", delimiter=',')
train_normal    = np.genfromtxt("./JSF20170324/normal.vlog"   , delimiter=',')

X = np.concatenate((train_malicious, train_normal))
y = np.array([1]*len(train_malicious) + [0]*len(train_normal))
del(train_malicious)
del(train_normal)
y = np.reshape(y, [len(y), 1])
X = np.delete(X, np.s_[:1], axis=1)

print(X.shape)
print(y.shape)

(367028, 772)
(367028, 1)


In [2]:
## Split Part of training data as validation set 
## Size of data is big enought, no need to split k-fold for cross validation
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.05, random_state=831
)
print(X_train.shape)
print(X_valid.shape)
print(y_train.shape)
print(y_valid.shape)

(348676, 772)
(18352, 772)
(348676, 1)
(18352, 1)


# Data preprocessing
---
- here we just log done the preprocessing

In [4]:
X_train = np.log(X_train+1)
X_valid = np.log(X_valid+1)

In [6]:
## Dimension Setting, (if multi classification, remember to choose loss with softmax)
input_dim = 772
output_dim = 1
N = len(X_train)
batch_size = 500

## Get Mini Batch
print("Number of batch : ", N//batch_size)
print("Total traning data: ", N)
print("Batch size :", batch_size)

def get_minibatches_index(n, batch_size, shuffle=False):
    """
    Used to shuffle the dataset at each iteration.
    """

    idx_list = np.arange(n, dtype="int32")

    if shuffle:
        np.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for i in range(n // batch_size):
        minibatches.append(idx_list[minibatch_start:
                                        minibatch_start + batch_size])
        minibatch_start += batch_size

    if (minibatch_start != n):
        # Make a minibatch out of what is left
        minibatches.append(idx_list[minibatch_start:])

    return minibatches

minibatches = get_minibatches_index(len(X_train), batch_size, shuffle=True)

Number of batch :  697
Total traning data:  348676
Batch size : 500


# Model and Solver class
---
- if use the momentum as optimizer, just set the `momentum` parameter
- dense = activcation(inputs.kernel+bias), kernel = w*x
- [softmax and with cross entropy](http://stackoverflow.com/questions/34240703/difference-between-tensorflow-tf-nn-softmax-and-tf-nn-softmax-cross-entropy-with)
                    

In [41]:
class Model:
    def __init__(self, name, input_dim, output_dim, hidden_dims=[32, 32], use_batchnorm=True, \
                 activation_fn=tf.nn.relu, optimizer=tf.train.AdamOptimizer, lr=0.01, momentum=None):
        """ Constructor"""
        self.hidden_dims    = hidden_dims
        self.name           = name
        
        with tf.variable_scope(name):
            with tf.name_scope('input'):
                self.X = tf.placeholder(tf.float32, [None, input_dim], name='x')
                self.y = tf.placeholder(tf.float32, [None, output_dim], name='labels')
                self.mode = tf.placeholder(tf.bool, name='train_mode')            
            
            # Loop over hidden layers
            net = self.X
            for i, h_dim in enumerate(hidden_dims):
                with tf.variable_scope('layer{}'.format(i)):
                    net = tf.layers.dense(net, h_dim, name="fc{}".format(i))
                    if use_batchnorm:
                        net = tf.layers.batch_normalization(net, training=self.mode)
                    net = activation_fn(net, name="afc{}".format(i))
                    
            net = tf.contrib.layers.flatten(net)
            net = tf.layers.dense(net, output_dim, name='last_fc')
            tf.summary.histogram('last_fc',net)
            
            with tf.name_scope('loss'):
                # (for multiclass) self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=net, labels=self.y)
                self.loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=net, labels=self.y)
                self.loss = tf.reduce_mean(self.loss, name='loss')    

            with tf.name_scope('train'):
                # When using the batchnormalization layers,
                # it is necessary to manually add the update operations
                # because the moving averages are not included in the graph            
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=name)
                with tf.control_dependencies(update_ops):
                    if momentum is not None:
                        self.train_op = tf.train.MomentumOptimizer(lr,momentum).minimize(self.loss)
                    else:
                        self.train_op = optimizer(lr).minimize(self.loss)
            
            with tf.name_scope('accuracy'):
                self.predicted = tf.cast(net > 0.5, dtype=tf.float32)
                self.accuracy = tf.reduce_mean(tf.cast(tf.equal(self.predicted, self.y), dtype=tf.float32))
            
            with tf.name_scope('roc'):
                predi_bool = tf.cast(self.predicted, tf.bool)
                label_bool = tf.cast(self.y, tf.bool)
                # for TP
                TP_and = tf.logical_and(predi_bool, label_bool)
                # for TN
                tmp_or = tf.logical_or(predi_bool, label_bool)
                TN_not = tf.logical_not(tmp_or)
                # for FP
                label_not = tf.logical_not(label_bool)
                FP_and = tf.logical_and(predi_bool, label_not)
                # for FN
                predi_not = tf.logical_not(predi_bool)
                FN_and = tf.logical_and(predi_not, label_bool)
                # Add node
                self.TP = tf.reduce_sum(tf.cast(TP_and, tf.float32))
                self.TN = tf.reduce_sum(tf.cast(TN_not, tf.float32))
                self.FP = tf.reduce_sum(tf.cast(FP_and, tf.float32))
                self.FN = tf.reduce_sum(tf.cast(FN_and, tf.float32))
            
                self.precision = self.TP / (self.TP + self.FP)
                self.recall    = self.TP / (self.TP + self.FN)
            
            # Create Summary
            for i, h_dim in enumerate(hidden_dims):
                net = tf.get_default_graph().get_tensor_by_name(self.name+'/layer'+str(i)+'/fc'+str(i)+'/MatMul:0')
                tf.summary.histogram('W_layer{}'.format(i), net)
            
            tf.summary.scalar("cosr", self.loss)
            tf.summary.scalar("accuracy", self.accuracy)
            tf.summary.scalar("precision", self.precision)
            tf.summary.scalar("recall", self.recall)
           
            self.merged = tf.summary.merge_all()
            
            # construct Saver
            self.saver = tf.train.Saver()
            


In [42]:
class Solver:
    def __init__(self, sess, model, log_dir):
        self.model = model
        self.sess = sess
        self.epoch = 0
        self.iteration = 0
        self.log_dir = log_dir
        
        train_path = os.path.join(self.log_dir, self.model.name+'_train')
        self.train_writer = tf.summary.FileWriter(train_path, sess.graph)
        
        valid_path = os.path.join(self.log_dir, self.model.name+'_valid')
        self.valid_writer  = tf.summary.FileWriter(valid_path, sess.graph)
        
    def save(self, epoch):
        self.model.saver.save(self.sess, os.path.join(self.log_dir, "model["+self.model.name+"].ckpt"), epoch)
        
    def compute_embedding(self, X, y,name):
        ''' name could be train, validation or test ... '''
        log_dir = self.log_dir
        feed = {
            self.model.X: X,
            self.model.y: y,
            self.model.mode: True
        }
        number_of_embed_watch = len(X)
        accuracy = self.model.accuracy
        output_path = os.path.join(log_dir,self.model.name+'_embed', name)
        embed_tensors = []
        embed_writer = tf.summary.FileWriter(output_path, sess.graph)
        config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
        hidden_dims = self.model.hidden_dims
        hidden_outputs = list()

        for i in range(len(hidden_dims)):
            hidden_outputs.append(tf.get_default_graph().get_tensor_by_name(self.model.name+'/layer'+str(i)+'/fc'+str(i)+'/MatMul:0'))
        hidden_outputs.append(accuracy)
        hidden_vectors = sess.run(hidden_outputs, feed_dict=feed)
        for layer, embed_vectors in enumerate(hidden_vectors[:-1]):
            embed_tensor = tf.Variable(tf.zeros([number_of_embed_watch, hidden_dims[layer]]), 
                                       name="%s_layer_%s" % (name, layer))
            embed_tensor_assign = embed_tensor.assign(embed_vectors)
            embed_tensors.append(embed_tensor)
            sess.run(embed_tensor_assign)
            embedding = config.embeddings.add()
            embedding.tensor_name = embed_tensor.name
            embedding.metadata_path = os.path.join(output_path, self.model.name+'_labels.tsv')
            tf.contrib.tensorboard.plugins.projector.visualize_embeddings(embed_writer, config)
        result = sess.run(embed_tensors)
        saver  = tf.train.Saver(embed_tensors)
        # this must be named `model.ckpt`
        saver.save(sess, os.path.join(output_path, 'model.ckpt'), layer)
        
        # Make Label 
        class_literal=["NORMAL", "MALICIOUS"]
        with open(os.path.join(output_path, self.model.name+'_labels.tsv'), 'w') as metadata_file:
            metadata_file .write('Name\tClass\n')
            for l in range(len(y)):
                metadata_file.write('%d\t%s\n' % (y[l], class_literal[int(y[l])]))
                
                
    def train(self, X, y):
        self.iteration += 1
        merged = self.model.merged
        loss = self.model.loss
        accuracy = self.model.accuracy
        TP = self.model.TP
        TN = self.model.TN
        FP = self.model.FP
        FN = self.model.FN
        feed = {
            self.model.X: X,
            self.model.y: y,
            self.model.mode: True
        }
        train_op = self.model.train_op
        loss = self.model.loss
        if self.iteration % 5 == 0:
            summary, itr_loss, itr_acc, itr_tp, itr_tn, itr_fp, ite_fn = \
                self.sess.run([merged, loss, accuracy, TP, TN, FP, FN], feed_dict=feed)
            self.train_writer.add_summary(summary, self.iteration)
            
        itr_train_op, itr_loss = self.sess.run([train_op, loss], feed_dict=feed)
        
        return (itr_train_op, itr_loss)
    
    def evaluate(self, X, y, batch_size=None):
        loss = self.model.loss
        accuracy = self.model.accuracy
        TP = self.model.TP
        TN = self.model.TN
        FP = self.model.FP
        FN = self.model.FN
        '''
        The x-axis of the plots below are always in units of epochs, 
        which measure how many times every example has been seen during training in expectation 
        (e.g. one epoch means that every example has been seen once). 
        It is preferable to track epochs rather than iterations since the number of iterations
        depends on the arbitrary setting of batch size.
        '''
        if batch_size:
            N = X.shape[0] 
            total_loss = 0
            total_acc = 0
            total_TP = 0
            total_TN = 0
            total_FP = 0
            total_FN = 0
            for i in range(0, N, batch_size):
                # Siple Sequencial Batch
                X_batch = X[i:i + batch_size]
                y_batch = y[i:i + batch_size]
                feed = {
                    self.model.X: X_batch,
                    self.model.y: y_batch,
                    self.model.mode: False
                }
                step_loss, step_acc, step_tp, step_tn, step_fp, step_fn = \
                    self.sess.run([loss, accuracy, TP, TN, FP, FN], feed_dict=feed)
                
                total_loss += step_loss * X_batch.shape[0]
                total_acc += step_acc * X_batch.shape[0]
                total_TP += step_tp
                total_TN += step_tn
                total_FP += step_fp
                total_FN += step_fn
                
                # self.writer.add_summary(summary, i) # Batch 
            total_loss /= N
            total_acc /= N
            return (total_loss, total_acc, total_TP, total_TN, total_FP, total_FN)
            
        else:
            merged = self.model.merged
            feed = {
                self.model.X: X,
                self.model.y: y,
                self.model.mode: False
            }
            
            loss = self.model.loss            
            accuracy = self.model.accuracy
            TP = self.model.TP
            TN = self.model.TN
            FP = self.model.FP
            FN = self.model.FN
            epoch_summary, total_loss, total_acc, total_TP, total_TN, total_FP, total_FN = \
                self.sess.run([merged, loss, accuracy, TP, TN, FP, FN], feed_dict=feed)
            self.valid_writer.add_summary(epoch_summary, self.epoch)
            
            return (total_loss, total_acc, total_TP, total_TN, total_FP, total_FN)

# create model and solver
---
- useadamOptimizer ..

```Model('dnnAd', input_dim, output_dim, hidden_dims=[1024, 1024, 1024, 1024], use_batchnorm=True, lr=0.1)```

- use momentum

```Model('dnnMo', input_dim, output_dim, hidden_dims=[1024, 1024, 1024, 1024], use_batchnorm=True, lr=0.1, momentum=0.5)```

In [49]:
### Here Should be the Main Function 

# Epoch Setting 
epoch_n = 50

# Clean graph and Creat Session 
tf.reset_default_graph()
sess = tf.InteractiveSession()

log_dir = "./file_writer_dnn/"
if tf.gfile.Exists(log_dir):
    tf.gfile.DeleteRecursively(log_dir)
tf.gfile.MakeDirs(log_dir)


# Create Graph 
bn = Model('dnnMo', input_dim, output_dim, hidden_dims=[1024, 1024, 1024, 1024], \
           use_batchnorm=True, lr=0.1, momentum=0.5)


# Create Train Module 
bn_solver = Solver(sess, bn, log_dir)

# Save for record
train_losses = []
train_accs = []
train_ROC = []
valid_losses = []
valid_accs = []
valid_ROC = []

# Train
`tensorboard --logdir="./file_writer_dnn/"`

In [51]:
init = tf.global_variables_initializer()
sess.run(init)

for epoch in range(epoch_n):
    # Train
    for inds in minibatches[:-1]:
        X_batch = X_train[inds, :]
        y_batch = y_train[inds]
        _, bn_loss = bn_solver.train(X_batch, y_batch)
    
    bn_solver.epoch = epoch
    bn_solver.save(epoch)
    
    # Train Loss
    b_loss, b_acc, b_tp, b_tn, b_fp, b_fn = bn_solver.evaluate(X_train, y_train, batch_size)
    train_losses.append(b_loss)
    train_accs.append(b_acc)
    train_ROC.append([b_tp, b_tn, b_fp, b_fn])
    print('[Epoch %d-TRAIN] Batchnorm Loss(Acc): %.5f(%.2f)' % (epoch, b_loss, b_acc))
    print('Batchnorm Loss(Acc): %.5f(%.2f), TP: %d, TN: %d, FP, %d, FN: %d' % (b_loss, b_acc,b_tp, b_tn, b_fp, b_fn))

    # Evaluate 
    b_loss, b_acc, b_tp, b_tn, b_fp, b_fn = bn_solver.evaluate(X_valid, y_valid)
    valid_losses.append(b_loss)
    valid_accs.append(b_acc)
    valid_ROC.append([b_tp, b_tn, b_fp, b_fn])
    print('[Epoch %d-VALID] Batchnorm Loss(Acc): %.5f(%.2f)' % (epoch, b_loss, b_acc))
    print('Batchnorm Loss(Acc): %.5f(%.2f), TP: %d, TN: %d, FP, %d, FN: %d' % (b_loss, b_acc,b_tp, b_tn, b_fp, b_fn))
    
    print()

[Epoch 0-TRAIN] Batchnorm Loss(Acc): 0.01906(0.99)
Batchnorm Loss(Acc): 0.01906(0.99), TP: 132407, TN: 213296, FP, 413, FN: 2560
[Epoch 0-VALID] Batchnorm Loss(Acc): 0.02019(0.99)
Batchnorm Loss(Acc): 0.02019(0.99), TP: 7017, TN: 11158, FP, 25, FN: 152

[Epoch 1-TRAIN] Batchnorm Loss(Acc): 0.01516(0.99)
Batchnorm Loss(Acc): 0.01516(0.99), TP: 133088, TN: 213384, FP, 325, FN: 1879
[Epoch 1-VALID] Batchnorm Loss(Acc): 0.01818(0.99)
Batchnorm Loss(Acc): 0.01818(0.99), TP: 7059, TN: 11160, FP, 23, FN: 110

[Epoch 2-TRAIN] Batchnorm Loss(Acc): 0.01815(0.99)
Batchnorm Loss(Acc): 0.01815(0.99), TP: 132307, TN: 213423, FP, 286, FN: 2660
[Epoch 2-VALID] Batchnorm Loss(Acc): 0.02204(0.99)
Batchnorm Loss(Acc): 0.02204(0.99), TP: 7011, TN: 11164, FP, 19, FN: 158

[Epoch 3-TRAIN] Batchnorm Loss(Acc): 0.01197(1.00)
Batchnorm Loss(Acc): 0.01197(1.00), TP: 134360, TN: 213148, FP, 561, FN: 607
[Epoch 3-VALID] Batchnorm Loss(Acc): 0.01754(0.99)
Batchnorm Loss(Acc): 0.01754(0.99), TP: 7120, TN: 11138, FP

# Create Embedding on tensorboard

In [52]:
# Embedding ~
bn_solver.compute_embedding(X_valid, y_valid, "validation")

# Testing 

In [53]:
# load test
test_malicious = np.genfromtxt("./JSF20170324/test/malicious.vlog", delimiter=',')
test_normal    = np.genfromtxt("./JSF20170324/test/normal.vlog"   , delimiter=',')

X_test = np.concatenate((test_malicious, test_normal))
y_test = np.array([1]*len(test_malicious) + [0]*len(test_normal))
del(test_malicious)
del(test_normal)
y_test = np.reshape(y_test, [len(y_test), 1])
X_test = np.delete(X_test, np.s_[:1], axis=1)
X_test = np.log(X_test+1)
print(X_test.shape)
print(y_test.shape)

(206762, 772)
(206762, 1)


In [54]:
b_loss, b_acc, b_tp, b_tn, b_fp, b_fn = bn_solver.evaluate(X_test, y_test, batch_size)

In [56]:
print(b_loss, b_acc, b_tp, b_tn, b_fp, b_fn)
print("Test Result : ")
print("- accuract        :%f" % b_acc)
print("- True positive   :%d" % b_tp)
print("- True negative   :%d" % b_tn)
print("- False positive  :%d" % b_fp)
print("- False negative  :%d" % b_fn)
print("- Precision       :%f" % (b_tp/(b_tp+b_fp)))
print("- Recall          :%f" % (b_tp/(b_tp+b_fn)))

0.144941735078 0.963595872329 86896.0 112339.0 827.0 6700.0
Test Result : 
- accuract        :0.963596
- True positive   :86896
- True negative   :112339
- False positive  :827
- False negative  :6700
- Precision       :0.990573
- Recall          :0.928416


In [None]:
# accrossing to above, maybe out model not generalize enought