### End-to-end training of both networks

In [1]:
from keras.layers import Input, Dense, Activation, Lambda, Dropout, Concatenate, Reshape
from keras.models import Model, Sequential
from keras.callbacks import TensorBoard, Callback
from keras import optimizers
from keras import backend as K

import tensorflow as tf

from keras.regularizers import l2


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# --- Hyperparameters ---------------------------------------------------------

# Index that selects one (epochs, batch size) preset from the lists below.
experiment_id = 0

# Learning rate handed to the optimizer.
learning_rate = 1e-4

# Presets: position i of each list belongs to experiment i.
epoch_list = [200, 20000, 10000, 4000]
batch_size_list = [1024, 1024, 128, 1024]

# Loss weights: lambda_1 scales the sample-energy term and lambda_2 the
# covariance-diagonal penalty in totalLoss.
lambda_1, lambda_2 = 0.1, 0.005

# --- Settings derived from the selected experiment ---------------------------
epochs, batch_size = epoch_list[experiment_id], batch_size_list[experiment_id]

### Define compression network

In [3]:
# Compression network (autoencoder): 120 -> 60 -> 30 -> 10 -> 1 -> 10 -> 30 -> 60 -> 120.
# `batch_shape` pins the batch dimension to `batch_size`.
input_data = Input(batch_shape=(batch_size, 120), name='input_placeholder')


def _regularized_dense(units, activation, name=None):
    """Build a Dense layer with an L2 penalty of 1e-5 on both kernel and bias."""
    return Dense(units,
                 activation=activation,
                 kernel_regularizer=l2(0.00001),
                 bias_regularizer=l2(0.00001),
                 name=name)


# Encoder: three tanh layers of decreasing width.
encoded = input_data
for width in (60, 30, 10):
    encoded = _regularized_dense(width, 'tanh')(encoded)

# One-dimensional linear bottleneck.
layer_lowdim = _regularized_dense(1, 'linear', name='lowdim')(encoded)

# Decoder: mirrors the encoder and ends in a linear 120-wide reconstruction.
decoded = layer_lowdim
for width in (10, 30, 60):
    decoded = _regularized_dense(width, 'tanh')(decoded)
decoded = _regularized_dense(120, 'linear', name='reconstructed')(decoded)



- Define similarity metrics

In [4]:
def cos_sim(a_b):
    '''
    Row-wise cosine similarity between two equally shaped 2-D tensors.

    a_b: pair (a, b)
        a: batch x 120
        b: batch x 120

    output: batch x 1
    '''
    a, b = a_b

    # keepdims=True yields the (batch, 1) result directly, so the function no
    # longer needs the global `batch_size` for a reshape.
    norm_a = K.sqrt(K.sum(a ** 2, axis=-1, keepdims=True))
    norm_b = K.sqrt(K.sum(b ** 2, axis=-1, keepdims=True))

    # Floor the denominator so an all-zero row gives 0 instead of 0/0 = NaN.
    denom = K.maximum(norm_a * norm_b, K.epsilon())

    return K.sum(a * b, axis=-1, keepdims=True) / denom

def relative_euc_dist(a_b):
    '''
    Row-wise relative Euclidean distance ||a - b|| / ||a||.

    a_b: pair (a, b)
        a: batch x 120
        b: batch x 120

    output: batch x 1
    '''
    a, b = a_b

    # keepdims=True yields the (batch, 1) result directly, so the function no
    # longer needs the global `batch_size` for a reshape.
    norm_diff = K.sqrt(K.sum((a - b) ** 2, axis=-1, keepdims=True))
    norm_a = K.sqrt(K.sum(a ** 2, axis=-1, keepdims=True))

    # Floor the denominator so an all-zero row of `a` cannot divide by zero.
    return norm_diff / K.maximum(norm_a, K.epsilon())

In [5]:
# Cosine similarity between every input row and its reconstruction.
layer_cossim = Lambda(cos_sim, name='cos_sim')([input_data, decoded])

In [6]:
# Relative Euclidean distance between every input row and its reconstruction.
layer_relativeEuc = Lambda(relative_euc_dist, name='relative_euc_dist')([input_data, decoded])

- Obtain "z"

In [7]:
def funct_concat(tensors):
    """Join the given tensors along their last axis."""
    return K.concatenate(tensors, axis=-1)


# z = [low-dimensional code, cosine similarity, relative Euclidean distance]
z_parts = [layer_lowdim, layer_cossim, layer_relativeEuc]
layer_concat = Lambda(funct_concat, name="z")(z_parts)

### Define estimation network

In [8]:
# The estimation network is attached directly to the "z" layer (no separate
# Input), so compression and estimation networks form one graph.
input_est = layer_concat

# Hidden layer: 10 tanh units, L2 (1e-5) on kernel and bias.
est_hidden = Dense(
    10,
    activation='tanh',
    kernel_regularizer=l2(0.00001),
    bias_regularizer=l2(0.00001),
)(input_est)
est_layer = Dropout(0.5)(est_hidden)

# Output layer "gamma": softmax over 4 units.
est_output = Dense(
    4,
    activation='softmax',
    kernel_regularizer=l2(0.00001),
    bias_regularizer=l2(0.00001),
    name='gamma',
)(est_layer)


### Build full network

In [9]:
# End-to-end model: 120-wide input -> "gamma" softmax output.
# Keras 2 spells the keyword `inputs`; the legacy `input=` form is only accepted
# through a backward-compatibility path that emits a warning.
full_network = Model(inputs=input_data, outputs=est_output)

  """Entry point for launching an IPython kernel.


In [10]:
# Print the layer-by-layer overview (output shapes, parameter counts, connections).
full_network.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_placeholder (InputLayer)  (1024, 120)          0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (1024, 60)           7260        input_placeholder[0][0]          
__________________________________________________________________________________________________
dense_2 (Dense)                 (1024, 30)           1830        dense_1[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (1024, 10)           310         dense_2[0][0]                    
__________________________________________________________________________________________________
lowdim (De

### Load already saved dataset

In [11]:
import numpy as np

In [12]:
# Read the stored train/test arrays from kdd99_train_test.npz and merge them
# into one matrix; the split is redone further below.
with np.load('kdd99_train_test.npz') as data:
    x_train, x_test = data['x_train'], data['x_test']
    x_all = np.concatenate((x_train, x_test), axis=0)

In [13]:
# Sanity check: (rows, columns) of the merged dataset.
x_all.shape

(494021, 120)

- First, get the split

In [14]:
from sklearn.model_selection import train_test_split

# Re-split the merged data 50/50; the fixed random_state makes the split repeatable.
x_train, x_test = train_test_split(x_all, test_size=0.5, random_state=42)

In [15]:
# Size of the training half right after the split.
x_train.shape

(247010, 120)

In [16]:
# Size of the test half right after the split.
x_test.shape

(247011, 120)

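- Then, split x_train by the label in its last column. Note the naming: in this dataset the rows labelled "normal" are the minority, so they are treated as the anomalies, and the remaining (majority) rows are the ones kept for training.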

In [17]:
# The last column is the "normal" indicator.
label_col = x_train[:, -1]

x_train_normal = x_train[label_col == 1]    # rows flagged as "normal"
x_train_anomaly = x_train[label_col == 0]   # all remaining rows

In [18]:
# Number of training rows whose label column equals 1.
x_train_normal.shape

(48839, 120)

In [19]:
# Rows whose label column equals 0 are the majority in this dataset; the rows
# labelled "normal" are the ones treated as anomalies.
x_train_anomaly.shape

(198171, 120)

In [20]:
# Train only on the majority (label 0) rows.
# NOTE(review): this rebinds `x_train`, so re-running the label-split cell after
# this one would operate on the already filtered array.
x_train = x_train_anomaly

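- x_train now contains only the majority-class rows, which play the role of "normal" data for training. x_test is left unfiltered and still contains both classes.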

In [21]:
# Training set size after keeping only the label-0 rows.
x_train.shape

(198171, 120)

In [22]:
# The test set is unchanged by the filtering above.
x_test.shape

(247011, 120)

- Assign arbitrary output

In [23]:
# Placeholder targets: one row [1, 0, 0, 0] per training sample. totalLoss
# never reads its target argument, so the values are arbitrary.
y_train = np.tile(np.array([1., 0., 0., 0.]), (len(x_train), 1))
y_train.shape

(198171, 4)

### GMM parameters

- k: number of clusters (4 for here)
- N: batch size
- d: Dimension of latent vector z (3 here)

- gamma ($\gamma$): membership predictions (softmax output of the estimation net) [$N \times k$]
- phi ($\phi$): mixture weights, one per Gaussian component [$k$]
- mu ($\mu$): Gaussian means [$k \times d$]
- sigma ($\Sigma$): Gaussian covariances [$k \times d \times d$]

In [24]:
k = 4                                   # number of mixture components
N = batch_size                          # samples per batch
d = int(K.int_shape(layer_concat)[1])   # width of the latent vector z

- GMM parameters init

In [25]:
def _frozen_variable(name, shape, initializer):
    """Create a float32 TF variable that is excluded from training (trainable=False)."""
    return tf.get_variable(name,
                           shape=shape,
                           dtype=tf.float32,
                           initializer=initializer,
                           trainable=False)


# Mixture weights, shape (k,), start at zero.
phi = _frozen_variable("phi", (k,), tf.zeros_initializer())

# Component means, shape (k, d), start at zero.
mu = _frozen_variable("mu", (k, d), tf.zeros_initializer())

# Component covariances, shape (k, d, d), start as identity matrices.
sigma_init = np.tile(np.eye(d, dtype=np.float32), (k, 1, 1))
sigma = _frozen_variable("sigma", (k, d, d), tf.constant_initializer(sigma_init))

In [26]:
# Reuse the TensorFlow session that the Keras backend is using.
tf_sess = K.get_session()

In [27]:
# Run the initializer of each GMM variable (the cell displays one None per op).
tf_sess.run([gmm_var.initializer for gmm_var in (phi, mu, sigma)])

[None, None, None]

- Check for uninit vars

In [28]:
# List the names of variables that are still uninitialized.
print(tf_sess.run(tf.report_uninitialized_variables()))

[]


In [29]:
def printGmmParams():
    """Print the current values of the GMM variables phi, mu and sigma."""
    print("#### GMM params ####")
    for header, variable in (("phi:\n", phi), ("mu:\n", mu), ("sigma:\n", sigma)):
        print(header, K.eval(variable), "\n")



In [30]:
# Show the GMM parameters right after initialization.
printGmmParams()

#### GMM params ####
phi:
 [0. 0. 0. 0.] 

mu:
 [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]] 

sigma:
 [[[1. 0. 0.]
  [0. 1. 0.]
  [0. 0. 1.]]

 [[1. 0. 0.]
  [0. 1. 0.]
  [0. 0. 1.]]

 [[1. 0. 0.]
  [0. 1. 0.]
  [0. 0. 1.]]

 [[1. 0. 0.]
  [0. 1. 0.]
  [0. 0. 1.]]] 



### Total loss calculation

In [31]:
# Verbosity switches; not referenced by any other cell shown here.
isVerbose_en = True
isVerbose_loss = True

- Draw one batch for the first run of the totalLoss function

In [32]:
# Pick `batch_size` distinct row indices from the training set.
choices = np.random.choice(len(x_train), size=batch_size, replace=False)

batch_x_train = x_train[choices]
batch_y_train = y_train[choices]

# (inputs, targets) pair; not referenced by any later cell shown here.
batch_train = (batch_x_train, batch_y_train)

- Define energy function

In [33]:
def computeEnergy(z_i):
    """
    Compute the sample energy E(z_i) used in the loss function:

        E(z) = -log( sum_c phi_c * exp(-0.5 (z - mu_c)^T Sigma_c^-1 (z - mu_c))
                                   / sqrt(det(2 * pi * Sigma_c)) + 1e-6 )

    z_i: one latent vector (length d)
    Reads the module-level GMM variables phi, mu, sigma and the component count k.

    returns: scalar tensor
    """
    inside_sum = tf.zeros(())
    for cluster in range(k):
        diff = tf.reshape(z_i - mu[cluster], (1, -1))   ### (1,d)
        diff_t = tf.reshape(diff, (-1, 1))              ### (d,1)

        # NOTE(review): the inverse and determinant are undefined for a singular
        # covariance; consider adding a small diagonal term if NaNs show up.
        sigma_inv = tf.linalg.inv(sigma[cluster])       ### (d,d)

        exp_term = tf.exp(-0.5 * tf.matmul(diff, tf.matmul(sigma_inv, diff_t)))    ### (1,1)

        denom = tf.sqrt(tf.linalg.det(2 * np.pi * sigma[cluster]))

        # Accumulate over all components; a plain assignment here would keep
        # only the contribution of the last cluster.
        inside_sum += phi[cluster] * (exp_term / denom)   ### (1,1)

    ### flatten inside_sum and return the negative log of it
    inside_sum = tf.reshape(inside_sum, ())
    sample_energy = -tf.log(inside_sum + 1e-6, name="sample_energy")

    return sample_energy

- Define total loss

In [34]:
def totalLoss(yTrue, yPred):
    """
    Total loss = reconstruction error
               + lambda_1 * mean sample energy
               + lambda_2 * sum of inverse covariance diagonals.

    The loss is built from the graph tensors input_data, decoded, layer_concat (z)
    and est_output (gamma); both arguments are ignored and exist only because
    Keras calls a loss as loss(y_true, y_pred).

    As a side effect the GMM variables phi, mu and sigma are overwritten with
    the statistics of the current batch before the sample energy is evaluated.

    returns: scalar tensor
    """
    ### autoencoder loss
    recon_error = tf.reduce_sum((input_data - decoded) ** 2, axis=1)    ### (N,)
    autoenc_loss = tf.reduce_mean(recon_error, axis=0)      #### mean over all N in batch

    ### obtain z and gamma for current batch
    z = layer_concat        ### (N, d)
    gamma = est_output      ### (N, k)

    ### per-component membership mass, shared by the mu and sigma updates
    gamma_sums = tf.reduce_sum(gamma, axis=0)    ### (k,)

    ########### gmm update #################
    ###### phi #####
    new_phi = tf.reduce_sum(gamma / batch_size, axis=0)    ### (k,)
    update_phi = tf.assign(phi, new_phi, name="update_phi")

    ######## mu ######
    new_mu = tf.matmul(gamma, z, transpose_a=True) / tf.expand_dims(gamma_sums, axis=-1)   ### (k, d)
    update_mu = tf.assign(mu, new_mu, name="update_mu")

    ####### sigma #####
    ### expand z (N, 1, d, 1)
    z_exp_dims = tf.expand_dims(tf.expand_dims(z, 1), -1)

    ### expand the freshly computed means (1, k, d, 1).
    ### Using the tensor `new_mu` instead of reading the variable `mu` centres the
    ### covariance on this batch's means and does not depend on whether
    ### update_mu has already run.
    mu_exp_dims = tf.expand_dims(tf.expand_dims(new_mu, 0), -1)

    ### diff (N, k, d, 1)
    diff = z_exp_dims - mu_exp_dims

    ### outer products of diffs (N, k, d, d)
    outer = tf.matmul(diff, diff, transpose_b=True)

    ### expand gamma (N, k, 1, 1)
    gamma_exp_dims = tf.expand_dims(tf.expand_dims(gamma, axis=-1), axis=-1)

    ### upper part: membership-weighted sum over the batch (k, d, d)
    nom = tf.reduce_sum(gamma_exp_dims * outer, axis=0)

    ### lower part (k, 1, 1)
    gammasum_exp_dims = tf.expand_dims(tf.expand_dims(gamma_sums, axis=-1), axis=-1)

    new_sigma = nom / gammasum_exp_dims    ### (k, d, d)
    update_sigma = tf.assign(sigma, new_sigma, name="update_sigma")

    ### dependency control: computeEnergy reads the phi/mu/sigma variables, so
    ### the assignments must be scheduled before it
    with tf.control_dependencies([update_phi, update_mu, update_sigma]):
        ### sample energy
        sample_en_batch = tf.map_fn(lambda z_i: computeEnergy(z_i), z)
        sample_en = tf.reduce_mean(sample_en_batch, axis=0)
        sample_en *= lambda_1

    ### penalty on small covariance diagonals, taken from the freshly computed
    ### tensor: reading the non-trainable variable `sigma` here would yield a
    ### term with no gradient, i.e. a penalty that never influences training
    p = tf.reduce_sum(1 / tf.matrix_diag_part(new_sigma))
    p *= lambda_2

    ### total loss
    total_loss = autoenc_loss + sample_en + p

    return total_loss

In [35]:
### keras

# Adam with both gradient-norm clipping (clipnorm) and element-wise clipping (clipvalue).
# Use the documented class name `Adam`; the lowercase `adam` alias is not part
# of the documented API and is absent from newer Keras releases.
adam = optimizers.Adam(lr=learning_rate, clipnorm=1., clipvalue=0.5)

full_network.compile(optimizer=adam, loss=totalLoss)

- Batch generator for training

In [36]:
### TODO: optimization??

def batchGenerator():
    '''
    Endless generator of random training batches.

    yields: (inputs, targets) with `batch_size` rows each; rows are distinct
            within a batch but may reappear in later batches
    '''
    while True:
        picked = np.random.choice(len(x_train), size=batch_size, replace=False)
        yield (x_train[picked], y_train[picked])

- Training 

In [37]:
# Number of random batches that together draw about len(x_train) samples.
steps_per_epoch = int(np.ceil(len(x_train) / batch_size))

# NOTE(review): this overrides the `epochs` value chosen in the hyperparameter
# cell; the same variable is later used in the saved file names.
epochs = 1

history = full_network.fit_generator(
    batchGenerator(),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    verbose=1,
)

Epoch 1/1


In [38]:
# Show the GMM parameters after training; they now hold batch statistics.
printGmmParams()

#### GMM params ####
phi:
 [0.2925424  0.3135683  0.11949927 0.27439004] 

mu:
 [[1.9726148  0.837193   0.42293307]
 [1.9396791  0.8274934  0.43452567]
 [1.86749    0.8068024  0.4579662 ]
 [1.9171326  0.82000035 0.44324836]] 

sigma:
 [[[ 0.61275196  0.18185823 -0.21331146]
  [ 0.18185823  0.05443549 -0.06400897]
  [-0.21331146 -0.06400897  0.07555761]]

 [[ 0.64550495  0.1913132  -0.22438732]
  [ 0.1913132   0.0572012  -0.06724969]
  [-0.22438732 -0.06724969  0.07940205]]

 [[ 0.7159855   0.21178122 -0.2478873 ]
  [ 0.21178122  0.06308342 -0.07396793]
  [-0.2478873  -0.07396793  0.08702039]]

 [[ 0.66284496  0.197244   -0.23129334]
  [ 0.197244    0.05939746 -0.06978946]
  [-0.23129334 -0.06978946  0.08235287]]] 



- Model save

In [39]:
# Save the weights only; the file name embeds the current `epochs` value.
full_network.save_weights("modelsave_weights-epochs{}.h5".format(epochs), overwrite=True)

In [40]:
# Save the whole model to a single HDF5 file.
# NOTE(review): the model uses Lambda layers and the custom loss totalLoss, so
# reloading presumably needs matching `custom_objects` — verify before relying on it.
full_network.save("modelsave-epochs{}.h5".format(epochs), overwrite=True)