# DaGMM with VAE, kdd99cup

In [1]:
from keras.layers import Input, Dense, Activation, Lambda, Dropout, Concatenate, Reshape
from keras.models import Model, Sequential
from keras.callbacks import TensorBoard
from keras import optimizers
from keras import backend as K

from keras.regularizers import l2


import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

np.random.seed(0)
tf.set_random_seed(0)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


- ### Timestamp for experiments

In [2]:
from datetime import datetime
timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M")
print(timestamp, "")

2018-07-28_15:52 


In [3]:
### load dataset

class KDD99Dataset:
    def __init__(self, filename_train, filename_test):
        with np.load(filename_train) as data: 
            self.x_train = data["x_train"]
            self.y_train = data["y_train"]
            
        with np.load(filename_test) as data: 
            self.x_test = data["x_test"]
            #self.y_test = data["y_test"]
            
            
    def train_num_examples(self):
        return len(self.x_train)

    def train_next_batch(self,batch_size):
        choices = np.random.choice(len(self.x_train), size=batch_size, replace=False)

        batch_x_train = self.x_train[choices]
        batch_y_train = self.y_train[choices]

        batch_train = (batch_x_train, batch_y_train)

        return batch_train
    
    def test_next_batch(self,batch_size):
        choices = np.random.choice(len(self.x_test), size=batch_size, replace=False)

        batch_x_test = self.x_test[choices]
        #batch_y_train = y_train[choices]

        batch_train = (batch_x_test, None) #batch_y_train)

        return batch_train


In [4]:
kdd99 = KDD99Dataset("../datasets/kddcup/kdd99_train-randomState_None.npz", 
                     "../datasets/kddcup/kdd99_test-randomState_None.npz")

n_samples = kdd99.train_num_examples()

In [5]:
def xavier_init(fan_in, fan_out, constant=1): 
    """ Xavier initialization of network weights"""
    # https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
    low = -constant*np.sqrt(6.0/(fan_in + fan_out)) 
    high = constant*np.sqrt(6.0/(fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out), 
                             minval=low, maxval=high, 
                             dtype=tf.float32)

Based on this, we define now a class "VariationalAutoencoder" with a [sklearn](http://scikit-learn.org)-like interface that can be trained incrementally with mini-batches using partial_fit. The trained model can be used to reconstruct unseen input, to generate new samples, and to map inputs to the latent space.

In [6]:
class VariationalAutoencoder(object):
    """ Variation Autoencoder (VAE) with an sklearn-like interface implemented using TensorFlow.
    
    This implementation uses probabilistic encoders and decoders using Gaussian 
    distributions and  realized by multi-layer perceptrons. The VAE can be learned
    end-to-end.
    
    See "Auto-Encoding Variational Bayes" by Kingma and Welling for more details.
    """
    def __init__(self, network_architecture, 
                 transfer_fct=tf.nn.softplus, 
                 learning_rate=0.0001, 
                 batch_size=1024,
                 gmm_mixtures=4,  ###default for the kdd99, from paper
                 lambda_1 = 0.1,
                 lambda_2 = 0.005,
                ):
        
        
        self.network_architecture = network_architecture
        self.transfer_fct = transfer_fct
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.gmm_mixtures = gmm_mixtures
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        
        
        # tf Graph input
        self.x = tf.placeholder(tf.float32, [None, network_architecture["n_input"]])
        
        # Create autoencoder network
        self._create_network()
        
        ### init gmm parameters
        self.phi, self.mu, self.sigma = self._init_gmm()
        
        # Define loss function based variational upper-bound and 
        # corresponding optimizer
        ###self._create_loss_optimizer()
        self._total_loss()  ### new one
        
        # Initializing the tensor flow variables
        init = tf.global_variables_initializer()

        # Launch the session
        self.sess = tf.InteractiveSession()
        self.sess.run(init)
    
    def _create_network(self):
        # Initialize autoencode network weights and biases
        network_weights = self._initialize_weights(**self.network_architecture)

        # Use recognition network to determine mean and 
        # (log) variance of Gaussian distribution in latent
        # space
        self.z_mean, self.z_log_sigma_sq = \
            self._recognition_network(network_weights["weights_recog"], 
                                      network_weights["biases_recog"])

        # Draw one sample z from Gaussian distribution
        n_z = self.network_architecture["n_z"]
        eps = tf.random_normal((self.batch_size, n_z), 0, 1, 
                               dtype=tf.float32)
        # z = mu + sigma*epsilon
        self.z = tf.add(self.z_mean, 
                        tf.multiply(tf.sqrt(tf.exp(self.z_log_sigma_sq)), eps))

        # Use generator to determine mean of
        # Bernoulli distribution of reconstructed input
        self.x_reconstr_mean = \
            self._generator_network(network_weights["weights_gener"],
                                    network_weights["biases_gener"])
        
        ## cosine similarity
        self.cossim = self._layer_cossim(self.x, self.x_reconstr_mean)
        
        ## relative euc similarity
        self.relative_euc = self._layer_relative_eucdist(self.x, self.x_reconstr_mean)
        
        ## concat layers (z in original paper)
        self.concat = self._layer_concat(self.z, self.cossim, self.relative_euc)
        
        
        ##### estimation net
        input_est = self.concat 
        
        est_layer = Dense(10, activation='tanh', kernel_regularizer=l2(0.00001), bias_regularizer=l2(0.00001))(input_est)   ####(input_est)
        est_layer = Dropout(0.5)(est_layer)
        self.est_output = Dense(self.gmm_mixtures, activation='softmax', kernel_regularizer=l2(0.00001), bias_regularizer=l2(0.00001), name='gamma')(est_layer)

        
    def _layer_cossim(self, input_layer, output_layer):
        '''
        a: batch x 120
        b: batch x 120 

        output: batch x 1
        '''
        a, b = input_layer, output_layer  #a_b
    
        norm_a = K.sqrt(K.sum(a ** 2, axis=-1))
        norm_b = K.sqrt(K.sum(b ** 2, axis=-1))
    
        out = K.sum(a * b, axis=-1) / (norm_a * norm_b)
        out = K.reshape(out, [self.batch_size, 1])
        
        return out
        
        
    def _layer_relative_eucdist(self, input_layer, output_layer):
        '''
        a: batch x 120
        b: batch x 120 

        output: batch x 1
        '''
        a, b = input_layer, output_layer  #a_b

        norm_diff = K.sqrt(K.sum((a - b)**2, axis=-1))
        norm_a = K.sqrt(K.sum(a ** 2, axis=-1))

        out = norm_diff / norm_a
        out = K.reshape(out, [self.batch_size, 1])

        return out
        
        
    def _layer_concat(self, lowdim, cossim, relative_euc):
        return K.concatenate([lowdim, cossim, relative_euc])
    
    
    def _init_gmm(self):
        k = self.gmm_mixtures
        self.d = d = int(self.concat.get_shape()[1])
        
        phi = tf.get_variable("phi",
                      shape=(k),
                      dtype=tf.float32,
                      initializer=tf.zeros_initializer(),
                      trainable=False)

        mu = tf.get_variable("mu",
                              shape=(k,d),
                              dtype=tf.float32,
                              initializer=tf.zeros_initializer(), 
                              trainable=False)

        sigma_init = np.repeat([np.eye(d, dtype=np.float32)], k, axis=0)

        sigma = tf.get_variable("sigma",
                              shape=(k,d,d),
                              dtype=tf.float32,
                              initializer=tf.constant_initializer(sigma_init),
                              trainable=False)
        
        return (phi, mu, sigma)
        
    
    def _compute_energy(self, z_i):
        """
        compute E(z_i) in loss function
        """

        #inside_sum = 0
        inside_sum = tf.zeros(()) 
        for cluster in range(self.gmm_mixtures):
            diff = tf.reshape(z_i - self.mu[cluster], (1,-1))   ### (1,3)
            diff_t = tf.reshape(diff, (-1,1)) #diff.reshape(-1,1)   ### (3,1)

            sigma_inv = tf.linalg.inv(self.sigma[cluster]) ### (3,3)

            exp_term = tf.exp(-0.5 * tf.matmul(diff, tf.matmul(sigma_inv, diff_t)))    ### (1,1)

            denom = tf.sqrt(tf.linalg.det(2 * np.pi * self.sigma[cluster]))

            inside_sum += self.phi[cluster] * (exp_term / denom)   ### (1,1)


        inside_sum = tf.reshape(inside_sum, ())
        sample_energy = -tf.log(inside_sum + 1e-6, name="sample_energy")

        ### flatten inside_sum and return log of it
        return sample_energy
    
    
    def save_weights(self, filename = "./" + timestamp + "-tfsave"):  
        ### save model weights
        saver = tf.train.Saver()
        saver.save(self.sess, filename)
        print("saved to:",filename)
        
        
        
    def load_weights(self, filename = "./" + timestamp + "-tfsave.meta"):
        ### load weights for test notebook
        saver = tf.train.import_meta_graph(filename)
        saver.restore(self.sess, tf.train.latest_checkpoint("./"))
    
            
    def _initialize_weights(self, n_hidden_recog_1, n_hidden_recog_2, n_hidden_recog_3, n_hidden_recog_4,
                            n_hidden_gener_1,  n_hidden_gener_2, n_hidden_gener_3, n_hidden_gener_4,
                            n_input, n_z):
        all_weights = dict()
        all_weights['weights_recog'] = {
            'h1': tf.Variable(xavier_init(n_input, n_hidden_recog_1)),
            'h2': tf.Variable(xavier_init(n_hidden_recog_1, n_hidden_recog_2)),
            'h3': tf.Variable(xavier_init(n_hidden_recog_2, n_hidden_recog_3)),
            'h4': tf.Variable(xavier_init(n_hidden_recog_3, n_hidden_recog_4)),
            'out_mean': tf.Variable(xavier_init(n_hidden_recog_4, n_z)),
            'out_log_sigma': tf.Variable(xavier_init(n_hidden_recog_4, n_z))}
        all_weights['biases_recog'] = {
            'b1': tf.Variable(tf.zeros([n_hidden_recog_1], dtype=tf.float32)),
            'b2': tf.Variable(tf.zeros([n_hidden_recog_2], dtype=tf.float32)),
            'b3': tf.Variable(tf.zeros([n_hidden_recog_3], dtype=tf.float32)),
            'b4': tf.Variable(tf.zeros([n_hidden_recog_4], dtype=tf.float32)),
            'out_mean': tf.Variable(tf.zeros([n_z], dtype=tf.float32)),
            'out_log_sigma': tf.Variable(tf.zeros([n_z], dtype=tf.float32))}
        all_weights['weights_gener'] = {
            'h1': tf.Variable(xavier_init(n_z, n_hidden_gener_1)),
            'h2': tf.Variable(xavier_init(n_hidden_gener_1, n_hidden_gener_2)),
            'h3': tf.Variable(xavier_init(n_hidden_gener_2, n_hidden_gener_3)),
            'h4': tf.Variable(xavier_init(n_hidden_gener_3, n_hidden_gener_4)),
            'out_mean': tf.Variable(xavier_init(n_hidden_gener_4, n_input)),
            'out_log_sigma': tf.Variable(xavier_init(n_hidden_gener_4, n_input))}
        all_weights['biases_gener'] = {
            'b1': tf.Variable(tf.zeros([n_hidden_gener_1], dtype=tf.float32)),
            'b2': tf.Variable(tf.zeros([n_hidden_gener_2], dtype=tf.float32)),
            'b3': tf.Variable(tf.zeros([n_hidden_gener_3], dtype=tf.float32)),
            'b4': tf.Variable(tf.zeros([n_hidden_gener_4], dtype=tf.float32)),
            'out_mean': tf.Variable(tf.zeros([n_input], dtype=tf.float32)),
            'out_log_sigma': tf.Variable(tf.zeros([n_input], dtype=tf.float32))}
        return all_weights
            
    def _recognition_network(self, weights, biases):
        # Generate probabilistic encoder (recognition network), which
        # maps inputs onto a normal distribution in latent space.
        # The transformation is parametrized and can be learned.
        
        layer_1 = self.transfer_fct(tf.add(tf.matmul(self.x, weights['h1']), 
                                           biases['b1'])) 
        batch_normed = tf.keras.layers.BatchNormalization()(layer_1)
        layer_2 = self.transfer_fct(tf.add(tf.matmul(batch_normed, weights['h2']), 
                                           biases['b2'])) 
        batch_normed = tf.keras.layers.BatchNormalization()(layer_2)
        layer_3 = self.transfer_fct(tf.add(tf.matmul(batch_normed, weights['h3']), 
                                           biases['b3'])) 
        batch_normed = tf.keras.layers.BatchNormalization()(layer_3)
        layer_4 = self.transfer_fct(tf.add(tf.matmul(batch_normed, weights['h4']), 
                                           biases['b4'])) 
        batch_normed = tf.keras.layers.BatchNormalization()(layer_4)
        z_mean = tf.add(tf.matmul(batch_normed, weights['out_mean']),
                        biases['out_mean'])
        z_log_sigma_sq = \
            tf.add(tf.matmul(batch_normed, weights['out_log_sigma']), 
                   biases['out_log_sigma'])
        return (z_mean, z_log_sigma_sq)

    def _generator_network(self, weights, biases):
        # Generate probabilistic decoder (decoder network), which
        # maps points in latent space onto a Bernoulli distribution in data space.
        # The transformation is parametrized and can be learned.
        layer_1 = self.transfer_fct(tf.add(tf.matmul(self.z, weights['h1']), 
                                           biases['b1'])) 
        batch_normed = tf.keras.layers.BatchNormalization()(layer_1)
        layer_2 = self.transfer_fct(tf.add(tf.matmul(batch_normed, weights['h2']), 
                                           biases['b2'])) 
        batch_normed = tf.keras.layers.BatchNormalization()(layer_2)
        layer_3 = self.transfer_fct(tf.add(tf.matmul(batch_normed, weights['h3']), 
                                           biases['b3'])) 
        batch_normed = tf.keras.layers.BatchNormalization()(layer_3)
        layer_4 = self.transfer_fct(tf.add(tf.matmul(batch_normed, weights['h4']), 
                                           biases['b4'])) 
        batch_normed = tf.keras.layers.BatchNormalization()(layer_4)
        x_reconstr_mean = \
            tf.nn.sigmoid(tf.add(tf.matmul(batch_normed, weights['out_mean']), 
                                 biases['out_mean']))
        return x_reconstr_mean

        
    def _update_gmm(self):
        gamma = self.est_output ### output of estimation net (batch size x 4)
        z = self.concat ### concat output (batch size x 3)
        
        
        ### get parameters
        self.updates_gmm = []
        phi = self.phi
        mu = self.mu
        sigma = self.sigma
        k = self.gmm_mixtures
        d = self.d
        N = self.batch_size

        ################### phi #################
        update_phi = tf.assign(phi, 
                               tf.reduce_sum(gamma/self.batch_size, axis=0), 
                               name="update_phi")
        self.updates_gmm.append(update_phi)
        #######################################

        ################## mu ################
        for cluster in range(k):
            ### get the corresponding column of predictions
            gamma_cluster = tf.reshape(gamma[:,cluster], (-1,1))    ### (N x 1)

            ### duplicate column d times
            gamma_cluster_tile = tf.tile(gamma_cluster, (1,d))   ### (N x d)

            ### sum over all batch and divide
            matmul = tf.matmul(z, gamma_cluster_tile, transpose_a=True)   ### (dxd)

            result = tf.reduce_sum(matmul, axis=0, name="mu_matmul_red") / tf.reduce_sum(gamma[:,cluster], axis = 0, name="mu_gamma_red")

            update_mu = tf.assign(mu[cluster], 
                                  result,
                                  name="update_mu"+str(cluster))
            self.updates_gmm.append(update_mu)
        #########################################

        ################ sigma ###############
        for cluster in range(k):
            ### expand gamma for each sample
            gamma_cluster = tf.reshape(gamma[:,cluster], (-1,1))    ### (N x 1)
            gamma_cluster_expand = tf.expand_dims(gamma_cluster, 1) ### (N x 1 x 1)


            #### calculating diff
            ## expand mu and z
            ######## TODO: race condition??? (get mu after its updated value (auto??))
            with tf.control_dependencies(self.updates_gmm):
                mu_cluster = tf.reshape(mu[cluster], (1,-1))   ### (1 x d)

            mu_expand = tf.expand_dims(mu_cluster, 1)      ### (1 x 1 x 3)
            mu_expand_tile = tf.tile(mu_expand, tf.stack([N, 1, 1]))   ### (N x 1 x d)

            z_expand = tf.expand_dims(z, 1)    ### (N x 1 x d)

            diff = z_expand - mu_expand_tile   ### (N x 1 x d)

            ### matmul in the upper part
            matmul = tf.matmul(diff, diff, transpose_a=True)  ### (N x d x d)


            ### nominator
            nom = gamma_cluster_expand * matmul   ### (N x d x d)
            nom_reduced = tf.reduce_sum(nom, axis=0) ### (d x d)


            ### denominator
            denom = tf.reduce_sum(gamma_cluster)  ### single value, zero dim

            update_sigma = tf.assign(sigma[cluster], 
                                     nom_reduced/denom, 
                                     name ="update_sigma"+str(cluster))
            self.updates_gmm.append(update_sigma)
            
            
             
    def _create_loss_optimizer(self):
        # The loss is composed of two terms:
        # 1.) The reconstruction loss (the negative log probability
        #     of the input under the reconstructed Bernoulli distribution 
        #     induced by the decoder in the data space).
        #     This can be interpreted as the number of "nats" required
        #     for reconstructing the input when the activation in latent
        #     is given.
        # Adding 1e-10 to avoid evaluation of log(0.0)
        reconstr_loss = \
            -tf.reduce_sum(self.x * tf.log(1e-6 + self.x_reconstr_mean)
                           + (1-self.x) * tf.log(1e-6 + 1 - self.x_reconstr_mean),
                           1)
        # 2.) The latent loss, which is defined as the Kullback Leibler divergence 
        ##    between the distribution in latent space induced by the encoder on 
        #     the data and some prior. This acts as a kind of regularizer.
        #     This can be interpreted as the number of "nats" required
        #     for transmitting the the latent space distribution given
        #     the prior.
        latent_loss = -0.5 * tf.reduce_sum(1 + self.z_log_sigma_sq 
                                           - tf.square(self.z_mean) 
                                           - tf.exp(self.z_log_sigma_sq), 1)
        self.cost = tf.reduce_mean(reconstr_loss + latent_loss)   # average over batch
        # Use ADAM optimizer
        self.optimizer = \
            tf.train.AdagradOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
               
        
    def _total_loss(self):
        ### reconstr loss + latent loss + energy + reg term for sigma
        
        ### reconstr loss
        reconstr_loss = \
            -tf.reduce_sum(self.x * tf.log(1e-6 + self.x_reconstr_mean)
                           + (1-self.x) * tf.log(1e-6 + 1 - self.x_reconstr_mean),
                           1)

        ### latent loss
        latent_loss = -0.5 * tf.reduce_sum(1 + self.z_log_sigma_sq 
                                           - tf.square(self.z_mean) 
                                           - tf.exp(self.z_log_sigma_sq), 1)
        
        ### update gmm params for energy
        self._update_gmm()
        
        
        ### energy
        ### dependency control
        z = self.concat
        
        with tf.control_dependencies(self.updates_gmm):  
            ### sample energy   
            sample_en_batch = tf.map_fn(lambda z_i: self._compute_energy(z_i), z)
            sample_en = tf.reduce_mean(sample_en_batch, axis=0)
            sample_en *= self.lambda_1

            ### diagonal regularization
            p = tf.reduce_sum(1 / tf.matrix_diag_part(self.sigma))
            p *= self.lambda_2
            
        ### vae loss instead of ae loss    
        vae_loss = tf.reduce_mean(reconstr_loss + latent_loss)
        
        ### total loss
        self.cost = vae_loss + sample_en + p
        
         # Use ADAM optimizer
        self.optimizer = \
            tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
    
    def saveGmmParams(self, filename = timestamp + "-gmmParams.npz"):
        phi_ = K.eval(self.phi)
        mu_ = K.eval(self.mu)
        sigma_ = K.eval(self.sigma)
        np.savez_compressed(filename, phi=phi_, mu=mu_, sigma=sigma_)
        print("saved to:", filename)
        
    def printGmmParams(self):
        print("#### GMM params ####")
        print("phi:\n",K.eval(self.phi),"\n")
        print("mu:\n",K.eval(self.mu),"\n")
        print("sigma:\n", K.eval(self.sigma),"\n")
        

    def partial_fit(self, X):
        """Train model based on mini-batch of input data.
        
        Return cost of mini-batch.
        """
        opt, cost = self.sess.run((self.optimizer, self.cost), 
                                  feed_dict={self.x: X})
        return cost
    
    def transform(self, X):
        """Transform data by mapping it into the latent space."""
        # Note: This maps to mean of distribution, we could alternatively
        # sample from Gaussian distribution
        mu = self.sess.run(self.z_mean, feed_dict={self.x: X})
        sigma = self.sess.run(self.z_log_sigma_sq, feed_dict={self.x: X}) ###!!! (doğru mu/gerekli mi??)
        return mu, sigma
    
    def generate(self, z_mu=None):
        """ Generate data by sampling from latent space.
        
        If z_mu is not None, data for this point in latent space is
        generated. Otherwise, z_mu is drawn from prior in latent 
        space.        
        """
        if z_mu is None:
            z_mu = np.random.normal(size=self.network_architecture["n_z"])
        # Note: This maps to mean of distribution, we could alternatively
        # sample from Gaussian distribution
        return self.sess.run(self.x_reconstr_mean, 
                             feed_dict={self.z: z_mu})
    
    def reconstruct(self, X):
        """ Use VAE to reconstruct given data. """
        return self.sess.run(self.x_reconstr_mean, 
                             feed_dict={self.x: X})

In general, implementing a VAE in tensorflow is relatively straightforward (in particular since we don not need to code the gradient computation). A bit confusing is potentially that all the logic happens at initialization of the class (where the graph is generated), while the actual sklearn interface methods are very simple one-liners.

We can now define a simple fuction which trains the VAE using mini-batches:

In [None]:
import time

def train(network_architecture, 
          learning_rate=0.0001,
          batch_size=1024, 
          training_epochs=70, 
          display_step=1):
    
    vae = VariationalAutoencoder(network_architecture, 
                                 learning_rate=learning_rate, 
                                 batch_size=batch_size)
    history_loss = dict()
    

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(n_samples / batch_size)
        # Loop over all batches
        
        loss_batch = []
        
        start = time.time()
        for i in range(total_batch):
            batch_xs, _ = kdd99.train_next_batch(batch_size) #mnist.train.next_batch(batch_size)

            # Fit training using batch data
            cost = vae.partial_fit(batch_xs)
            
            loss_batch.append(cost)
            
            print(" batch:",i,"cost:",cost)
            # Compute average loss
            avg_cost += cost / n_samples * batch_size

        end = time.time()
        
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("--- end of epoch:", '%04d' % (epoch+1), 
                  "avg cost=", "{:.9f}".format(avg_cost),
                  "time=", "{:.3f} seconds".format(end-start), )
            
        ### append loss to history
        history_loss[str(epoch)] = loss_batch
        history_loss[str(epoch)+"-avg"] = avg_cost
            
    return vae, history_loss

### Train network

In [None]:

network_architecture = \
    dict(n_hidden_recog_1=60, # 1st layer encoder neurons
         n_hidden_recog_2=40, # 2nd layer encoder neurons
         n_hidden_recog_3=20, # 2nd layer encoder neurons
         n_hidden_recog_4=10, # 2nd layer encoder neurons
         n_hidden_gener_1=10, # 1st layer decoder neurons
         n_hidden_gener_2=20, # 2nd layer decoder neurons
         n_hidden_gener_3=40, # 1st layer decoder neurons
         n_hidden_gener_4=60, # 2nd layer decoder neurons
         n_input=120,  # kdd99 data input dimension
         n_z=5,
        )  # dimensionality of latent space

vae, history_loss = train(network_architecture, 
            training_epochs=60, 
            display_step=1, 
            batch_size=1024)

 batch: 0 cost: 90.45799
 batch: 1 cost: 90.012085
 batch: 2 cost: 89.63262
 batch: 3 cost: 89.30731
 batch: 4 cost: 88.82792
 batch: 5 cost: 88.530685
 batch: 6 cost: 88.06156
 batch: 7 cost: 87.684555
 batch: 8 cost: 87.3233
 batch: 9 cost: 86.86218
 batch: 10 cost: 86.402275
 batch: 11 cost: 86.145096
 batch: 12 cost: 85.795525
 batch: 13 cost: 85.37705
 batch: 14 cost: 85.02667
 batch: 15 cost: 84.537735
 batch: 16 cost: 84.32357
 batch: 17 cost: 83.7925
 batch: 18 cost: 83.40101
 batch: 19 cost: 83.300255
 batch: 20 cost: 82.77223
 batch: 21 cost: 82.53487
 batch: 22 cost: 82.29291
 batch: 23 cost: 81.769554
 batch: 24 cost: 81.46723
 batch: 25 cost: 81.16209
 batch: 26 cost: 80.8815
 batch: 27 cost: 80.473785
 batch: 28 cost: 80.079155
 batch: 29 cost: 79.88667
 batch: 30 cost: 79.44522
 batch: 31 cost: 79.2078
 batch: 32 cost: 78.709274
 batch: 33 cost: 78.49084
 batch: 34 cost: 78.2037
 batch: 35 cost: 78.03698
 batch: 36 cost: 77.64468
 batch: 37 cost: 77.228676
 batch: 38 cos

 batch: 111 cost: 21.385723
 batch: 112 cost: 21.093655
 batch: 113 cost: 20.422256
 batch: 114 cost: 20.926023
 batch: 115 cost: 20.885874
 batch: 116 cost: 20.530327
 batch: 117 cost: 20.181269
 batch: 118 cost: 20.62169
 batch: 119 cost: 19.910236
 batch: 120 cost: 20.152828
 batch: 121 cost: 20.390484
 batch: 122 cost: 20.444794
 batch: 123 cost: 19.330084
 batch: 124 cost: 19.915937
 batch: 125 cost: 19.914839
 batch: 126 cost: 20.095402
 batch: 127 cost: 19.19684
 batch: 128 cost: 19.21028
 batch: 129 cost: 19.697714
 batch: 130 cost: 19.743357
 batch: 131 cost: 18.793356
 batch: 132 cost: 19.303099
 batch: 133 cost: 19.280884
 batch: 134 cost: 19.204954
 batch: 135 cost: 19.226318
 batch: 136 cost: 18.727146
 batch: 137 cost: 18.890072
 batch: 138 cost: 18.800713
 batch: 139 cost: 18.718729
 batch: 140 cost: 18.802853
 batch: 141 cost: 18.722818
 batch: 142 cost: 18.595583
 batch: 143 cost: 18.49512
 batch: 144 cost: 18.105967
 batch: 145 cost: 18.514032
 batch: 146 cost: 18.349

 batch: 19 cost: 12.218445
 batch: 20 cost: 11.840199
 batch: 21 cost: 12.490992
 batch: 22 cost: 12.502432
 batch: 23 cost: 12.42822
 batch: 24 cost: 12.230071
 batch: 25 cost: 12.766385
 batch: 26 cost: 11.95599
 batch: 27 cost: 12.441864
 batch: 28 cost: 12.924513
 batch: 29 cost: 12.474638
 batch: 30 cost: 12.19101
 batch: 31 cost: 12.318943
 batch: 32 cost: 12.25634
 batch: 33 cost: 12.47279
 batch: 34 cost: 12.066614
 batch: 35 cost: 12.326057
 batch: 36 cost: 12.718317
 batch: 37 cost: 12.854481
 batch: 38 cost: 12.399229
 batch: 39 cost: 12.712592
 batch: 40 cost: 12.36049
 batch: 41 cost: 12.473807
 batch: 42 cost: 12.114481
 batch: 43 cost: 12.269175
 batch: 44 cost: 12.322814
 batch: 45 cost: 12.490612
 batch: 46 cost: 12.38457
 batch: 47 cost: 12.58627
 batch: 48 cost: 12.2483635
 batch: 49 cost: 12.598011
 batch: 50 cost: 12.040541
 batch: 51 cost: 12.2123375
 batch: 52 cost: 12.418917
 batch: 53 cost: 12.521861
 batch: 54 cost: 12.736834
 batch: 55 cost: 12.960331
 batch:

### Save network params

In [None]:
vae.save_weights()

In [None]:
vae.load_weights(filename = "./" + timestamp + "-tfsave.meta")

### Save loss history

In [None]:
filename = timestamp + "_history"
np.savez_compressed(filename, history_loss=history_loss)

### Save gmm params

In [None]:
vae.saveGmmParams()