In [73]:
import math

import numpy as np
import keras
import sys, time
from keras.callbacks import CSVLogger
import matplotlib.pyplot as plt
from tensorflow import set_random_seed
set_random_seed(1234)
np.random.seed(1234)

def get_mnist_data(num_samples=1000):
            
    # load data
    (X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()
    n_inputs, height, max_length = X_train.shape
    encoder_input_data = np.zeros((n_inputs, max_length + 1, height), dtype="float32")
    decoder_input_data = np.zeros((n_inputs, max_length + 1, height), dtype="float32")
    encoder_input_data[:,:max_length,:] = np.swapaxes(X_train, 1, 2).copy()/255
    decoder_input_data[:,1:,:] = np.swapaxes(X_train, 1, 2).copy()/255

    return max_length + 1, height, encoder_input_data[:num_samples,:,:], decoder_input_data[:num_samples,:,:]


In [82]:
fit_with(100,100)

(1000, 29, 28) Creating model...
Training model...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[0.4196508240699768, 0.39225774812698366, 0.3743474795818329, 0.3631816101074219]


0.1375723991394043

In [90]:
def fit_with(intermediate_dim, latent_dim, prints=False):
    num_samples = 500
    timesteps_max, enc_tokens, x, x_decoder = get_mnist_data(num_samples=num_samples)

    print(x.shape, "Creating model...")

    input_dim = x.shape[-1]
    timesteps = x.shape[-2]
    batch_size = 1
    #latent_dim = 191
    #intermediate_dim = 353
    latent_dim = math.ceil(latent_dim)
    intermediate_dim = math.ceil(intermediate_dim)
    epochs = 1

    vae, enc, gen, stepper = create_lstm_vae(input_dim,
                                             batch_size=batch_size,
                                             intermediate_dim=intermediate_dim,
                                             latent_dim=latent_dim)
    print("Training model...")

    hist = vae.fit([x, x_decoder], x, epochs=epochs, verbose=1)
    
    
    if not prints:
        return -hist.history['loss'][-1]
    else:
        
        def decode(s, start_char = "\t"):
            return decode_sequence(s, gen, stepper, input_dim, timesteps_max)
    
        for _ in range(5):

            id_from = np.random.randint(0, x.shape[0] - 1)
            id_to = np.random.randint(0, x.shape[0] - 1)

            m_from, std_from = enc.predict([[x[id_from]]])
            m_to, std_to = enc.predict([[x[id_to]]])

            seq_from = np.random.normal(size=(latent_dim,))
            seq_from = m_from #+ std_from * seq_from

            seq_to = np.random.normal(size=(latent_dim,))
            seq_to = m_to #+ std_to * seq_to



            print("== from \t ==")
            plt.imshow(x[id_from].T, cmap='Greys',  interpolation='nearest')
            plt.grid(False)
            plt.show()

            for v in np.linspace(0, 1, 7):
                print("%.2f\t" % (1 - v))
                plt.imshow(decode(v * seq_to + (1 - v) * seq_from).T, cmap='Greys',  interpolation='nearest')
                plt.grid(False)
                plt.show()
                
from functools import partial

verbose = 1
fit_with_partial = partial(fit_with)

In [89]:
from bayes_opt import BayesianOptimization

# Bounded region of parameter space
pbounds = {'intermediate_dim':(100,5000), 'latent_dim':(100,5000)}

optimizer = BayesianOptimization(
    f=fit_with_partial,
    pbounds=pbounds,
    verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=1,
)

optimizer.maximize(init_points=50, n_iter=50,)


for i, res in enumerate(optimizer.res):
    print("Iteration {}: \n\t{}".format(i, res))

print(optimizer.max)

|   iter    |  target   | interm... | latent... |
-------------------------------------------------
(500, 29, 28) Creating model...
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
| [0m 1       [0m | [0m-0.3699  [0m | [0m 1.242e+0[0m | [0m 1.636e+0[0m |
(500, 29, 28) Creating model...
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
| [0m 2       [0m | [0m-0.3707  [0m | [0m 700.1   [0m | [0m 1.093e+0[0m |
(500, 29, 28) Creating model...
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
| [0m 3       [0m | [0m-0.3701  [0m | [0m 890.8   [0m | [0m 820.0   [0m |
(500, 29, 28) Creating model...
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
| [0m 4       [0m | [0m-0.3718  [0m | [0m 942.1   [0m | [0m 1.149e+0[0m |
(500, 29, 28) Creating model...
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
| [0m 5       [0m | [0m-0.3743  [0m | [0m 1.216e+0[0m | [0m 1.4e+03 [0m |
(500, 29, 28) Creating model...
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
| [0m 6  

Epoch 2/3
Epoch 3/3
| [0m 17      [0m | [0m-0.3751  [0m | [0m 706.7   [0m | [0m 703.0   [0m |
(500, 29, 28) Creating model...
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
| [0m 18      [0m | [0m-0.3733  [0m | [0m 1.997e+0[0m | [0m 1.994e+0[0m |
(500, 29, 28) Creating model...
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
| [0m 19      [0m | [0m-0.3726  [0m | [0m 1.533e+0[0m | [0m 2e+03   [0m |
(500, 29, 28) Creating model...
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
| [0m 20      [0m | [0m-0.3711  [0m | [0m 2e+03   [0m | [0m 1.999e+0[0m |
Iteration 0: 
	{'target': -0.3698778395652771, 'params': {'intermediate_dim': 1242.1286061133462, 'latent_dim': 1636.4218414748057}}
Iteration 1: 
	{'target': -0.3707469313144684, 'params': {'intermediate_dim': 700.1486872625484, 'latent_dim': 1093.0323444213918}}
Iteration 2: 
	{'target': -0.37009035181999206, 'params': {'intermediate_dim': 890.7826580622469, 'latent_dim': 820.0401731994372}}
Iteration 3: 
	

In [67]:
# coding: utf-8

from keras import backend as K
from keras import objectives
from keras.layers import Input, LSTM
from keras.layers.core import Dense, Lambda
from keras.layers.wrappers import TimeDistributed
from keras.models import Model
from keras.utils.generic_utils import get_custom_objects

def create_lstm_vae(input_dim,
                    batch_size,  # we need it for sampling
                    intermediate_dim,
                    latent_dim):
    """
    Creates an LSTM Variational Autoencoder (VAE).

    # Arguments
        input_dim: int.
        batch_size: int.
        intermediate_dim: int, output shape of LSTM.
        latent_dim: int, latent z-layer shape.
        epsilon_std: float, z-layer sigma.


    # References
        - [Building Autoencoders in Keras](https://blog.keras.io/building-autoencoders-in-keras.html)
        - [Generating sentences from a continuous space](https://arxiv.org/abs/1511.06349)
    """
    x = Input(shape=(None, input_dim,))

    # LSTM encoding
    h = LSTM(units=intermediate_dim)(x)

    # VAE Z layer
    z_mean = Dense(units=latent_dim)(h)
    z_log_sigma = Dense(units=latent_dim)(h)

    def sampling(args):
        z_mean, z_log_sigma = args
        epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0., stddev=1.0)
        return z_mean + z_log_sigma * epsilon

    # note that "output_shape" isn't necessary with the TensorFlow backend
    # so you could write `Lambda(sampling)([z_mean, z_log_sigma])`
    z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_sigma])

    z_reweighting = Dense(units=intermediate_dim, activation="linear")
    z_reweighted = z_reweighting(z)

    # "next-word" data for prediction
    decoder_words_input = Input(shape=(None, input_dim,))

    # decoded LSTM layer
    decoder_h = LSTM(intermediate_dim, return_sequences=True, return_state=True)

    # todo: not sure if this initialization is correct
    h_decoded, _, _ = decoder_h(decoder_words_input, initial_state=[z_reweighted, z_reweighted])
    decoder_dense = TimeDistributed(Dense(input_dim, activation="softmax"))
    decoded_onehot = decoder_dense(h_decoded)

    # end-to-end autoencoder
    vae = Model([x, decoder_words_input], decoded_onehot)

    # encoder, from inputs to latent space
    encoder = Model(x, [z_mean, z_log_sigma])

    # generator, from latent space to reconstructed inputs -- for inference's first step
    decoder_state_input = Input(shape=(latent_dim,))
    _z_rewighted = z_reweighting(decoder_state_input)
    _h_decoded, _decoded_h, _decoded_c = decoder_h(decoder_words_input, initial_state=[_z_rewighted, _z_rewighted])
    _decoded_onehot = decoder_dense(_h_decoded)
    generator = Model([decoder_words_input, decoder_state_input], [_decoded_onehot, _decoded_h, _decoded_c])

    # RNN for inference
    input_h = Input(shape=(intermediate_dim,))
    input_c = Input(shape=(intermediate_dim,))
    __h_decoded, __decoded_h, __decoded_c = decoder_h(decoder_words_input, initial_state=[input_h, input_c])
    __decoded_onehot = decoder_dense(__h_decoded)
    stepper = Model([decoder_words_input, input_h, input_c], [__decoded_onehot, __decoded_h, __decoded_c])

    def vae_loss(x, x_decoded_onehot):
        xent_loss = objectives.categorical_crossentropy(x, x_decoded_onehot)
        kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma))
        loss = xent_loss + kl_loss
        return loss
    
    def xent_loss(x, x_decoded_onehot):
        xent_loss = objectives.categorical_crossentropy(x, x_decoded_onehot)
        return xent_loss
    

    def kl_loss(x, x_decoded_onehot):
        kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma))
        return kl_loss
    
    def bc_loss(x, x_decoded_onehot):
        kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma))
        bc_loss = objectives.binary_crossentropy(x, x_decoded_onehot)
        return bc_loss+kl_loss
    
    get_custom_objects().update({"bc_loss": bc_loss, 'xent_loss': xent_loss, 'kl_loss':kl_loss})

    vae.compile(optimizer="adam", loss=bc_loss, metrics = [bc_loss, kl_loss])
    #vae.summary()

    return vae, encoder, generator, stepper



In [33]:
import numpy as np



def decode_sequence(states_value, decoder_adapter_model, rnn_decoder_model, num_decoder_tokens, max_seq_length):
    """
    Decoding adapted from this example:
    https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
    :param states_value:
    :param decoder_adapter_model: reads text representation, makes the first prediction, yields states after the first RNN's step
    :param rnn_decoder_model: reads previous states and makes one RNN step
    :param num_decoder_tokens:
    :param token2id: dict mapping words to ids
    :param id2token: dict mapping ids to words
    :param max_seq_length: the maximum length of the sequence
    :return:
    """

    # generate empty target sequence of length 1
    target_seq = np.zeros((1, 1, num_decoder_tokens))

    # sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1)
    stop_condition = False

    decoded_sentence = np.zeros((max_seq_length, num_decoder_tokens))

    first_time = True
    h, c = None, None
        
    t = 0
    while not stop_condition:

        if first_time:
            # feeding in states sampled with the mean and std provided by encoder
            # and getting current LSTM states to feed in to the decoder at the next step
            output_tokens, h, c = decoder_adapter_model.predict([target_seq, states_value])
            first_time = False
        else:
            # reading output token
            output_tokens, h, c = rnn_decoder_model.predict([target_seq, h, c])

        # sample a token
       
        decoded_sentence[t,:] = output_tokens.copy()

        # exit condition: either hit max length
        # or find stop character.
        if t >= max_seq_length - 1:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = output_tokens
        t += 1

    return decoded_sentence