# convolutional variational autoencoder
- generate a bunch of new emojis
- interpolate between emojis

In [3]:
import keras.backend as K
from keras.layers import Layer as KerasLayer
from keras.losses import mean_squared_error
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils.vis_utils import model_to_dot

In [None]:
class VariationalLayer(KerasLayer):
    '''A custom "variational" Keras layer that completes the
    variational autoencoder.

    The layer projects its input onto a mean vector and a log-variance
    vector, draws a sample from the Gaussian they describe, and returns
    that sample. As a side effect of `call`, the KL-divergence term is
    stored on `self.kl_loss` so that `loss` can add it to the
    reconstruction error.
    '''

    def __init__(self, output_dim: int, epsilon_std=1., **kwargs):
        '''Stores the layer hyper-parameters.

        Args:
            output_dim : The desired number of latent dimensions in our
                embedding space.
            epsilon_std : The standard deviation of the random normal
                noise used when sampling the latent vector.
            **kwargs : Standard Keras layer arguments (e.g. `name`),
                forwarded unchanged to the base `Layer`.
        '''
        # Forwarding **kwargs lets the layer be named and lets the default
        # `from_config` re-create it from `get_config` output.
        super().__init__(**kwargs)
        self.output_dim = output_dim
        self.epsilon_std = epsilon_std

    def build(self, input_shape):
        '''Creates one dense projection (weights + bias) for the mean and
        one for the log-variance.

        `input_shape[1]` is used as the number of input features, so the
        input is expected to be 2-D.
        '''
        self.z_mean_weights = self.add_weight(
            shape=(input_shape[1], self.output_dim),
            initializer='glorot_normal',
            trainable=True
        )
        self.z_mean_bias = self.add_weight(
            shape=(self.output_dim,),
            initializer='zero',
            trainable=True,
        )
        self.z_log_var_weights = self.add_weight(
            shape=(input_shape[1], self.output_dim),
            initializer='glorot_normal',
            trainable=True
        )
        self.z_log_var_bias = self.add_weight(
            shape=(self.output_dim,),
            initializer='zero',
            trainable=True
        )
        super().build(input_shape)

    def call(self, x):
        '''Samples a latent vector for `x` and records the KL term.

        Returns:
            `z_mean + exp(z_log_var / 2) * epsilon`, where `epsilon` is
            random normal noise with standard deviation `epsilon_std`.
        '''
        z_mean = K.dot(x, self.z_mean_weights) + self.z_mean_bias
        z_log_var = K.dot(x, self.z_log_var_weights) + self.z_log_var_bias
        epsilon = K.random_normal(
            shape=K.shape(z_log_var),
            mean=0.,
            stddev=self.epsilon_std
        )

        # KL term: -0.5 * sum(1 + log_var - mean^2 - exp(log_var)), summed
        # over the latent axis. Stored on the instance for use by `loss`.
        kl_loss_numerator = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        self.kl_loss = -0.5 * K.sum(kl_loss_numerator, axis=-1)
        # exp(log_var / 2) is the standard deviation.
        return z_mean + K.exp(z_log_var / 2) * epsilon

    def loss(self, x, x_decoded):
        '''Returns the mean squared error between `x` and `x_decoded` plus
        the KL term.

        `self.kl_loss` only exists after `call` has run, so the layer must
        already be part of a built model when this is evaluated.
        '''
        return mean_squared_error(x, x_decoded) + self.kl_loss

    def compute_output_shape(self, input_shape):
        '''Returns `(batch, output_dim)` for the given input shape.'''
        return (input_shape[0], self.output_dim)

    def get_config(self):
        '''Returns the constructor arguments so the layer can be
        serialized and re-created.
        '''
        config = super().get_config()
        config.update({
            'output_dim': self.output_dim,
            'epsilon_std': self.epsilon_std,
        })
        return config

In [None]:
class VariationalAutoEncoderEmbeddingModel(BaseEmbeddingModel):
    '''Encoder/decoder model whose encoder ends in a `VariationalLayer`.

    The encoder maps the `origin` and `origin_geo` inputs to a sampled
    latent vector; the decoder maps that vector to the `dest` (softmax)
    and `dest_geo` (linear) outputs.
    '''

    def __init__(self, embedding_size: int, dense_layer_size: int, λ: float,
                 n_airports: int = None):
        '''Initializes the model parameters.

        Args:
            embedding_size : The desired number of latent dimensions in our
                embedding space.
            dense_layer_size : The number of units in each hidden dense
                layer of the encoder and decoder.
            λ : The regularization strength to apply to the model's
                dense layers.
            n_airports : The width of the `origin` input and of the `dest`
                softmax output. When omitted, the notebook-level
                `n_unique_airports` global is used, as before.
        '''
        self.embedding_size = embedding_size
        self.dense_layer_size = dense_layer_size
        self.λ = λ
        # The original read the `n_unique_airports` global unconditionally;
        # the explicit argument removes that hidden dependency while the
        # fallback keeps existing three-argument calls working.
        self.n_unique_airports = (
            n_unique_airports if n_airports is None else n_airports
        )
        self.variational_layer = VariationalLayer(embedding_size)
        self.model = self._build_model()

    def _build_model(self):
        '''Builds the encoder, the decoder, and returns them chained into
        one end-to-end Keras `Model`.
        '''

        def hidden_layer():
            # Every hidden dense layer shares the same size, activation and
            # L2 penalty; a fresh layer and regularizer are made per call.
            return Dense(
                self.dense_layer_size,
                activation='tanh',
                kernel_regularizer=l2(self.λ),
            )

        # encoder
        origin = Input(shape=(self.n_unique_airports,), name='origin')
        origin_geo = Input(shape=(2,), name='origin_geo')
        encoded = concatenate([origin, origin_geo])
        encoded = hidden_layer()(encoded)
        encoded = BatchNormalization()(encoded)
        variational_output = self.variational_layer(encoded)

        encoder = Model([origin, origin_geo], variational_output, name='encoder')

        # decoder
        latent_vars = Input(shape=(self.embedding_size,))
        decoded = hidden_layer()(latent_vars)
        decoded = hidden_layer()(decoded)
        decoded = BatchNormalization()(decoded)
        dest = Dense(
            self.n_unique_airports,
            activation='softmax',
            name='dest',
            kernel_regularizer=l2(self.λ),
        )(decoded)
        # Note: unlike the other dense layers, this head has no L2 penalty.
        dest_geo = Dense(2, activation='linear', name='dest_geo')(decoded)

        decoder = Model(latent_vars, [dest, dest_geo], name='decoder')

        # end-to-end
        encoder_decoder = Model(
            [origin, origin_geo],
            decoder(encoder([origin, origin_geo])),
        )
        return encoder_decoder

In [None]:
# Build the VAE with a one-dimensional latent space.
vae_model = VariationalAutoEncoderEmbeddingModel(
    embedding_size=1,
    dense_layer_size=20,
    λ=.003,
)

# Two losses, one per decoder output in order: the variational layer's loss
# (mean squared error + KL term) and mean squared logarithmic error,
# weighted 1.0 and 0.2 respectively.
vae_model.compile(
    optimizer=Adam(lr=LEARNING_RATE),
    loss=[
        vae_model.variational_layer.loss,
        'mean_squared_logarithmic_error',
    ],
    loss_weights=[1., .2],
)

# Render the model graph as an inline SVG (kept last so the cell shows it).
SVG(model_to_dot(vae_model.model).create(prog='dot', format='svg'))

# question-answer models for emoji responses
- when predicting, you can predict onto the generated emojis as well!
- add some dropout
- vanishing/exploding gradients
- spaCy for word2vec embeddings?

## bi-directional lstms
- "Bidirectional Long Short-Term Memory (biLSTM): Single direction LSTMs suffer a weakness
of not utilizing the contextual information from the future tokens. Bidirectional LSTM utilizes both
the previous and future context by processing the sequence on two directions, and generate two
independent sequences of LSTM output vectors. One processes the input sequence in the forward
direction, while the other processes the input in the reverse direction. The output at each time step
is the concatenation of the two output vectors from both directions, i.e. $h_t = \overrightarrow{h_t} \,\Vert\, \overleftarrow{h_t}$." (https://arxiv.org/pdf/1511.04108.pdf)

# references
- http://ben.bolte.cc/blog/2016/language.html
- https://arxiv.org/pdf/1508.01585v2.pdf
- https://arxiv.org/pdf/1511.04108.pdf
- https://explosion.ai/blog/deep-learning-formula-nlp