In [62]:
from keras.layers import ELU, Bidirectional, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from keras.models import Model
from scipy import spatial
import tensorflow as tf
import tensorflow_addons as tfa
import pandas as pd
import numpy as np
import codecs
import csv
import os

### Directories and text loading
Initially we will set the main directories and some variables regarding the characteristics of our texts.
We set the maximum sequence length to 15, the maximun number of words in our vocabulary to 12000 and we will use 50-dimensional embeddings. Finally we load our texts from a csv. The text file is the train file of the Quora Kaggle challenge containing around 808000 sentences.

In [63]:
%%writefile get_data.sh
if [ ! -f quora.csv ]; then
  wget -O quora.csv https://www.dropbox.com/scl/fi/wxvgvw6y48whtuvcx1quq/questions.csv?rlkey=03yokqc36sht66me4jgzmbu12&dl=0
fi

if [ ! -f glove.6B.100d.txt ]; then
  wget -O glove.6B.100d.txt https://www.dropbox.com/s/dl1vswq2sz5f1ws/glove.6B.100d.txt?dl=0
fi

Overwriting get_data.sh


In [64]:
!bash get_data.sh

In [65]:
TRAIN_DATA_FILE = './quora.csv'
GLOVE_EMBEDDING = './glove.6B.100d.txt'
VALIDATION_SPLIT = 0.2
MAX_SEQUENCE_LENGTH = 15
MAX_NB_WORDS = 12000
EMBEDDING_DIM = 100

texts = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts.append(values[3])
        texts.append(values[4])
print('Found %s texts in train.csv' % len(texts))

Found 808702 texts in train.csv


In [66]:
texts[3]

'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?'

## Embedding

In [67]:
path_to_glove_file = "./glove.6B.100d.txt"
embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400001 word vectors.


### Text Preprocessing
To preprocess the text we will use the tokenizer and the text_to_sequences function from Keras


In [68]:
tokenizer = Tokenizer(MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
index2word = {v: k for k, v in word_index.items()}
print('Found %s unique tokens' % len(word_index))
sequences = tokenizer.texts_to_sequences(texts)
data_1 = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data_1.shape)
NB_WORDS = (min(tokenizer.num_words, len(word_index)) + 1 ) #+1 for zero padding


Found 95603 unique tokens
Shape of data tensor: (808702, 15)


In [69]:
import numpy
numpy.random.shuffle(data_1)
training, test = data_1[:int(len(data_1)*VALIDATION_SPLIT)], data_1[int(len(data_1)*VALIDATION_SPLIT):]

### Sentence generator
In order to reduce the memory requirements we will gradually read our sentences from the csv through Pandas as we feed them to the model

In [70]:
def sent_generator(chunksize):
    reader = pd.read_csv(TRAIN_DATA_FILE, chunksize=chunksize, iterator=True)
    for df in reader:
        val3 = df.iloc[:,3:4].values.tolist()
        val4 = df.iloc[:,4:5].values.tolist()
        flat3 = [item for sublist in val3 for item in sublist]
        flat4 = [str(item) for sublist in val4 for item in sublist]
        texts = []
        texts.extend(flat3[:])
        texts.extend(flat4[:])

        sequences = tokenizer.texts_to_sequences(texts)
        data_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
        yield (data_train, data_train)

In [71]:
next(sent_generator(50))

(array([[   0,    2,    3, ...,  383,    8,   35],
        [   0,    0,    0, ...,   10,    5, 4565],
        [   0,    4,   13, ...,  146,    6, 2773],
        ...,
        [   0,    0,    0, ...,   22,    1,  140],
        [   0,    0,    0, ...,   33, 6892,  730],
        [   0,    0,    0, ...,    7,   52,  283]], dtype=int32),
 array([[   0,    2,    3, ...,  383,    8,   35],
        [   0,    0,    0, ...,   10,    5, 4565],
        [   0,    4,   13, ...,  146,    6, 2773],
        ...,
        [   0,    0,    0, ...,   22,    1,  140],
        [   0,    0,    0, ...,   33, 6892,  730],
        [   0,    0,    0, ...,    7,   52,  283]], dtype=int32))

### Word embeddings
We will use pretrained Glove word embeddings as embeddings for our network. We create a matrix with one embedding for every word in our vocabulary and then we will pass this matrix as weights to the keras embedding layer of our model

In [72]:
glove_embedding_matrix = np.zeros((NB_WORDS, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < NB_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be the word embedding of 'unk'.
            glove_embedding_matrix[i] = embedding_vector
        else:
            glove_embedding_matrix[i] = embeddings_index.get('unk')
print('Null word embeddings: %d' % np.sum(np.sum(glove_embedding_matrix, axis=1) == 0))

Null word embeddings: 1


### VAE model
Our model is based on a seq2seq architecture with a bidirectional LSTM encoder and an LSTM decoder and SELU activations.
We feed the latent representation at every timestep as input to the decoder through "RepeatVector(max_len)".

We use the sum of the BCE loss on the final sentences generated + the KL loss from the Sampling layer.

Moreover, due to the pandas iterator that reads the csv both the train size and validation size must be divisible by the batch_size.

In [73]:
batch_size = 100
max_len = MAX_SEQUENCE_LENGTH
emb_dim = EMBEDDING_DIM
latent_dim = 32
intermediate_dim = 96
epsilon_std = 1.0
num_sampled=500

In [74]:
class Sampling(tf.keras.layers.Layer):
  def call(self, inputs):
    """Generates a random sample and combines with the encoder output

    Args:
      inputs -- output tensor from the encoder

    Returns:
      `inputs` tensors combined with a random sample
    """

    # implement
    pass

In [75]:
from keras.initializers import Constant

def encoder_layers(inputs, latent_dim):
  """Defines the encoder's layers.
  Args:
    inputs -- batch from the dataset
    latent_dim -- dimensionality of the latent space

  Returns:
    mu -- learned mean
    sigma -- learned standard deviation
    batch_2.shape -- shape of the features before flattening
  """

  x_embed = Embedding(NB_WORDS, emb_dim, embeddings_initializer=Constant(glove_embedding_matrix), input_length=max_len, trainable=False, name='embedding')(inputs)
  h = Bidirectional(LSTM(intermediate_dim, return_sequences=False, recurrent_dropout=0.2), merge_mode='concat', name='bidirectional_lstm_1')(x_embed)
  h = Dropout(0.2)(h)
  h = Dense(intermediate_dim, activation='relu')(h)
  h = Dropout(0.2)(h)
  mu = Dense(latent_dim, name='latent_mu')(h)
  sigma = Dense(latent_dim, name='latent_sigma')(h)
  return mu, sigma, intermediate_dim

In [76]:
def encoder_model(latent_dim, input_shape):
  """Defines the encoder model with the Sampling layer
  Args:
    latent_dim -- dimensionality of the latent space
    input_shape -- shape of the dataset batch

  Returns:
    model -- the encoder model
    conv_shape -- shape of the features before flattening
  """

  # declare the inputs tensor with the given shape
  inputs = None

  # get the output of the encoder_layers() function
  mu, sigma, shape = None

  # feed mu and sigma to the Sampling layer
  z = None

  # build the whole encoder model
  model = None

  return model, shape

In [77]:
def decoder_layers(inputs, shape):
  """Defines the decoder layers.
  Args:
    inputs -- output of the encoder
    shape -- shape of the features before flattening

  Returns:
    tensor containing the decoded output
  """

  x = RepeatVector(max_len)(inputs)
  x = LSTM(intermediate_dim, return_sequences=True, recurrent_dropout=0.2)(x)
  x = TimeDistributed(Dense(NB_WORDS, activation='softmax'))(x)
  return x

In [78]:
def decoder_model(latent_dim, shape):
  """Defines the decoder model.
  Args:
    latent_dim -- dimensionality of the latent space
    shape -- shape of the features before flattening

  Returns:
    model -- the decoder model
  """

  # set the inputs to the shape of the latent space
  inputs = None

  # get the output of the decoder layers
  outputs = None

  # declare the inputs and outputs of the model
  model = None

  return model

In [79]:
def kl_reconstruction_loss(mu, sigma):
  """ Computes the Kullback-Leibler Divergence (KLD)
  Args:
    mu -- mean
    sigma -- standard deviation

  Returns:
    KLD loss
  """
  kl_loss = 1 + sigma - tf.square(mu) - tf.math.exp(sigma)
  kl_loss = tf.reduce_mean(kl_loss) * -0.5

  return kl_loss

In [80]:
def vae_model(encoder, decoder, input_shape):
  """Defines the VAE model
  Args:
    encoder -- the encoder model
    decoder -- the decoder model
    input_shape -- shape of the dataset batch

  Returns:
    the complete VAE model
  """

  # set the inputs
  inputs = tf.keras.layers.Input(shape=input_shape)

  # get mu, sigma, and z from the encoder output
  mu, sigma, z = encoder(inputs)

  # get reconstructed output from the decoder
  reconstructed = decoder(z)

  # define the inputs and outputs of the VAE
  model = tf.keras.Model(inputs=inputs, outputs=reconstructed)

  # add the KL loss
  kl_loss = kl_reconstruction_loss(mu, sigma)
  model.add_loss(kl_loss)

  return model, kl_loss

In [81]:
def get_models(input_shape, latent_dim):
  """Returns the encoder, decoder, and vae models"""
  encoder, shape = encoder_model(latent_dim=latent_dim, input_shape=input_shape)
  decoder = decoder_model(latent_dim=latent_dim, shape=shape)
  vae, kl_loss = vae_model(encoder, decoder, input_shape=input_shape)
  return encoder, decoder, vae, kl_loss

In [82]:
encoder, decoder, vae, kl_loss = get_models(input_shape=(max_len), latent_dim=latent_dim)



In [83]:
optimizer = tf.keras.optimizers.Adam()
def custom_loss(y_true, y_pred):
    print(y_true)
    print(y_pred)
    flattened_inputs = tf.cast(tf.reshape(y_true, shape=[-1]), dtype=tf.float32)
    flattened_outputs = tf.cast(tf.reshape(tf.math.argmax(y_pred, axis=2), shape=[-1]), dtype=tf.float32)
    bce_loss = tf.keras.losses.BinaryCrossentropy()(flattened_inputs, flattened_outputs) * max_len * batch_size
    total_loss = bce_loss + kl_loss
    return total_loss

In [84]:
vae.compile(optimizer=optimizer, loss=custom_loss)


In [85]:
vae.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, 15)]                 0         []                            
                                                                                                  
 model_3 (Functional)        [(None, 32),                 1376132   ['input_6[0][0]']             
                              (None, 32),                                                         
                              (None, 32)]                                                         
                                                                                                  
 model_4 (Functional)        (None, 15, 12001)            1213633   ['model_3[0][2]']             
                                                                                            

In [88]:
# # Training loop.

epochs = 2

for epoch in range(epochs):
  print('Start of epoch %d' % (epoch + 1,))

  # iterate over the batches of the dataset.
  for step, (x_train, x_train) in enumerate(sent_generator(batch_size/2)):
    with tf.GradientTape() as tape:

      # feed a batch to the VAE model
      reconstructed = vae(tf.constant(x_train))
      # add KLD regularization loss
      loss = vae.losses

    # get the gradients and update the weights
    grads = tape.gradient(loss, vae.trainable_weights)
    optimizer.apply_gradients(
    (grad, var)
    for (grad, var) in zip(grads, vae.trainable_variables)
    if grad is not None
    )
    loss_metric = tf.keras.metrics.Mean()
    # compute the loss metric
    loss_metric(loss)

    # display outputs every 100 steps
    if step % 100 == 0:
      print('Epoch: %s step: %s mean loss = %s' % (epoch + 1, step, loss_metric.result().numpy()))
    if step % 1000 == 0 and step != 0:
      break

Start of epoch 0
Epoch: 1 step: 0 mean loss = 0.0041612466
Epoch: 1 step: 100 mean loss = 2.8871e-08
Epoch: 1 step: 200 mean loss = 5.525723e-07
Epoch: 1 step: 300 mean loss = 1.6713328e-06
Epoch: 1 step: 400 mean loss = 1.0272488e-08
Epoch: 1 step: 500 mean loss = 3.7252903e-09
Epoch: 1 step: 600 mean loss = 9.450596e-07
Epoch: 1 step: 700 mean loss = 2.2117048e-07
Epoch: 1 step: 800 mean loss = 9.313226e-10
Epoch: 1 step: 900 mean loss = 2.7939677e-09
Epoch: 1 step: 1000 mean loss = 2.7939677e-09
Start of epoch 1
Epoch: 2 step: 0 mean loss = 9.313226e-09
Epoch: 2 step: 100 mean loss = 7.450581e-09
Epoch: 2 step: 200 mean loss = 3.7252903e-09
Epoch: 2 step: 300 mean loss = 5.5879354e-09
Epoch: 2 step: 400 mean loss = 9.313226e-10
Epoch: 2 step: 500 mean loss = 5.5879354e-09
Epoch: 2 step: 600 mean loss = 1.540035e-07
Epoch: 2 step: 700 mean loss = 1.0775402e-08
Epoch: 2 step: 800 mean loss = -9.313226e-10
Epoch: 2 step: 900 mean loss = 9.313226e-09
Epoch: 2 step: 1000 mean loss = 4.65

### Project and sample sentences from the latent space
Now we build an encoder model model that takes a sentence and projects it on the latent space and a decoder model that goes from the latent space back to the text representation

### Test on validation sentences

In [89]:
index2word = {v: k for k, v in word_index.items()}
sent_encoded = encoder.predict(test[:10000], batch_size = 16)
x_test_reconstructed = decoder.predict(sent_encoded[0])

sent_idx = 672
reconstructed_indexes = np.apply_along_axis(np.argmax, 1, x_test_reconstructed[sent_idx])
#np.apply_along_axis(np.max, 1, x_test_reconstructed[sent_idx])
#np.max(np.apply_along_axis(np.max, 1, x_test_reconstructed[sent_idx]))
word_list = list(np.vectorize(index2word.get)(reconstructed_indexes))
word_list
original_sent = list(np.vectorize(index2word.get)(test[sent_idx]))
original_sent



[None,
 'is',
 'ios',
 'supposed',
 'to',
 'clear',
 'out',
 'your',
 'music',
 'library',
 'when',
 'you',
 'cross',
 'national',
 'borders']