In [1]:
import os
import json
import numpy as np
import tensorflow as tf
from scipy import spatial
import tensorflow_addons as tfa
# tf.compat.v1.disable_eager_execution()
# tf.config.experimental_run_functions_eagerly(True)
from sklearn.model_selection import train_test_split

In [2]:
# --- Optimisation settings (epoch and batch_size are not referenced in the visible cells) ---
epoch = 500
lr = 0.0005  # learning rate handed to the Adam optimizer
batch_size = 256

# --- Model sizes ---
intr_dim = 256  # base number of LSTM units in the encoder/decoder layers
latent_dim = int(intr_dim * 0.5)  # latent vector is half the LSTM width

droprate = 0.2  # shared by the Dropout layers and the LSTMs' recurrent_dropout
kl_weight = 0.01  # multiplier applied to the KL term of the loss

max_length = 50 #time steps
emb_dim = 300  # embedding width; also selects which glove.6B.<dim>d.txt file is read

# --- Data location and size ---
BASE_DIR = './annotations/'
use_all_data = True  # False keeps only the first num_sent captions of each split
num_sent = 10

In [3]:
def load_data(dataset, use_all_data, num_sent, BASE_DIR):
    """Load captions from a COCO-style annotation file and add sentence markers.

    Args:
        dataset: Split selector. Any value containing 'train' reads
            ``captions_train2014.json``; anything else reads
            ``captions_val2014.json``.
        use_all_data: When true, every caption is returned; otherwise only the
            first ``num_sent`` captions are kept.
        num_sent: Number of captions to keep when ``use_all_data`` is false.
        BASE_DIR: Directory containing the annotation files. A trailing path
            separator is optional.

    Returns:
        list[str]: Captions formatted as ``"<start> ... <end>"``.
    """
    filename = 'captions_train2014.json' if 'train' in dataset else 'captions_val2014.json'
    # os.path.join tolerates BASE_DIR with or without a trailing separator.
    path = os.path.join(BASE_DIR, filename)

    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    captions = [f"<start> {c['caption']} <end>" for c in annotations['annotations']]

    if use_all_data:
        print('Total data size: ', len(captions), flush=True)
    else:
        captions = captions[:num_sent]
        print('Temp data size: ', len(captions), flush=True)

    return captions

def create_emb_ind(glove_path, emb_dim):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Args:
        glove_path: Directory containing ``glove.6B.<emb_dim>d.txt``. A trailing
            path separator is optional.
        emb_dim: Embedding width; used only to build the file name.

    Returns:
        dict[str, np.ndarray]: Maps each word (first token of a line) to a
        float32 vector built from the remaining tokens of that line.
    """
    total_path = os.path.join(glove_path, 'glove.6B.' + str(emb_dim) + 'd.txt')
    embeddings_index = {}
    # The context manager closes the file even if a line fails to parse.
    with open(total_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to index.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Found %s word vectors.' % len(embeddings_index), flush=True)
    return embeddings_index

def make_emb_mat(embeddings_index, total_words, emb_dim, word_index, dataset, save,
                 save_dir='./annotations/'):
    """Build the embedding matrix whose row ``i`` is the vector of the word with index ``i``.

    Args:
        embeddings_index: Mapping from word to embedding vector.
        total_words: Number of rows in the matrix; words whose index is
            ``>= total_words`` are skipped.
        emb_dim: Embedding width (number of columns).
        word_index: Mapping from word to integer row index.
        dataset: Base name of the ``.npy`` file written when ``save`` is true.
        save: When true, the matrix is saved to ``<save_dir>/<dataset>.npy``.
        save_dir: Output directory for the saved matrix. Defaults to the
            location that was previously hard-coded.

    Returns:
        np.ndarray: Array of shape ``(total_words, emb_dim)``. Words missing
        from ``embeddings_index`` receive the 'unk' vector; if there is no
        'unk' vector either, the row stays all-zero instead of becoming NaN.
    """
    glove_embedding_matrix = np.zeros((total_words, emb_dim))
    # Look the fallback vector up once rather than once per out-of-vocabulary word.
    unk_vector = embeddings_index.get('unk')
    for word, i in word_index.items():
        if i >= total_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # if words not found in embedding index will be the word embedding of 'unk'
            embedding_vector = unk_vector
        if embedding_vector is not None:
            glove_embedding_matrix[i] = embedding_vector

    print('Null word embeddings: %d' % np.sum(np.sum(glove_embedding_matrix, axis=1) == 0), flush=True)
    print('Check Null: ', np.isnan(np.sum(glove_embedding_matrix)), flush=True)
    print('Emb vector shape: ', glove_embedding_matrix.shape, flush=True)

    if save:
        np.save(os.path.join(save_dir, dataset + '.npy'), glove_embedding_matrix)
    return glove_embedding_matrix


# In[3]:


# Load both caption splits and pool them; the pooled list is what the
# tokenizer is fitted on and what gets converted to padded sequences.
train_captions = load_data('train', use_all_data, num_sent, BASE_DIR)
val_captions = load_data('val', use_all_data, num_sent, BASE_DIR)
joint_list = train_captions + val_captions
print(len(joint_list), flush=True)


# In[4]:


# The filter string omits '<' and '>' so the <start>/<end> markers survive
# tokenisation as single tokens; out-of-vocabulary words map to "unk".
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="unk", filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(joint_list)

# Register an explicit '<pad>' token at index 0 (the value the post-padding
# below fills with, and the value the embedding's mask_zero treats as masked).
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

word_index = tokenizer.word_index
# Reverse lookup (index -> word) used when decoding predictions back to text.
index2word = {v: k for k, v in word_index.items()}
print('Found %s unique tokens' % len(word_index), flush=True)

sequences = tokenizer.texts_to_sequences(joint_list)

# Pad every sequence at the end up to max_length time steps.
data_1 = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length, padding='post')
print('Shape of data tensor:', data_1.shape, flush=True)

# Counts the '<pad>' entry added above as well.
vocab_size = len(word_index)


# In[5]:


# Toggle: True rebuilds the embedding matrix from the GloVe text file and saves
# it; False loads the previously saved matrix from BASE_DIR + 'train.npy'.
create = False
if create:
    embeddings_index = create_emb_ind(BASE_DIR, emb_dim)
    glove_embedding_matrix = make_emb_mat(embeddings_index, vocab_size, emb_dim, word_index, 'train', True)
else:
    # NOTE(review): the cached file must match the current vocabulary size and
    # emb_dim - nothing here verifies that.
    glove_embedding_matrix = np.load(BASE_DIR + 'train.npy')
    print(glove_embedding_matrix.shape, flush=True)


# In[6]:


# Shuffled split of the pooled data with a fixed seed. The held-out fraction is
# sized like the validation captions (len(val_captions)/len(joint_list)), but
# because the pooled data is shuffled the rows are not the original val split.
x_train, x_val = train_test_split(data_1, shuffle=True, test_size=len(val_captions)/len(joint_list), random_state=28)
print(x_train.shape, x_val.shape, flush=True)

Total data size:  414113
Total data size:  202654
616767
Found 28444 unique tokens
Shape of data tensor: (616767, 50)
(28444, 300)
(414113, 50) (202654, 50)


In [4]:
####################################################################################

class Sampling(tf.keras.layers.Layer):
    """Reparameterisation step: draws ``z = z_mean + exp(0.5 * z_log_var) * eps``.

    ``eps`` is standard-normal noise with the same shape as ``z_mean``, so the
    layer does not depend on any module-level dimension constant.
    """
    def __init__(self):
        super(Sampling, self).__init__()
        # Let an incoming mask pass through this layer unchanged.
        self.supports_masking = True

    def call(self, inputs):
        """Sample a latent vector.

        Args:
            inputs: Pair ``(z_mean, z_log_var)`` of equally shaped tensors.

        Returns:
            Tensor shaped like ``z_mean``.
        """
        z_mean, z_log_var = inputs
        # Derive the noise shape from the input itself instead of the global latent_dim.
        epsilon = tf.random.normal(tf.shape(z_mean), dtype=z_mean.dtype)
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

class custom_lstm(tf.keras.layers.Layer):
    """Sentence encoder block: a bidirectional LSTM followed by dropout.

    Args:
        intr_dim: Units per LSTM direction. The two directions are concatenated,
            so the output is ``2 * intr_dim`` wide.
        droprate: Used as the LSTM ``recurrent_dropout`` and as the rate of the
            trailing Dropout layer.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """
    def __init__(self, intr_dim, droprate, **kwargs):
        # Initialise the base Layer before assigning sub-layers so that Keras
        # attribute tracking is set up when they are attached to ``self``.
        super(custom_lstm, self).__init__(**kwargs)
        self.bi_lstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(intr_dim, recurrent_dropout=droprate, return_sequences=False),
            merge_mode='concat')
        self.drop_layer = tf.keras.layers.Dropout(droprate)

    def call(self, inputs):
        """Encode a sequence of embeddings into one fixed-size vector."""
        h = self.bi_lstm(inputs)
        h = self.drop_layer(h)
        return h

    def compute_mask(self, inputs, mask=None):
        # Hand the incoming mask on unchanged so layers further down the graph
        # can still see it.
        return mask
    
# Encoder graph: token ids -> frozen GloVe embeddings -> BiLSTM -> (z_mean, z_log_var) -> sampled z.
x = tf.keras.layers.Input(shape=(max_length,))
# Embedding weights come from the precomputed matrix and are not trained;
# mask_zero=True marks index 0 (padding) as masked.
embed_layer = tf.keras.layers.Embedding(vocab_size, emb_dim, input_length=max_length, weights=[glove_embedding_matrix], 
                                        trainable=False, mask_zero=True)
encoder_layer = custom_lstm(intr_dim, droprate)

h = embed_layer(x)
h = encoder_layer(h)
# Two parallel linear heads parameterise the approximate posterior.
z_mean = tf.keras.layers.Dense(latent_dim, name='z_mean')(h)
z_log_var = tf.keras.layers.Dense(latent_dim, name='z_log_var')(h)
z = Sampling()([z_mean, z_log_var])

####################################################################################

class custom_decoder(tf.keras.layers.Layer):
    """Decoder block: repeats the latent vector over time and maps it to vocabulary logits.

    Args:
        vocab_size: Width of the per-time-step logit vector.
        intr_dim: Units of the first LSTM; the second LSTM uses ``2 * intr_dim``.
        max_length: Number of time steps the latent vector is repeated for.
        droprate: Used as ``recurrent_dropout`` of both LSTMs and as the rate of
            both Dropout layers.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """
    def __init__(self, vocab_size, intr_dim, max_length, droprate, **kwargs):
        # Initialise the base Layer before assigning sub-layers so that Keras
        # attribute tracking is set up when they are attached to ``self``.
        super(custom_decoder, self).__init__(**kwargs)
        self.rpv = tf.keras.layers.RepeatVector(max_length)
        self.lstm_layer_1 = tf.keras.layers.LSTM(intr_dim, return_sequences=True, recurrent_dropout=droprate)
        self.droplayer_2 = tf.keras.layers.Dropout(droprate)
        self.lstm_layer_2 = tf.keras.layers.LSTM(intr_dim*2, return_sequences=True, recurrent_dropout=droprate)
        self.droplayer_3 = tf.keras.layers.Dropout(droprate)
        # Linear activation: the layer emits unnormalised logits, not probabilities.
        self.decoded_logits = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size, activation='linear'))

    def call(self, inputs):
        """Decode latent vectors into per-time-step vocabulary logits."""
        h = self.rpv(inputs)
        h = self.lstm_layer_1(h)
        h = self.droplayer_2(h)
        h = self.lstm_layer_2(h)
        h = self.droplayer_3(h)
        decoded = self.decoded_logits(h)
        return decoded

    def compute_mask(self, inputs, mask=None):
        # Hand the incoming mask on unchanged so the loss layer can use it.
        return mask
    
# Single decoder instance; it is applied here to the sampled z and reused
# later for the stand-alone generator model, so both share the same weights.
decoder_layer = custom_decoder(vocab_size, intr_dim, max_length, droprate)
decoded_logits = decoder_layer(z)

####################################################################################

class ELBO_Layer(tf.keras.layers.Layer):
    """Registers the VAE objective (reconstruction + weighted KL) via ``add_loss``.

    The layer reads the module-level tensors ``x``, ``z_mean`` and ``z_log_var``
    and the constant ``kl_weight`` rather than receiving them as inputs, so it
    only works inside the graph built in this notebook. Its return value is a
    dummy tensor; the layer exists for its loss side effect.
    """
    def __init__(self, **kwargs):
        super(ELBO_Layer, self).__init__(**kwargs)
    
    def call(self, inputs, mask=None):
        # `inputs` are the decoder logits. `mask` is cast directly, so the layer
        # fails if no mask reaches it (mask=None).
        broadcast_float_mask = tf.cast(mask, "float32")
        # Targets are the model's own input token ids (autoencoding).
        labels = tf.cast(x, tf.int32)
        # Mask-weighted cross-entropy, not averaged over time or batch, then
        # summed over axis 1 to give one value per sentence.
        reconstruction_loss = tf.reduce_sum(tfa.seq2seq.sequence_loss(inputs, labels, 
                                                                      weights=broadcast_float_mask,
                                                                      average_across_timesteps=False,
                                                                      average_across_batch=False), axis=1)
        
        # KL divergence between N(z_mean, exp(z_log_var)) and N(0, I), per sentence.
        kl_loss = - 0.5 * tf.reduce_sum(1 + z_log_var - tf.math.square(z_mean) - tf.math.exp(z_log_var), axis=1)
        total_loss = tf.reduce_mean(reconstruction_loss + kl_weight * kl_loss)
        self.add_loss(total_loss, inputs=[x, inputs])
        # Dummy output shaped like the input ids; the compiled loss on it is zero_loss.
        return tf.ones_like(x)
    
    def compute_mask(self, inputs, mask=None):
        # Pass the incoming mask through unchanged.
        return mask
        
# Attach the loss layer to the decoder logits; its output is a placeholder
# tensor (all ones), hence the "fake" in the name.
elbo_layer = ELBO_Layer()
fake_decoded_prob = elbo_layer(decoded_logits)

####################################################################################

def zero_loss(y_true, y_pred):
    """Placeholder Keras loss: returns zeros shaped like ``y_pred``.

    ``y_true`` is ignored; the real objective is registered by the ELBO layer.
    """
    nothing = tf.zeros_like(y_pred)
    return nothing

def kl_loss(x, fake_decoded_prob):
    """Metric reporting the batch mean of the weighted KL term.

    Both arguments are ignored (they exist to satisfy the Keras metric
    signature); the value is computed from the module-level ``z_mean``,
    ``z_log_var`` and ``kl_weight``.
    """
    per_dimension = 1 + z_log_var - tf.math.square(z_mean) - tf.math.exp(z_log_var)
    per_sentence = - 0.5 * tf.reduce_sum(per_dimension, axis=1)
    weighted = kl_weight * per_sentence
    return tf.reduce_mean(weighted)

# Full model from token ids to the ELBO layer's placeholder output. It is only
# used here to restore weights, so it is marked non-trainable.
vae = tf.keras.models.Model(x, fake_decoded_prob, name='VAE', trainable=False)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
opt = tf.keras.optimizers.Adam(learning_rate=lr)
# The compiled loss is a zero placeholder (the objective comes from add_loss
# inside ELBO_Layer); kl_loss is reported as a metric.
vae.compile(optimizer=opt, loss=[zero_loss], metrics=[kl_loss])
vae.summary()
vae.trainable = False

# Debug printout: show, layer by layer, which mask tensor enters and leaves,
# to check that the padding mask reaches the loss layer.
for i, l in enumerate(vae.layers):
    print(f'layer {i}: {l}', flush=True)
    print(f'has input mask: {l.input_mask}', flush=True)
    print(f'has output mask: {l.output_mask}', flush=True)

####################################################################################

# Restore a previously saved checkpoint; the file must exist under
# ./<directory>/ relative to the working directory.
name = 'weights.207-3.54.h5'
directory = 'output4'
vae.load_weights('./' + directory + '/' + name)
print('model weights loaded', flush=True)

Model: "VAE"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 50, 300)      8533200     input_1[0][0]                    
__________________________________________________________________________________________________
custom_lstm (custom_lstm)       (None, 512)          1140736     embedding[0][0]                  
__________________________________________________________________________________________________
z_mean (Dense)                  (None, 128)          65664       custom_lstm[0][0]                
________________________________________________________________________________________________

In [5]:
# Build a model that projects sentences into the latent space.
# Note: the output is the *sampled* z (output of the Sampling layer), not
# z_mean, so repeated predictions for the same sentence differ by random noise.
encoder = tf.keras.models.Model(x, z, name='encoder')
encoder.summary()

# Debug printout of the mask entering and leaving each encoder layer.
for i, l in enumerate(encoder.layers):
    print(f'layer {i}: {l}', flush=True)
    print(f'has input mask: {l.input_mask}', flush=True)
    print(f'has output mask: {l.output_mask}', flush=True)

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 50, 300)      8533200     input_1[0][0]                    
__________________________________________________________________________________________________
custom_lstm (custom_lstm)       (None, 512)          1140736     embedding[0][0]                  
__________________________________________________________________________________________________
z_mean (Dense)                  (None, 128)          65664       custom_lstm[0][0]                
____________________________________________________________________________________________

In [6]:
# Build a generator that decodes latent vectors into per-time-step vocabulary
# logits. It reuses decoder_layer, so it shares the weights loaded into the VAE.
ins = tf.keras.layers.Input(shape=(latent_dim,))
x_logits = decoder_layer(ins)
generator = tf.keras.models.Model(ins, x_logits, name='decoder')
generator.summary()

# Debug printout of the mask entering and leaving each generator layer.
for i, l in enumerate(generator.layers):
    print(f'layer {i}: {l}', flush=True)
    print(f'has input mask: {l.input_mask}', flush=True)
    print(f'has output mask: {l.output_mask}', flush=True)

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
custom_decoder (custom_decod (None, 50, 28444)         16560924  
Total params: 16,560,924
Trainable params: 0
Non-trainable params: 16,560,924
_________________________________________________________________
layer 0: <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x2b370c740d50>
has input mask: None
has output mask: None
layer 1: <__main__.custom_decoder object at 0x2b370c457ad0>
has input mask: Tensor("embedding/NotEqual:0", shape=(None, 50), dtype=bool)
has output mask: Tensor("embedding/NotEqual:0", shape=(None, 50), dtype=bool)


In [7]:
def TemporalSoftmax(inputs, mask):
    """Masked softmax that normalises over axis 1 of ``inputs``.

    Args:
        inputs: Score tensor; axis 1 is the axis being normalised.
            # assumes shape (batch, time, features) - TODO confirm with callers
        mask: Tensor castable to float32 with one entry per (batch, axis-1)
            position; zero entries are excluded from the normalisation.

    Returns:
        Tensor shaped like ``inputs``. Unmasked positions sum to 1 along
        axis 1; masked positions (and slices with no unmasked entry) are 0.
    """
    broadcast_float_mask = tf.expand_dims(tf.cast(mask, "float32"), -1)
    inputs_exp = tf.math.exp(inputs) * broadcast_float_mask
    # The denominator must sum the (masked) exponentials, not the raw scores.
    inputs_sum = tf.reduce_sum(inputs_exp, axis=1, keepdims=True)
    # divide_no_nan returns 0 where every position is masked (0 / 0).
    out_prob = tf.math.divide_no_nan(inputs_exp, inputs_sum)
    return out_prob

def create_mask(inputs):
    """Return a float32 tensor holding 1.0 where ``inputs`` is non-zero and 0.0 elsewhere."""
    is_nonzero = tf.cast(inputs, tf.dtypes.bool)
    return tf.cast(is_nonzero, tf.float32)

In [8]:
#test on a validation sentence

# Round-trip the first 20 validation sentences through encoder and generator
# and print the original next to the reconstruction.
# NOTE(review): clean_sent is not defined in the visible part of this notebook;
# it must exist in the kernel for this cell to run - confirm where it comes from.
for i in range(20):
    sent_idx = i
    
    # Two rows are encoded but only the first reconstruction is used below.
    sent_encoded = encoder.predict(x_val[sent_idx:sent_idx+2,:])

    x_test_reconstructed = tf.nn.softmax(generator.predict(sent_encoded, batch_size = 1))
    
    # Greedy decoding: most probable token at every time step.
    reconstructed_indexes = clean_sent(np.apply_along_axis(np.argmax, 1, x_test_reconstructed[0]), 4)

    # Map ids back to words and drop padding before printing.
    original_sent = [i for i in np.vectorize(index2word.get)(x_val[sent_idx]) if '<pad>' not in i]
    print('orig:', ' '.join(original_sent))

    word_list = list(np.vectorize(index2word.get)(reconstructed_indexes))
    print('pred:', ' '.join(word_list))

    print('\n')

orig: <start> a man walks with a young girl hand and hand as she carries an umbrella <end>
pred: <start> a man walks with a young girl hand and hand as she carries an umbrella <end>


orig: <start> three motorcyclists and their bikes stand on the pavement <end>
pred: <start> three motorcyclists and their bikes stand on the pavement <end>


orig: <start> a desk with two computer monitors and a mouse <end>
pred: <start> a desk with two computer monitors and a mouse <end>


orig: <start> a pigeon sits on a curb on the side of a road <end>
pred: <start> a pigeon sits on a curb on the side of a road <end>


orig: <start> two people flying a kite over a snowy field <end>
pred: <start> two people flying a kite over a snowy field <end>


orig: <start> two men excited about the very large pieces of pizza they are holding <end>
pred: <start> two men excited about the very large piece of them they are holding <end>


orig: <start> two men on motorcycles in front of building <end>
pred: <start> tw

In [9]:
# Latent codes of the first 10000 validation sentences; this is the pool that
# find_similar_encoding searches.
sent_encoded = encoder.predict(x_val[:10000])

In [10]:
def sent_parse(sentence):
    """Convert a list of sentence strings into post-padded id sequences.

    Uses the module-level ``tokenizer`` and pads each sequence to
    ``max_length`` entries.
    """
    token_ids = tokenizer.texts_to_sequences(sentence)
    return tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=max_length, padding='post')


# input: encoded sentence vector
# output: encoded sentence vector in dataset with highest cosine similarity
def find_similar_encoding(sent_vect, encodings=None):
    """Return the stored encoding with the second-highest cosine similarity.

    The runner-up is returned rather than the best match; presumably this
    skips the query itself when it is part of the pool - confirm intent.

    Args:
        sent_vect: Query vector. Any shape holding a single vector is accepted
            (e.g. ``(d,)`` or ``(1, d)``); it is flattened before comparison.
        encodings: 2-D array-like of candidate vectors, one per row. Defaults
            to the module-level ``sent_encoded``.

    Returns:
        np.ndarray: The selected row of ``encodings``.

    Raises:
        IndexError: If fewer than two candidates are available.
    """
    if encodings is None:
        encodings = sent_encoded
    candidates = np.asarray(encodings)
    query = np.asarray(sent_vect).reshape(1, -1)
    # One vectorised call replaces the per-row Python loop; cdist returns
    # cosine *distance*, so similarity is 1 - distance.
    similarities = 1 - spatial.distance.cdist(query, candidates, metric='cosine')[0]
    runner_up = similarities.argsort()[-2]
    return candidates[runner_up]


# input: two points, integer n
# output: n equidistant points on the line between the input points (inclusive)
def shortest_homology(point_one, point_two, num):
    """Return ``num`` evenly spaced points on the segment from ``point_one`` to ``point_two``.

    Both end points are included. The result is a Python list with one array
    per interpolation step.
    """
    direction = point_two - point_one
    fractions = np.linspace(0, 1, num, endpoint = True)
    return [point_one + t * direction for t in fractions]


# input: original dimension sentence vector
# output: sentence text
def print_latent_sentence(sent_vect, name):
    """Decode one latent vector with the generator and print the sentence.

    The most probable token is taken at every time step, the id sequence is
    passed through ``clean_sent``, '<pad>' tokens are dropped and the words are
    printed after the ``name`` prefix.
    """
    latent = np.reshape(sent_vect,[1,latent_dim])
    probabilities = np.reshape(tf.nn.softmax(generator.predict(latent)), [max_length, vocab_size])
    best_ids = np.argmax(probabilities, axis=1)
    reconstructed_indexes = clean_sent(best_ids, 4)
    to_word = np.vectorize(index2word.get)
    visible_words = [w for w in list(to_word(reconstructed_indexes)) if w not in ['<pad>']]
    print(name, ' '.join(visible_words), flush=True)
       

def new_sents_interp(sent1, sent2, n, name):
    """Print ``n`` sentences decoded along the straight line between two sentences' latent codes.

    Each of ``sent1`` and ``sent2`` is a list of sentence strings as accepted
    by ``sent_parse``; ``name`` is the prefix printed before every sentence.
    """
    padded_pair = [sent_parse(s) for s in (sent1, sent2)]
    start_code, end_code = [encoder.predict(p, batch_size = 1) for p in padded_pair]
    for latent_point in shortest_homology(start_code, end_code, n):
        print_latent_sentence(latent_point, name)

In [11]:
# Example 1: encode a hand-written sentence, decode it back, and decode its
# runner-up nearest neighbour among the stored validation encodings.
sentence1=['<start> cat sitting on top of a table <end>']
mysent = sent_parse(sentence1)
print(mysent)

mysent_encoded = encoder.predict(mysent, batch_size = 1)
print_latent_sentence(mysent_encoded, 'example1: ')
print_latent_sentence(find_similar_encoding(mysent_encoded), 'similar sent1: ')

[[ 3 49 14  5 33  6  2 25  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]]
example1:  <start> cat sitting on top of a table <end>
similar sent1:  <start> a striped cat sitting on top of a table <end>


In [12]:
# Example 2: same round trip and neighbour lookup for a second sentence.
sentence2=['<start> a dog relaxing with a cat <end>']
mysent2 = sent_parse(sentence2)
print(mysent2)

mysent_encoded2 = encoder.predict(mysent2, batch_size = 1)
print_latent_sentence(mysent_encoded2, 'example2: ')
print_latent_sentence(find_similar_encoding(mysent_encoded2), 'similar sent2: ')

[[   3    2   47 1168    9    2   49    4    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
example2:  <start> a dog relaxing with a cat <end>
similar sent2:  <start> a train stopped at a train station <end>


In [13]:
# Decode 10 evenly spaced latent points on the straight line between the
# encodings of the two example sentences.
new_sents_interp(sentence1, sentence2, 10, 'interp: ')

interp:  <start> cat sitting on top of a table <end>
interp:  <start> cat sitting on top of a table <end>
interp:  <start> cat sitting on top of a table <end>
interp:  <start> cat sitting on top of a table <end>
interp:  <start> a cat sitting on a a <end>
interp:  <start> a dog sleeping with a a <end>
interp:  <start> a dog sleeping with a cat <end>
interp:  <start> a dog relaxing with a cat <end>
interp:  <start> a dog relaxing with a cat <end>
interp:  <start> a dog relaxing with a cat <end>


In [23]:
# Decode a random latent vector drawn from a standard normal distribution.
# NOTE(review): clean_sent is not defined in the visible part of this notebook.
random_sent = tf.random.normal([1, latent_dim,])
x_test_reconstructed = tf.nn.softmax(generator.predict(random_sent, batch_size = 1))

# Greedy decoding: most probable token at every time step.
reconstructed_indexes = clean_sent(np.apply_along_axis(np.argmax, 1, x_test_reconstructed[0]), 4)
print(reconstructed_indexes)

print()

# TODO: when a word repeats, condition the distribution of the next predicted
# word on the current word (i.e. restrict the predictions).
word_list = list(np.vectorize(index2word.get)(reconstructed_indexes))
print('pred:', ' '.join(word_list))

[3, 15, 224, 8, 213, 213, 1664, 5, 948, 5, 5, 589, 10, 589, 10, 2, 4]

pred: <start> an image in being being peeking on what on on reading and reading and a <end>
