# LSTM-VAE

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import keras
import codecs
import csv
import os
import random

from keras.layers import Bidirectional, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout, Masking
from keras.preprocessing.sequence import pad_sequences
from keras.layers.advanced_activations import ELU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from keras import metrics
from keras.models import Model
from scipy import spatial
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


## Import, clean, and encode input data

In [3]:
imported_data = pd.read_hdf("test_database.h5")
mofids = imported_data["MOFid"].tolist()
print(mofids[1])

[BiH2][BiH2].C1(=CC=C(C=C1)(C(O)=O))(C(O)=O).COc1cc((C(O)=O))c(OC)cc1(C(O)=O) MOFid-v1.bew.cat0;comment


In [4]:
def clean_inputs(mofid, cleaned_list):
    for i in range(len(mofid)):
        unclean = mofid[i]
        spaces_removed = unclean.replace(' ','')
        version_removed = spaces_removed.replace('MOFid-v1', '')
        comment_removed = version_removed.replace(';comment', '')
        clean = comment_removed.replace('.cat0', '')
        cleaned_list.append(clean)

clean_mofids = []
clean_inputs(mofids, clean_mofids)
max_mofid_length = max([len(mofid) for mofid in clean_mofids])
print(clean_mofids[1])

[BiH2][BiH2].C1(=CC=C(C=C1)(C(O)=O))(C(O)=O).COc1cc((C(O)=O))c(OC)cc1(C(O)=O).bew


In [5]:
tk = Tokenizer(num_words=300, char_level=True, lower=False, oov_token='UNK')
tk.fit_on_texts(clean_mofids)
char_index = tk.word_index
vocab_size = len(tk.word_index)
index2char = {v: k for k, v in char_index.items()}

print("Found %s unique tokens" % vocab_size)
print(char_index)

full_sequences = tk.texts_to_sequences(clean_mofids)
full_data_padded = pad_sequences(full_sequences, maxlen=max_mofid_length, padding='post')

print('Shape of data tensor:', full_data_padded.shape)

# separate off a validation set
train_data = np.array(full_data_padded, dtype="int")
train_split, val_split = train_test_split(train_data, test_size = 0.2, random_state = 42)

embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))

for char, i in tk.word_index.items():
    onehot = np.zeros(vocab_size)
    onehot[i-1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)

print('Shape of embedding weights tensor:',embedding_weights.shape)
#print(embedding_weights)

Found 52 unique tokens
{'UNK': 1, '(': 2, ')': 3, 'O': 4, 'C': 5, 'c': 6, '=': 7, '1': 8, '[': 9, ']': 10, '.': 11, 'n': 12, 'F': 13, 'N': 14, '2': 15, '+': 16, 'l': 17, '@': 18, 'u': 19, '3': 20, 'Z': 21, '#': 22, '4': 23, '6': 24, 'q': 25, 'o': 26, 'H': 27, 'd': 28, 'P': 29, 'i': 30, 'e': 31, '-': 32, 'R': 33, 'S': 34, '5': 35, 'b': 36, 'M': 37, 'z': 38, 'h': 39, 'p': 40, 'g': 41, 'r': 42, 's': 43, 'B': 44, 't': 45, 'T': 46, 'W': 47, 'w': 48, 'j': 49, 'a': 50, 'x': 51, 'y': 52}
Shape of data tensor: (10651, 485)
Shape of embedding weights tensor: (53, 52)


In [None]:
# Vectorize the data.
#input_texts = []
#input_characters = set()
#input_characters.add('UNK')

#for i in range(len(clean_mofids)):
#    input_text = clean_mofids[i]
#    input_texts.append(input_text)
#    for char in input_text:
#        if char not in input_characters:
#            input_characters.add(char)

#input_characters = sorted(list(input_characters))
#num_encoder_tokens = len(input_characters)
#max_encoder_seq_length = max([len(txt) for txt in input_texts])

#print("Number of samples:", len(input_texts))
#print("Number of unique input tokens:", num_encoder_tokens)
#print("Max sequence length for inputs:", max_encoder_seq_length)

#input_token_index = dict([(char, i+1) for i, char in enumerate(input_characters)])
#print(input_token_index)

#encoder_input_data = np.zeros(
#    (len(input_texts), max_encoder_seq_length, num_encoder_tokens+1), dtype="int"
#)

#print(input_texts[0])

#for i, input_text in enumerate(input_texts):
#    for t, char in enumerate(input_text):
#        encoder_input_data[i, t, input_token_index[char]] = 1.0

#np.set_printoptions(threshold=np.inf)
#print(len(encoder_input_data[0][0]))
#np.set_printoptions(threshold=1000)

#train_data, validation_data = train_test_split(encoder_input_data, test_size = 0.2, random_state = 42)

A LSTM network expects the input to be in the form [samples, time steps, features] where samples is the number of data points we have, time steps is the number of time-dependent steps that are there in a single data point, features refers to the number of variables we have for the corresponding true value in Y. 

## Build the LSTM-VAE model

In [13]:
batch_size = 71
max_len = max_mofid_length
embedding_size = vocab_size
intermediate_dim = vocab_size+1
latent_dim = 26
epsilon_std = 1.0
kl_weight = 0.01
num_sampled=500

x = Input(shape=(max_len,))
x_embed = Embedding(vocab_size+1, embedding_size, mask_zero=True, input_length=max_len, weights=[embedding_weights])(x)
h = Bidirectional(LSTM(intermediate_dim, return_sequences=False, recurrent_dropout=0.2), merge_mode='concat')(x_embed)
#h = Dropout(0.2)(h)
#h = Dense(intermediate_dim, activation='linear')(h)
#h = act(h)
#h = Dropout(0.2)(h)
z_mean = Dense(latent_dim, name="z_mean")(h)
z_log_var = Dense(latent_dim, name="z_log_var")(h)


def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

repeated_context = RepeatVector(max_len)
decoder_h = LSTM(vocab_size, return_sequences=True, recurrent_dropout=0.2)
decoder_mean = Dense(vocab_size+1, activation='linear')#softmax is applied in the seq2seqloss by tf
h_decoded = decoder_h(repeated_context(z))
x_decoded_mean = decoder_mean(h_decoded)

# placeholder loss
def zero_loss(y_true, y_pred):
    return K.zeros_like(y_pred)

class CustomVariationalLayer(Layer):
    def __init__(self, **kwargs):
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)
        self.target_weights = tf.constant(np.ones((batch_size, max_len)), tf.float32)

    def vae_loss(self, x, x_decoded_mean):
        #xent_loss = K.sum(metrics.categorical_crossentropy(x, x_decoded_mean), axis=-1)
        labels = tf.cast(x, tf.int32)
        xent_loss = K.sum(tfa.seq2seq.sequence_loss(x_decoded_mean, labels, 
                                                     weights=self.target_weights,
                                                     average_across_timesteps=False,
                                                     average_across_batch=False), axis=-1)#,
                                                     #softmax_loss_function=softmax_loss_f), axis=-1)#,
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        xent_loss = K.mean(xent_loss)
        kl_loss = K.mean(kl_loss)
        return K.mean(xent_loss + kl_weight * kl_loss)

    def call(self, inputs):
        x = inputs[0]
        x_decoded_mean = inputs[1]
        print(x.shape, x_decoded_mean.shape)
        loss = self.vae_loss(x, x_decoded_mean)
        self.add_loss(loss, inputs=inputs)
        # we don't use this output, but it has to have the correct shape:
        return K.ones_like(x)

def kl_loss(x, x_decoded_mean):
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    kl_loss = kl_weight * kl_loss
    return kl_loss
    
loss_layer = CustomVariationalLayer()([x, x_decoded_mean])
vae = Model(x, [loss_layer])
opt = Adam(lr=0.01)

vae.compile(optimizer='adam', loss=[zero_loss], metrics=[kl_loss])
vae.summary()

(None, 485) (71, 485, 53)
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 485)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 485, 52)      2756        input_3[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 106)          44944       embedding_2[0][0]                
__________________________________________________________________________________________________
z_mean (Dense)                  (None, 26)           2782        bidirectional_2[0][0]            
__________________________________________________________________

## Train model

In [22]:
attempt = 4

def create_model_checkpoint(dir, model_name):
    filepath = dir + '/' + model_name + ".h5" 
    directory = os.path.dirname(filepath)
    try:
        os.stat(directory)
    except:
        os.mkdir(directory)
    checkpointer = ModelCheckpoint(filepath=filepath, verbose=1, save_best_only=True)
    return checkpointer

checkpointer = create_model_checkpoint('models', f'lstm_vae_attempt{attempt}_best_weights')

def batch_generator(dataset, chunk_size, epochs):
    train_batches = []
    for i in range(epochs):
        epoch = i
        train_batch = dataset[(epoch*chunk_size):((epoch+1)*chunk_size)]
        train_batches.append(train_batch)
    return train_batches

def val_batch_generator(dataset, chunk_size, epochs):
    val_batches = []
    for i in range(epochs):
        randint = random.randint(1, (len(dataset)-chunk_size))
        val_batch = dataset[(randint):(randint+chunk_size)]
        val_batches.append(val_batch)
    return val_batches

n_epoch=120

train_batches = batch_generator(dataset=train_split, chunk_size=batch_size, epochs=n_epoch)
val_batches = val_batch_generator(dataset=val_split, chunk_size=batch_size, epochs=n_epoch)

#n_steps =  int(len(batches[0])/batch_size)
n_steps = 30

for counter in range(n_epoch):
    print('-------epoch: ',counter,'--------')
    vae.fit(train_batches[counter], train_batches[counter],
                      shuffle=True,
                      steps_per_epoch=n_steps,
                      epochs=1,
                      validation_data=(val_batches[counter], val_batches[counter]), validation_steps=n_steps,
                      callbacks=[checkpointer])
    print(K.eval(vae.optimizer.lr))
    #K.set_value(vae.optimizer.lr, 0.01)

vae.save(f'models/lstm_vae_attempt{attempt}.h5')

-------epoch:  0 --------


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 71 samples, validate on 71 samples
Epoch 1/1

Epoch 00001: val_loss improved from inf to 27.48015, saving model to models/lstm_vae_attempt4_best_weights.h5
0.001
-------epoch:  1 --------
Train on 71 samples, validate on 71 samples
Epoch 1/1

Epoch 00001: val_loss improved from 27.48015 to 26.12427, saving model to models/lstm_vae_attempt4_best_weights.h5
0.001
-------epoch:  2 --------
Train on 71 samples, validate on 71 samples
Epoch 1/1

Epoch 00001: val_loss did not improve from 26.12427
0.001
-------epoch:  3 --------
Train on 71 samples, validate on 71 samples
Epoch 1/1

Epoch 00001: val_loss improved from 26.12427 to 24.49352, saving model to models/lstm_vae_attempt4_best_weights.h5
0.001
-------epoch:  4 --------
Train on 71 samples, validate on 71 samples
Epoch 1/1

Epoch 00001: val_loss improved from 24.49352 to 21.14740, saving model to models/lstm_vae_attempt4_best_weights.h5
0.001
-------epoch:  5 --------
Train on 71 samples, validate on 71 samples
Epoch 1/1

Epo

KeyboardInterrupt: 

## Test model

In [14]:
vae.load_weights('models/lstm_vae_best_weights.h5')

# build a model to project sentences on the latent space
encoder = Model(x, z_mean)
encoder.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 485)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 485, 52)           2756      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 106)               44944     
_________________________________________________________________
z_mean (Dense)               (None, 26)                2782      
Total params: 50,482
Trainable params: 50,482
Non-trainable params: 0
_________________________________________________________________


In [15]:
# build a generator that can sample sentences from the learned distribution
decoder_input = Input(shape=(latent_dim,))
_h_decoded = decoder_h(repeated_context(decoder_input))
_x_decoded_mean = decoder_mean(_h_decoded)
_x_decoded_mean = Activation('softmax')(_x_decoded_mean)
generator = Model(decoder_input, _x_decoded_mean)
generator.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 26)                0         
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 485, 26)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 485, 52)           16432     
_________________________________________________________________
dense_2 (Dense)              (None, 485, 53)           2809      
_________________________________________________________________
activation_2 (Activation)    (None, 485, 53)           0         
Total params: 19,241
Trainable params: 19,241
Non-trainable params: 0
_________________________________________________________________


In [9]:
index2char = {v: k for k, v in char_index.items()}
index2char[0] = '0' # padding value

mofid_index = 20
mofid_encoded = encoder.predict(val_split)
x_test_reconstructed = generator.predict(mofid_encoded, batch_size = 1)
reconstructed_indexes = np.apply_along_axis(np.argmax, 1, x_test_reconstructed[0])

print(reconstructed_indexes)

word_list = list(np.vectorize(index2char.get)(reconstructed_indexes))
word_list = ''.join(map(str, word_list))
print('Reconstructed MOFid: ', word_list)

#original_mofid = []
#for i in range(len(validation_data[mofid_index])):
#    original_mofid.append(index2char[np.argmax(random_val[mofid_index][i])])
#original_sent = list(np.vectorize(index2word.get)(random_val[mofid_index]))
#original_mofid = ''.join(map(str, original_mofid))
#print('Original MOFid: ',original_mofid)

[ 9 21 10 10 10 10 10  9  9  9  9  9  9  9  9  9 10 10 10 10  5  5  5  5
  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  2  2
  2  2  2  2  2  2  2  2  2  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0

In [24]:
#model = keras.models.load_model('models/lstm_vae_attempt3.h5', 
#                                custom_objects={'CustomVariationalLayer': CustomVariationalLayer})
vae.evaluate()


ValueError: If evaluating from data tensors, you should specify the `steps` argument.