In [16]:
import numpy as np
from nltk.tokenize import word_tokenize
from lstm_vae import create_lstm_vae, inference
import keras
import sys, time
from keras.callbacks import CSVLogger, Callback

from tensorflow import set_random_seed
# Seed TensorFlow's and NumPy's random number generators with a fixed value
# before any model or data code runs.
set_random_seed(1234)
np.random.seed(1234)


def get_text_data(data_path, num_samples=1000):
    """Load a text file and one-hot encode it at the word level.

    Every line of the file is lower-cased, split into tokens with
    ``word_tokenize`` and terminated with an ``"<end>"`` token.  The
    vocabulary is the sorted set of all tokens seen plus the tab character,
    which is used as the start marker of every decoder input sequence.

    Args:
        data_path: Path of a UTF-8 text file holding one sample per line.
        num_samples: Maximum number of lines to use, counted from the top of
            the file.

    Returns:
        A tuple ``(max_encoder_seq_length, num_encoder_tokens,
        input_characters, input_token_index, reverse_input_char_index,
        encoder_input_data, decoder_input_data)`` where

        * ``max_encoder_seq_length`` is the longest token sequence
          (including ``"<end>"``) plus one,
        * ``num_encoder_tokens`` is the vocabulary size,
        * ``input_characters`` is the sorted vocabulary list,
        * ``input_token_index`` maps token -> index and
          ``reverse_input_char_index`` maps index -> token,
        * ``encoder_input_data`` and ``decoder_input_data`` are float32
          arrays of shape ``(samples, max_encoder_seq_length,
          num_encoder_tokens)``; the decoder array holds the same tokens
          shifted one step to the right behind the start marker.

    Raises:
        ValueError: If no sample could be read from ``data_path``.
    """
    with open(data_path, "r", encoding="utf-8") as f:
        lines = f.read().lower().split("\n")

    # A file that ends with a newline produces one trailing empty string.
    # Drop only that artefact; unconditionally discarding the last element
    # would lose a real sample when the file has no final newline.
    if lines and lines[-1] == "":
        lines.pop()

    input_texts = []
    input_characters = {"\t"}

    for line in lines[:num_samples]:
        tokens = word_tokenize(line)
        tokens.append("<end>")
        input_texts.append(tokens)
        input_characters.update(tokens)

    if not input_texts:
        raise ValueError(
            "No samples could be read from {!r} (num_samples={})".format(data_path, num_samples)
        )

    input_characters = sorted(input_characters)
    num_encoder_tokens = len(input_characters)
    # One extra step so the decoder input, which is shifted right by the
    # start marker, still fits the longest sequence.
    max_encoder_seq_length = max(len(txt) for txt in input_texts) + 1

    print("Number of samples:", len(input_texts))
    print("Number of unique input tokens:", num_encoder_tokens)
    print("Max sequence length for inputs:", max_encoder_seq_length)

    input_token_index = {char: i for i, char in enumerate(input_characters)}
    reverse_input_char_index = {i: char for char, i in input_token_index.items()}

    data_shape = (len(input_texts), max_encoder_seq_length, num_encoder_tokens)
    encoder_input_data = np.zeros(data_shape, dtype="float32")
    decoder_input_data = np.zeros(data_shape, dtype="float32")

    start_index = input_token_index["\t"]
    for i, input_text in enumerate(input_texts):
        decoder_input_data[i, 0, start_index] = 1.0

        for t, char in enumerate(input_text):
            token_index = input_token_index[char]
            encoder_input_data[i, t, token_index] = 1.0
            decoder_input_data[i, t + 1, token_index] = 1.0

    return max_encoder_seq_length, num_encoder_tokens, input_characters, input_token_index, reverse_input_char_index, \
           encoder_input_data, decoder_input_data


def main(params):
    """Train and/or load the LSTM-VAE models for a dataset and print samples.

    After the models are available the function prints, for 5 random pairs
    of sentences, 7 decodings interpolated between their latent samples, and
    then up to 20 continuations of randomly truncated sentences.

    Args:
        params: Mapping of option name to value (strings or ints). Keys read:
            ``num_samples``, ``dataset`` (file name under ``data/``),
            ``batch_size``, ``latent_dim``, ``intermediate_dim``, ``epochs``
            and the 0/1 flags ``train``, ``save`` and ``load``.

    Raises:
        KeyError: If a required key is missing from ``params``.
        ValueError: If neither ``train`` nor ``load`` is set, or if the
            dataset contains no samples.
    """
    import os  # local import: only used to create the model directory

    num_samples = int(params['num_samples'])
    data_path = "data/" + params['dataset']
    dataname = params['dataset'].split('.')[0]

    batch_size = int(params['batch_size'])
    latent_dim = int(params['latent_dim'])
    intermediate_dim = int(params['intermediate_dim'])
    epochs = int(params['epochs'])

    train = int(params['train'])
    save = int(params['save'])
    load = int(params['load'])

    # Fail fast: without either flag no encoder/generator would exist and the
    # prediction code below would die on an undefined name after the data
    # has already been loaded.
    if not (train or load):
        raise ValueError("Nothing to do: set 'train' or 'load' to 1 so that a model is available.")

    timesteps_max, enc_tokens, characters, char2id, id2char, x, x_decoder = get_text_data(
        num_samples=num_samples, data_path=data_path)

    print(x.shape, "Creating model...")

    input_dim = x.shape[-1]
    num_sentences = x.shape[0]

    if load:
        print("Loading model ... ")

        enc = keras.models.load_model("models/encoder_{}.h5".format(dataname))
        gen = keras.models.load_model("models/generator_{}.h5".format(dataname))
        stepper = keras.models.load_model("models/stepper_{}.h5".format(dataname))

    if train:
        # When 'load' is set as well, the freshly created models below
        # replace the loaded ones.
        print("Training model...")

        vae, enc, gen, stepper = create_lstm_vae(input_dim,
                                                 batch_size=batch_size,
                                                 intermediate_dim=intermediate_dim,
                                                 latent_dim=latent_dim)

        csv_logger = CSVLogger('training_vae.log', separator=',', append=False)
        # NOTE(review): batch_size is given to create_lstm_vae but not to
        # fit(), which then uses its own default - confirm this is intended.
        vae.fit([x, x_decoder], x, epochs=epochs, verbose=1, callbacks=[csv_logger])

        if save:
            print("Saving model ... ")

            # Make sure the target directory exists so a finished training
            # run is not lost to a missing folder.
            os.makedirs("models", exist_ok=True)
            vae.save("models/vae_{}.h5".format(dataname))
            enc.save("models/encoder_{}.h5".format(dataname))
            gen.save("models/generator_{}.h5".format(dataname))
            stepper.save("models/stepper_{}.h5".format(dataname))

    print("Fitted, predicting...")

    def decode(s, start_char="\t"):
        return inference.decode_sequence(s, gen, stepper, input_dim, char2id, id2char, timesteps_max,
                                         start_char=start_char)

    def continue_seq(x_start):
        return inference.continue_sequence(x_start, gen, stepper, input_dim, char2id, id2char, timesteps_max)

    def render(one_hot):
        # Map every time step of a one-hot matrix back to its token.
        return " ".join([id2char[j] for j in np.argmax(one_hot, axis=1)])

    for _ in range(5):
        # randint's upper bound is exclusive, so the sample count itself is
        # the correct bound; "count - 1" never picked the last sentence and
        # raised for a single-sample dataset.
        id_from = np.random.randint(0, num_sentences)
        id_to = np.random.randint(0, num_sentences)

        # The names suggest the encoder returns a mean and a standard
        # deviation per sentence - confirm against lstm_vae.
        m_from, std_from = enc.predict([[x[id_from]]])
        m_to, std_to = enc.predict([[x[id_to]]])

        seq_from = np.random.normal(size=(latent_dim,))
        seq_from = m_from + std_from * seq_from

        seq_to = np.random.normal(size=(latent_dim,))
        seq_to = m_to + std_to * seq_to

        print("==  \t", render(x[id_from]), "==")

        # Walk linearly from the first latent sample to the second one.
        for v in np.linspace(0, 1, 7):
            print("%.2f\t" % (1 - v), decode(v * seq_to + (1 - v) * seq_from))

        print("==  \t", render(x[id_to]), "==")

    for _ in range(20):
        id_sentence = np.random.randint(0, num_sentences)

        # get_text_data sets exactly one 1.0 per token, so the sum is the
        # token count (including "<end>"); cast it because the array is
        # float32 and randint expects integers.
        n_words = int(np.sum(x[id_sentence]))
        if n_words < 3:
            # randint(n_words // 2, n_words - 1) needs a non-empty range;
            # such a sentence is too short to be truncated and continued.
            continue
        n_kept = np.random.randint(n_words // 2, n_words - 1)

        new_x = np.zeros(x[id_sentence].shape)
        new_x[:n_kept, :] = x[id_sentence, :n_kept, :]

        print("==  \t", render(new_x[:n_kept]), " ... \t\t ==")

        print("\t...\t", continue_seq(new_x))

        print("==  \t", render(x[id_sentence]), "==")
        

In [9]:
# Read the whole corpus file, lower-case it and break it into lines.
data_path = './data/europ.en'
with open(data_path, mode="r", encoding="utf-8") as f:
    lines = f.read().lower().split('\n')

In [12]:
# Display the third line of the lower-cased corpus.
lines[2]

"although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful."

In [18]:
# Read the run parameters from a plain-text file holding one
# "<name> <value>" pair per line, then launch the experiment.
config_file_name = 'config.txt'

with open(config_file_name, 'r') as f:
    lines = f.readlines()

params = {}

for line_number, line in enumerate(lines, start=1):
    # split() without arguments strips the trailing newline and tolerates
    # tabs and repeated spaces, which split(' ') turned into empty values.
    param_list = line.split()
    if not param_list:
        continue  # blank line: nothing to record
    if len(param_list) < 2:
        raise ValueError(
            "{}:{}: expected '<name> <value>', got {!r}".format(
                config_file_name, line_number, line.rstrip('\n')))
    param_name = param_list[0]
    param_value = param_list[1]
    params[param_name] = param_value

main(params)

Number of samples: 100
Number of unique input tokens: 750
Max sequence length for inputs: 117
(100, 117, 750) Creating model...
Training model...
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           (None, None, 750)    0                                            
__________________________________________________________________________________________________
lstm_9 (LSTM)                   (None, 353)          1558848     input_21[0][0]                   
__________________________________________________________________________________________________
dense_17 (Dense)                (None, 191)          67614       lstm_9[0][0]                     
__________________________________________________________________________________________________
dense_18 (Dense)                (None, 191)          67614    

Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200


Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200


Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
Saving model ... 


  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


Fitted, predicting...
==  	 but , madam president , my personal request has not been met . <end> 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ==
1.00	 i would like to advice about rule 143 concerning inadmissibility . <end> 
0.83	 i would like to advice about rule 143 concerning inadmissibility . <end> 
0.67	 i would like to advice about rule 143 concerning inadmissibility . <end> 
0.50	 i would like to advice about rule 143 concerning inadmissibility . <end> 
0.33	 i would like to advice about rule 143 concerning inadmissibility . <end> 
0.17	 i would like to advice about rule 143 concerning inadmissibility . <end> 
0.00	 i would like to advice about rule 143 concerning inadmissibility . <end> 
==  	 the cunha report on multiannual guidance programmes comes before parliament on thursday and contains a proposal in paragraph 6 t

	...	 . <end> 
==  	 given that the commission is represented by vice-president de palacio , i believe that , before voting , it would help if the commission could let us know how ready it is to present this programme , as agreed . alternatively , parliament is not ready to examine this programme , as some appear to be suggesting . <end> 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ==
==  	 but , madam president , my personal request has not been met  ... 		 ==
	...	 . <end> 
==  	 but , madam president , my personal request has not been met . <end> 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ==
==  	 therefore , madam president , i would ask you to request that the commission express its opinion  ... 		 ==
	...	 , we had agreed - , the commission is not a 