### Libraries

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
from keras.layers import *
from keras.models import Sequential, Model
from keras import backend as K
from keras import metrics
import re

### Config

In [None]:
# Training hyper-parameters; TextGenerator.train reads these module-level
# names at call time and forwards them to model.fit.
batch_size = 32
epochs = 10  # reassigned to 50 further down, just before training starts
test_split = 0.0  # passed as validation_split
# Model dimensions, copied onto the TextGenerator instance on construction.
intermediate_dim = 100  # units of the encoder and generator LSTM layers
z_dim = 10  # size of the latent vector z
c_dim = 1  # size of the condition input c
seq_length = 5  # word length used by TextController and number of timesteps
epsilon_std = 1.  # standard deviation of the noise drawn when sampling z
optimizer = 'rmsprop'  # passed to model.compile

# Path of the text file the training words are read from.
file = 'robinson_crusoe.txt'

### Helper functions

In [None]:
def make_one_hot(index, count):
    """Return a list of ``count`` zeros with a single 1 at position ``index``.

    ``index`` follows normal list indexing: negative values count from the
    end and an out-of-range value raises ``IndexError``.
    """
    vector = [0 for _ in range(count)]
    vector[index] = 1
    return vector

## Text processing

In [None]:
class TextController:
    """Turn raw text into one-hot encoded, fixed-length word sequences.

    The text is lower-cased, every character outside a fixed alphabet is
    replaced by a space, and every distinct word of at most ``seq_length``
    characters is right-padded with spaces and one-hot encoded.

    Attributes:
        seq_length: Length every word is padded to.
        text: The normalised text.
        chars: Sorted vocabulary: the distinct characters of ``text`` plus
            the padding space.
        n_vocab: Number of entries in ``chars``.
        char_to_int: Maps a character to its vocabulary index.
        int_to_char: Maps a vocabulary index back to its character.
        sequences: Integer array of shape (n_words, seq_length, n_vocab).
    """

    # Character used both as word separator and as right-padding.
    _PAD_CHAR = ' '
    # Letters kept by format_text; anything else becomes a space.
    _REAL_CHARS = frozenset('aábcdeéfghiíjklmnoóöőpqrstuúüűvwxyz')

    def __init__(self, text, seq_length):
        self.seq_length = seq_length
        self.text = self.format_text(text)
        # The padding character must always be encodable, even when the text
        # itself contains no space (e.g. a single short word); otherwise
        # chars2nums raises KeyError on the padded word.
        self.chars = sorted(set(self.text) | {self._PAD_CHAR})
        self.n_vocab = len(self.chars)
        self.char_to_int = {char: i for i, char in enumerate(self.chars)}
        self.int_to_char = dict(enumerate(self.chars))
        self.sequences = self.make_sequences()

    def format_text(self, text):
        """Lower-case ``text`` and reduce it to letters and single spaces.

        Every character that is not in the allowed alphabet (this includes
        digits, punctuation, newlines and tabs) is replaced by a space, then
        runs of spaces are collapsed into one.
        """
        allowed = self._REAL_CHARS
        cleaned = ''.join(ch if ch in allowed else ' ' for ch in text.lower())
        return re.sub(r' +', ' ', cleaned)

    def make_words(self):
        """Return the distinct words that fit into ``seq_length`` characters.

        Words are right-padded with spaces to exactly ``seq_length``
        characters. They are returned in sorted order so that the resulting
        sample order is the same on every run (iterating the set directly
        depends on string hash randomisation).
        """
        unique_words = sorted(set(self.text.split(' ')))
        return [
            word.ljust(self.seq_length, self._PAD_CHAR)
            for word in unique_words
            if 0 < len(word) <= self.seq_length
        ]

    def make_sequences(self):
        """One-hot encode all words from ``make_words``.

        Returns:
            Integer array of shape (n_words, seq_length, n_vocab).
        """
        words = self.make_words()
        indices = np.array(
            [self.chars2nums(word) for word in words], dtype=int
        ).reshape((len(words), self.seq_length))
        # Row i of the identity matrix is the one-hot vector for index i, so
        # fancy-indexing it encodes every character of every word at once.
        return np.eye(self.n_vocab, dtype=int)[indices]

    def chars2nums(self, chars):
        """Map each character of ``chars`` to its vocabulary index."""
        return [self.char_to_int[char] for char in chars]

    def nums2chars(self, nums):
        """Map each vocabulary index in ``nums`` to its character."""
        return [self.int_to_char[num] for num in nums]

    def nums2str(self, nums):
        """Decode a sequence of vocabulary indices into a string."""
        return ''.join(self.nums2chars(nums))

In [None]:
# Read the whole corpus; the context manager closes the file handle again.
# NOTE(review): the platform default encoding is used, as before. The alphabet
# in TextController contains accented letters, so confirm the file's encoding
# and pass it explicitly if it differs from the default.
with open(file) as corpus:
    text = corpus.read()
TC = TextController(text, seq_length)

## Text generator

In [None]:
class TextGenerator:
    """Conditional variational autoencoder over one-hot character sequences.

    An LSTM encoder maps an input of shape (timesteps, original_dim) to the
    mean and log-variance of a z_dim-dimensional Gaussian. A latent vector
    sampled from it is concatenated with a c_dim-dimensional condition input
    and decoded back into a sequence by an LSTM generator.

    The hyper-parameters (seq_length, z_dim, c_dim, intermediate_dim,
    epsilon_std, optimizer) are read from module-level configuration when the
    instance is created.

    Attributes:
        model: End-to-end model ``[x, c] -> reconstructed x`` used for training.
        generator: Decoder-only model ``[z, c] -> x`` that shares its layers
            with ``model``.
    """

    def __init__(self, n_chars):
        """Build both models for a vocabulary of ``n_chars`` characters."""
        self.timesteps = seq_length
        self.original_dim = n_chars
        self.z_dim = z_dim
        self.c_dim = c_dim
        self.intermediate_dim = intermediate_dim
        self.epsilon_std = epsilon_std
        self.build_model()
        self.build_generator_model()

    def build_model(self):
        """Assemble and compile the full encoder/sampler/generator model."""
        self.x, self.z_mean, self.z_log_sigma = self.build_encoder()
        self.z = Lambda(self.sampling, output_shape=(self.z_dim,))([self.z_mean, self.z_log_sigma])
        self.c = Input(shape=(self.c_dim,))
        self.z_c = concatenate([self.z, self.c])
        self.x_gen = self.build_generator()
        self.model = Model([self.x, self.c], self.x_gen)
        self.model.compile(optimizer=optimizer, loss=self.vae_loss)

    def build_generator_model(self):
        """Build the decoder-only model that takes z directly as an input.

        The layer objects created in ``build_generator`` are applied again, so
        this model uses the same weights as the training model. The condition
        input ``self.c`` is reused as its second input.
        """
        generator_input_z = Input(shape=(self.z_dim,))
        h = concatenate([generator_input_z, self.c])
        for layer in self.generator_layers:
            h = layer(h)
        self.generator = Model([generator_input_z, self.c], h)

    def sampling(self, args):
        """Draw z = mean + std * epsilon with epsilon ~ N(0, epsilon_std).

        ``z_log_sigma`` is treated as a log-variance, hence the division by
        two before exponentiating.
        """
        z_mean, z_log_sigma = args
        # Named n_samples so it does not shadow the module-level batch_size.
        n_samples = K.shape(z_mean)[0]
        epsilon = K.random_normal(shape=(n_samples, self.z_dim), mean=0., stddev=self.epsilon_std)
        return z_mean + K.exp(z_log_sigma / 2) * epsilon

    def build_encoder(self):
        """Create the encoder and return ``(input, z_mean, z_log_sigma)``."""
        x = Input(shape=(self.timesteps, self.original_dim))
        h = LSTM(self.intermediate_dim, activation='relu')(x)
        z_mean = Dense(self.z_dim)(h)
        z_log_sigma = Dense(self.z_dim)(h)
        return x, z_mean, z_log_sigma

    def build_generator(self):
        """Create the generator layers and apply them to ``self.z_c``.

        The layers are kept in ``self.generator_layers`` so that
        ``build_generator_model`` can apply the same layers again.
        """
        self.generator_layers = [
            RepeatVector(self.timesteps),
            LSTM(self.intermediate_dim, activation='relu', return_sequences=True),
            TimeDistributed(Dense(self.original_dim, activation='sigmoid')),
        ]
        h = self.z_c
        for layer in self.generator_layers:
            h = layer(h)
        return h

    def vae_loss(self, x, x_decoded_mean):
        """Return the mean per-sample VAE loss (reconstruction + KL).

        Both terms are reduced to one value per sample before they are added.
        Previously the KL term was summed over the entire batch and added to
        every per-timestep reconstruction value, which made the KL weight
        grow with the batch size and the sequence length.
        """
        # binary_crossentropy averages over the last (vocabulary) axis;
        # multiplying by original_dim turns that into a sum per timestep.
        xent_per_step = self.original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
        # Sum over the timesteps: one reconstruction value per sample.
        xent_loss = K.sum(xent_per_step, axis=-1)
        # KL divergence to the unit Gaussian, summed over the latent
        # dimensions only: one value per sample.
        kl_loss = -0.5 * K.sum(
            1 + self.z_log_sigma - K.square(self.z_mean) - K.exp(self.z_log_sigma),
            axis=-1,
        )
        return K.mean(xent_loss + kl_loss)

    def train(self, x_train, c_train):
        """Fit the model to reconstruct ``x_train`` given ``c_train``.

        ``test_split``, ``batch_size`` and ``epochs`` are read from the
        module-level configuration at call time.
        """
        x_train = np.array(x_train)
        c_train = np.array(c_train)
        self.model.fit(x=[x_train, c_train], y=x_train, validation_split=test_split, batch_size=batch_size, epochs=epochs)

    def predict(self, x, c):
        """Run one sample ``x`` with condition ``c`` through the full model."""
        x = np.array([x])
        c = np.array([c])
        return self.model.predict([x, c])[0]

    def generate(self, z, c):
        """Decode one latent vector ``z`` with condition ``c``."""
        z = np.array([z])
        c = np.array([c])
        return self.generator.predict([z, c])[0]

In [None]:
# Build the models; the vocabulary size becomes the width of each input and
# output timestep (original_dim).
TG = TextGenerator(TC.n_vocab)

In [None]:
# Training samples: the one-hot encoded words, each paired with a constant
# condition value of 0.
x_samples = TC.sequences
c_samples = [0 for _ in range(len(x_samples))]

In [None]:
# TextGenerator.train reads the module-level `epochs` when it is called, so
# this assignment overrides the value from the config cell for this run.
epochs = 50
TG.train(x_samples, c_samples)

In [None]:
# Pass the first ten training words through the full model and print, for
# every timestep, the character with the highest output value.
for sample in x_samples[:10]:
    reconstruction = TG.predict(sample, 0)
    best_indexes = np.argmax(reconstruction, axis=-1)
    print(TC.nums2str(best_indexes))

In [None]:
# Decode ten latent vectors drawn from a standard normal distribution and
# print, for every timestep, the character with the highest output value.
for _ in range(10):
    latent = np.random.normal(loc=0., scale=1., size=(z_dim,))
    decoded = TG.generate(latent, 0)
    best_indexes = np.argmax(decoded, axis=-1)
    print(TC.nums2str(best_indexes))