In [1]:
'''FUNCTIONS'''
import pickle
from string import ascii_lowercase, digits
#from bs4 import BeautifulSoup, NavigableString
from collections import Counter
from tensorflow.keras.utils import Sequence

# Loading and saving files

def read_txt(path):
    return open(path, 'r', encoding='utf-8').read()

def save_txt(text, path):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)

def load_pickle(path):
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def save_pickle(variable, path):
    with open(path, 'wb') as handle:
        pickle.dump(variable, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [2]:
'''TEXT FUNCTIONS'''

import re

class Text:
    def __init__(self, input_text, token2ind=None, ind2token=None):
        self.content = input_text
        self.tokens, self.tokens_distinct = self.tokenize()

        if token2ind != None and ind2token != None:
            self.token2ind, self.ind2token = token2ind, ind2token
        else:
            self.token2ind, self.ind2token = self.create_word_mapping(self.tokens_distinct)

        self.tokens_ind = [self.token2ind[token] if token in self.token2ind.keys() else self.token2ind['<| unknown |>']
                           for token in self.tokens]

    def __repr__(self):
        return self.content

    def __len__(self):
        return len(self.tokens_distinct)

    @staticmethod
    def create_word_mapping(values_list):
        values_list.append('<| unknown |>')
        value2ind = {value: ind for ind, value in enumerate(values_list)}
        ind2value = dict(enumerate(values_list))
        return value2ind, ind2value

    def preprocess(self):
        punctuation_pad = '!?.,:-;'
        punctuation_remove = '"()_\n'

        self.content_preprocess = re.sub(r'(\S)(\n)(\S)', r'\1 \2 \3', self.content)
        self.content_preprocess = self.content_preprocess.translate(str.maketrans('', '', punctuation_remove))
        self.content_preprocess = self.content_preprocess.translate(
            str.maketrans({key: ' {0} '.format(key) for key in punctuation_pad}))
        self.content_preprocess = re.sub(' +', ' ', self.content_preprocess)
        self.content = self.content_preprocess.strip()

    def tokenize(self):
        self.preprocess()
        tokens = self.content.split(' ')
        return tokens, list(set(tokens))

    def tokens_info(self):
        print('total tokens: %d, distinct tokens: %d' % (len(self.tokens), len(self.tokens_distinct)))

In [3]:
'''LSTM FUNCTIONS'''
#import re
import numpy as np
import tensorflow.keras

class Sequences():
    def __init__(self, text_object, max_len, step):
        self.tokens_ind = text_object.tokens_ind
        self.max_len = max_len
        self.step = step
        self.sequences, self.next_words = self.create_sequences()

    def __repr__(self):
        return 'Sequence object of max_len: %d and step: %d' % (self.max_len, self.step)

    def __len__(self):
        return len(self.sequences)

    def create_sequences(self):
        sequences = []
        next_words = []

        for i in range(0, len(self.tokens_ind) - self.max_len, self.step):
            sequences.append(self.tokens_ind[i: i +self.max_len])
            next_words.append(self.tokens_ind[ i +self.max_len])
        return sequences, next_words

    def sequences_info(self):
        print('number of sequences of length %d: %d' % (self.max_len, len(self.sequences)))


class ModelPredict():
    def __init__(self, model, prefix, token2ind, ind2token, max_len, embedding=False):
        self.model = model
        self.token2ind, self.ind2token = token2ind, ind2token
        self.max_len = max_len
        self.prefix = prefix
        self.tokens_ind = prefix.tokens_ind.copy()
        self.embedding = embedding

    def __repr__(self):
        return self.prefix.content

    def single_data_generation(self):
        single_sequence = np.zeros((1, self.max_len, len(self.token2ind)), dtype=np.bool)
        prefix = self.tokens_ind[-self.max_len:]

        for i, s in enumerate(prefix):
            single_sequence[0, i, s] = 1
        return single_sequence

    def model_predict(self):
        if self.embedding:
            model_input = np.array(self.tokens_ind).reshape(1,-1)
        else:
            model_input = self.single_data_generation()
        return self.model.predict(model_input)[0]

    @staticmethod
    def add_prob_temperature(prob, temperature=1):
        prob = prob.astype(float)
        prob_with_temperature = np.exp(np.where(prob == 0, 0, np.log(prob + 1e-10)) / temperature)
        prob_with_temperature /= np.sum(prob_with_temperature)
        return prob_with_temperature

    @staticmethod
    def reverse_preprocess(text):
        text_reverse = re.sub(r'\s+([!?"\'().,;-])', r'\1', text)
        text_reverse = re.sub(' +', ' ', text_reverse)
        return text_reverse

    def return_next_word(self, temperature=1, as_word=False):
        prob = self.model_predict()

        prob_with_temperature = self.add_prob_temperature(prob, temperature)
        next_word = np.random.choice(len(prob_with_temperature), p=prob_with_temperature)

        if as_word:
            return self.ind2token[next_word]
        else:
            return next_word

    def generate_sequence(self, k, append=False, temperature=1):
        for i in range(k):
            next_word = self.return_next_word(temperature=temperature)
            self.tokens_ind.append(next_word)
        return_tokens_ind = self.tokens_ind
        return_tokens_ind = ' '.join([self.ind2token[ind] for ind in return_tokens_ind])

        if not append:
            self.tokens_ind = self.prefix.tokens_ind.copy()

        return self.reverse_preprocess(return_tokens_ind)

    def bulk_generate_sequence(self, k, n, temperature=1):
        for i in range(n):
            print(self.generate_sequence(k, temperature=temperature))
            print('\n')


class TextDataGenerator(Sequence):
    def __init__(self, sequences, next_words, sequence_length, vocab_size, batch_size=32, shuffle=True, embedding=False):
        self.batch_size = batch_size
        self.sequences = sequences
        self.next_words = next_words
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.shuffle = shuffle
        self.embedding = embedding
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.sequences) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size: (index + 1) * self.batch_size]
        sequences_batch = [self.sequences[k] for k in indexes]
        next_words_batch = [self.next_words[k] for k in indexes]

        if self.embedding:
            X = np.array(sequences_batch)
            y = keras.utils.to_categorical(next_words_batch, num_classes=self.vocab_size)
        else:
            X, y = self.__data_generation(sequences_batch, next_words_batch)

        return X, y

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.sequences))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, sequences_batch, next_words_batch):
        X = np.zeros((self.batch_size, self.sequence_length, self.vocab_size), dtype=np.bool)
        y = np.zeros((self.batch_size, self.vocab_size), dtype=np.bool)

        for i, seq in enumerate(sequences_batch):
            for j, word in enumerate(seq):
                X[i, j, word] = 1
                y[i, next_words_batch[i]] = 1
        return X, y

In [4]:
'''START HERE'''
from tensorflow.keras import layers, models, optimizers

'''PREPROCESSING'''
path_train = 'emma_book_only.txt'

input_train = read_txt(path_train)


max_len = 4
step = 3

text_train = Text(input_train)
text_train.tokens_info()

seq_train = Sequences(text_train, max_len, step)
seq_train.sequences_info()


print(text_train.tokens[:10])
print(text_train.tokens_ind[:10], '\n')

np.array(seq_train.sequences[:2])


batch_size = 32

params = {
  'sequence_length': max_len,
  'vocab_size': len(text_train),
  'batch_size': batch_size,
  'shuffle': True
}

train_generator = TextDataGenerator(seq_train.sequences, seq_train.next_words, **params)


total tokens: 185790, distinct tokens: 9861
number of sequences of length 4: 61929
['\ufeffCHAPTER', 'IEmma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich']
[6211, 112, 4876, 9442, 6379, 9442, 1997, 9442, 9635, 2602] 



In [5]:
'''TRAIN LSTM MODEL'''
def lstm_model(sequence_length, vocab_size, layer_size, embedding=False):
    model = models.Sequential()
    if embedding:
        model.add(layers.Embedding(vocab_size, layer_size))
        model.add(layers.LSTM(layer_size))    
    else:
        model.add(layers.LSTM(layer_size, input_shape=(sequence_length, vocab_size)))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(vocab_size, activation='softmax'))
    return model


In [6]:
model = lstm_model(max_len, len(text_train), 32)

optimizer = optimizers.RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [7]:
model.fit(train_generator,
          steps_per_epoch=len(train_generator),
          epochs=50,
          verbose=1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1942278e308>

In [8]:
#model.save('data/out/lstm_model')
#model = models.load_model('data/out/lstm_model')

In [8]:
'''TEXT GENERATION'''
token2ind, ind2token = text_train.token2ind, text_train.ind2token

input_prefix = 'Elizabeth'
text_prefix = Text(input_prefix, token2ind, ind2token)


pred = ModelPredict(model, text_prefix, token2ind, ind2token, max_len)


temperatures = [1, 0.7, 0.4, 0.1]

for temperature in temperatures:
    print('temperature:', temperature)
    print(pred.generate_sequence(200, temperature=0.7))
    print('\n')

temperature: 1


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Elizabeth heard; : well the. to be to be very much as to It was a the so to Emma! and had a of a very good and about the the been, and the as a of his very; and that he could not have been a in she. The to be, in him to be, but you must have a very angry as for a than I seemed as I am sure that friend for the. and. Mr. Elton, as Mr. she might be at that a more to be very much a much to be very much in a very well at much of the. It was so in Emma. The it would have been in, he did have to of me. ” said he, the is to have not one of the Mr. Knightley, it would have been a; and she had been to be as Mr. Mr. ” said him; that she had been in all the her. She she could not would have here the very good the very. Mr


temperature: 0.7
Elizabeth if quite every to and had own with the a very a very; and in Mr. she might be very much. Mr. Knightley. He was very very much in a to his own. It was so to be the very to to a be at all or thing, as I am sure you were so good. He do her to am to send,