# Text corpus to generative text

Imports

In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import numpy as np

from keras.utils import plot_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Add data

Create the tokenizer

In [2]:
# Shared module-level tokenizer: fitted on the corpus in dataset_preparation()
# and reused by generate_text() to encode the seed text.
tokenizer = Tokenizer()

Training hyperparameters

In [None]:
# Maximum number of training epochs passed to model.fit() in create_model();
# early stopping there may end training sooner.
epochSize = 100
# Used both as the embedding output dimension and as the number of units in
# each LSTM layer built by create_model().
hidden_size = 200

In [4]:
def dataset_preparation(data):
    """Turn a raw text corpus into padded n-gram training data.

    The corpus is lower-cased and split on newlines. Each line is converted
    to a sequence of token ids, and every prefix of that sequence holding at
    least two tokens becomes one training sample. Samples are left-padded
    with zeros to a common length; the last token of each sample is the
    label and the tokens before it are the predictors.

    Side effect: fits the module-level ``tokenizer`` on the corpus.

    Args:
        data: The whole corpus as a single string, one sample per line.

    Returns:
        A tuple ``(predictors, label, max_sequence_len, total_words)``:
        ``predictors`` is a 2-D array with ``max_sequence_len - 1`` columns,
        ``label`` is the one-hot encoding (``total_words`` classes) of the
        final token of each sample, ``max_sequence_len`` is the length of
        the longest n-gram, and ``total_words`` is the vocabulary size
        plus one.

    Raises:
        ValueError: If no line of the corpus contains at least two tokens,
            so no training sample can be built.
    """
    # splits corpus on return - change if you want a different split
    corpus = data.lower().split("\n")

    tokenizer.fit_on_texts(corpus)
    # +1 because the tokenizer never assigns index 0 to a word; that slot is
    # what the zero padding below maps to.
    total_words = len(tokenizer.word_index) + 1

    # Encode the whole corpus in one call, then expand every line into its
    # prefixes of length 2..len(line).
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        input_sequences.extend(
            token_list[:end] for end in range(2, len(token_list) + 1)
        )

    # Fail with a clear message instead of the opaque
    # "max() arg is an empty sequence" the bare max() call would produce.
    if not input_sequences:
        raise ValueError(
            "corpus must contain at least one line with two or more tokens"
        )

    max_sequence_len = max(len(sequence) for sequence in input_sequences)
    # pad_sequences already returns a numpy array, so no extra copy is needed.
    input_sequences = pad_sequences(
        input_sequences, maxlen=max_sequence_len, padding='pre'
    )

    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

## Training

In [None]:
def create_model(predictors, label, max_sequence_len, total_words):
    """Build, compile and train the next-word prediction network.

    The network is an embedding layer followed by two stacked LSTM layers
    and a softmax output over the vocabulary. Layer width comes from the
    module-level ``hidden_size`` and the epoch limit from ``epochSize``.

    Args:
        predictors: Padded input sequences, ``max_sequence_len - 1`` tokens
            per row.
        label: One-hot encoded next-token targets with ``total_words``
            classes.
        max_sequence_len: Length of the longest n-gram in the training set.
        total_words: Size of the embedding vocabulary and of the output
            layer.

    Returns:
        The fitted ``Sequential`` model.
    """
    layer_stack = [
        Embedding(total_words, hidden_size, input_length=max_sequence_len - 1),
        # The first LSTM must emit its full output sequence so that the
        # second LSTM has a sequence to consume.
        LSTM(hidden_size, return_sequences=True),
        LSTM(hidden_size),
        Dense(total_words, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    # Halt training once the training loss has gone 5 epochs without improving.
    stop_on_plateau = EarlyStopping(
        monitor='loss',
        min_delta=0,
        patience=5,
        verbose=0,
        mode='auto',
    )
    model.fit(
        predictors,
        label,
        epochs=epochSize,
        verbose=1,
        callbacks=[stop_on_plateau],
    )
    return model

In [6]:
def generate_text(seed_text, next_words, max_sequence_len):
    """Extend ``seed_text`` by repeatedly predicting the most likely next word.

    Uses the module-level ``tokenizer`` to encode the text and the
    module-level ``model`` to predict. On each step the current text is
    encoded, left-padded to ``max_sequence_len - 1`` tokens, and the word
    with the highest predicted probability is appended.

    Args:
        seed_text: Starting text to continue.
        next_words: Number of words to append; zero or fewer returns
            ``seed_text`` unchanged.
        max_sequence_len: The sequence length returned by
            ``dataset_preparation`` for the data the model was trained on.

    Returns:
        ``seed_text`` followed by the generated words, space-separated. If
        the predicted index has no word in the vocabulary, an empty string
        is appended for that step.
    """
    if next_words <= 0:
        return seed_text

    # Invert the vocabulary once rather than scanning word_index linearly
    # for every generated word.
    index_to_word = {
        index: word for word, index in tokenizer.word_index.items()
    }

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences(
            [token_list], maxlen=max_sequence_len - 1, padding='pre'
        )
        # Sequential.predict_classes() is deprecated and absent from newer
        # Keras releases; argmax over predict() is the equivalent for a
        # softmax output and works across versions.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text

In [42]:
# Read the whole training corpus. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open.
with open('data.txt') as corpus_file:
    data = corpus_file.read()

In [None]:
# Build the training set from the corpus, then build and fit the model
# (training runs inside create_model).
predictors, label, max_sequence_len, total_words = dataset_preparation(data)
model = create_model(predictors, label, max_sequence_len, total_words)
# Save a diagram of the network architecture to model.png.
plot_model(model, to_file='model.png')

# Generate 30 words continuing from the seed "The" and show the result.
output = generate_text("The", 30, max_sequence_len)
print(output)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


In [36]:
# Re-save the architecture diagram of the trained model to model.png.
plot_model(model, to_file='model.png')

In [40]:
# Generate 10 words continuing from the seed "So" and show the result.
output = generate_text("So", 10, max_sequence_len)
print(output)

So in in in in in in in in in in
