Word-level text generator using an embedded layer to learn
word representation and an LSTM RNN to predict words.
example from https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

In [25]:
# load the text
def load_doc(filename):
    file=open(filename, 'r')
    text=file.read()
    file.close()
    return text
in_filename = '/Users/annejones/Documents/nn/datasets/anh-script.txt'
doc = load_doc(in_filename)
print(doc[:200])


               A long time ago, in a galaxy far, far, away...

               A vast sea of stars serves as the backdrop for the main title. 
               War drums echo through the heavens as a ro


In [26]:
import string

# turn a doc into clean tokens
def clean_doc(doc):
	# replace '--' with a space ' '
	doc = doc.replace('--', ' ')
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# make lower case
	tokens = [word.lower() for word in tokens]
	return tokens

In [27]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['a', 'long', 'time', 'ago', 'in', 'a', 'galaxy', 'far', 'far', 'away', 'a', 'vast', 'sea', 'of', 'stars', 'serves', 'as', 'the', 'backdrop', 'for', 'the', 'main', 'title', 'war', 'drums', 'echo', 'through', 'the', 'heavens', 'as', 'a', 'rollup', 'slowly', 'crawls', 'into', 'infinity', 'it', 'is', 'a', 'period', 'of', 'civil', 'war', 'rebel', 'spaceships', 'striking', 'from', 'a', 'hidden', 'base', 'have', 'won', 'their', 'first', 'victory', 'against', 'the', 'evil', 'galactic', 'empire', 'during', 'the', 'battle', 'rebel', 'spies', 'managed', 'to', 'steal', 'secret', 'plans', 'to', 'the', 'empires', 'ultimate', 'weapon', 'the', 'death', 'star', 'an', 'armored', 'space', 'station', 'with', 'enough', 'power', 'to', 'destroy', 'an', 'entire', 'planet', 'pursued', 'by', 'the', 'empires', 'sinister', 'agents', 'princess', 'leia', 'races', 'home', 'aboard', 'her', 'starship', 'custodian', 'of', 'the', 'stolen', 'plans', 'that', 'can', 'save', 'her', 'people', 'and', 'restore', 'freedom', 't

In [28]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 32644


In [29]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()
    # save sequences to file
out_filename = '/Users/annejones/Documents/nn/datasets/pp_sequences.txt'
save_doc(sequences, out_filename)

In [30]:
# load and split based on new lines
in_filename = '/Users/annejones/Documents/nn/datasets/pp_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [31]:
# integer encode sequences of words
import tensorflow as tf
from tensorflow.python.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [32]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [33]:
# separate into input and output and one
# hot encode the output
from tensorflow.python.keras.utils import to_categorical
from numpy import array
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [34]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import LSTM
from tensorflow.python.keras.layers import Embedding
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 50)            196200    
_________________________________________________________________
lstm_3 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 3924)              396324    
Total params: 743,424
Trainable params: 743,424
Non-trainable params: 0
_________________________________________________________________
None


In [36]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras._impl.keras.callbacks.History at 0x116cb3128>

In [38]:
from pickle import dump
# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [45]:
# use the model to generate text
from tensorflow.python.keras.models import load_model
from pickle import load
from random import randint
in_filename = '/Users/annejones/Documents/nn/datasets/pp_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split())-1
model = load_model('model.h5')
tokenizer = load(open('tokenizer.pkl', 'rb'))

In [69]:
seq_length

50

In [76]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [86]:
# get seed text
seed_text = lines[randint(0, len(lines))]
generated = generate_seq(model, tokenizer, 
                         seq_length, seed_text, 20)
print(seed_text + '...')
print(generated)

few feet of lukes face luke doesnt move and the ball backs off it slowly moves behind the boy then makes another quick lunge this time emitting a blood red laser beam as it attacks it hits luke in the leg causing him to tumble over han lets loose with a...
small small pistol and starts to the old jedi luke is a few moments luke i think you think he
