In [2]:
import numpy as np
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	"""Extend ``seed_text`` by greedily picking the most likely next word.

	On every step the running text is encoded with ``tokenizer``, pre-padded
	to ``max_length`` tokens and passed to ``model.predict``; the word whose
	index receives the highest score is appended to the text.

	Args:
		model: object exposing ``predict(x, verbose=0)`` that returns one
			score per vocabulary index for the single padded sequence.
		tokenizer: object exposing ``texts_to_sequences`` and a
			``word_index`` mapping of word -> integer index.
		max_length: number of tokens the encoded input is padded to.
		seed_text: text the generation starts from.
		n_words: maximum number of words to append.

	Returns:
		``seed_text`` followed by the generated words, separated by single
		spaces. Fewer than ``n_words`` words are appended if the model
		predicts an index that has no word in the vocabulary.
	"""
	# Build the index -> word lookup once instead of scanning word_index
	# again for every generated word.
	index_to_word = {}
	for word, index in tokenizer.word_index.items():
		# setdefault keeps the first word seen for an index, which is what a
		# first-match scan over word_index would return.
		index_to_word.setdefault(index, word)
	in_text = seed_text
	# generate at most a fixed number of words
	for _ in range(n_words):
		# encode the text as integers
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# index with the highest predicted score (single-row batch)
		yhat = int(np.argmax(model.predict(encoded, verbose=0)))
		# map predicted word index to word
		out_word = index_to_word.get(yhat, '')
		if not out_word:
			# No word maps to this index (e.g. index 0, which the word
			# indices never use). Appending an empty word would only add a
			# trailing space and leave the encoded input unchanged, so stop.
			break
		# append to input
		in_text += ' ' + out_word
	return in_text

# define the model
def define_model(vocab_size, max_length, embedding_dim=10, lstm_units=50):
	"""Build, compile and summarize the next-word prediction network.

	The network is an Embedding layer, a single LSTM layer and a softmax
	Dense layer with one output per vocabulary index.

	Args:
		vocab_size: number of distinct token indices; used as the embedding
			input size and as the number of output classes.
		max_length: length of the full padded training sequences. The model
			input length is ``max_length - 1`` because the caller holds out
			the last token of each sequence as the prediction target.
		embedding_dim: width of the word embedding vectors (default 10).
		lstm_units: number of units in the LSTM layer (default 50).

	Returns:
		The compiled model.

	Side effects:
		Prints the model summary and writes a diagram to ``model.png``.
	"""
	model = Sequential()
	model.add(Embedding(vocab_size, embedding_dim, input_length=max_length-1))
	model.add(LSTM(lstm_units))
	model.add(Dense(vocab_size, activation='softmax'))
	# compile network; targets are one-hot vectors, hence categorical cross-entropy
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	# summarize defined model
	model.summary()
	plot_model(model, to_file='model.png', show_shapes=True)
	return model

# source text
data = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """
# fit the tokenizer on the rhyme to build the word -> integer mapping
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# one more slot than there are words in the mapping
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)
# line-based framing: every prefix of two or more tokens of a line is one sequence
sequences = []
for line in data.split('\n'):
	line_ids = tokenizer.texts_to_sequences([line])[0]
	sequences.extend(line_ids[:end] for end in range(2, len(line_ids) + 1))
print(sequences)
print('Total Sequences: %d' % len(sequences))
# left-pad every sequence to the length of the longest one
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
# the last column is the word to predict; the columns before it are the context
sequences = array(sequences)
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
print(X.shape)
print(y.shape)
# define model
model = define_model(vocab_size, max_length)

Vocabulary Size: 22
[[2, 1], [2, 1, 3], [2, 1, 3, 4], [2, 1, 3, 4, 5], [2, 1, 3, 4, 5, 6], [2, 1, 3, 4, 5, 6, 7], [8, 9], [8, 9, 10], [8, 9, 10, 11], [8, 9, 10, 11, 12], [8, 9, 10, 11, 12, 13], [2, 14], [2, 14, 15], [2, 14, 15, 1], [2, 14, 15, 1, 16], [2, 14, 15, 1, 16, 17], [2, 14, 15, 1, 16, 17, 18], [1, 3], [1, 3, 19], [1, 3, 19, 20], [1, 3, 19, 20, 21]]
Total Sequences: 21
Max Sequence Length: 7
(21, 6)
(21, 22)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 6, 10)             220       
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 22)                1122      
                                                                 
Total params: 13,542
Trainable p

In [4]:
# train on all padded sequences, logging one line per epoch
model.fit(x=X, y=y, epochs=500, verbose=2)

Epoch 1/500
1/1 - 2s - loss: 3.0909 - accuracy: 0.0952 - 2s/epoch - 2s/step
Epoch 2/500
1/1 - 0s - loss: 3.0894 - accuracy: 0.0476 - 8ms/epoch - 8ms/step
Epoch 3/500
1/1 - 0s - loss: 3.0879 - accuracy: 0.1429 - 8ms/epoch - 8ms/step
Epoch 4/500
1/1 - 0s - loss: 3.0864 - accuracy: 0.0952 - 10ms/epoch - 10ms/step
Epoch 5/500
1/1 - 0s - loss: 3.0849 - accuracy: 0.0952 - 8ms/epoch - 8ms/step
Epoch 6/500
1/1 - 0s - loss: 3.0834 - accuracy: 0.0952 - 9ms/epoch - 9ms/step
Epoch 7/500
1/1 - 0s - loss: 3.0818 - accuracy: 0.0952 - 8ms/epoch - 8ms/step
Epoch 8/500
1/1 - 0s - loss: 3.0801 - accuracy: 0.0952 - 7ms/epoch - 7ms/step
Epoch 9/500
1/1 - 0s - loss: 3.0784 - accuracy: 0.0952 - 7ms/epoch - 7ms/step
Epoch 10/500
1/1 - 0s - loss: 3.0766 - accuracy: 0.0952 - 7ms/epoch - 7ms/step
Epoch 11/500
1/1 - 0s - loss: 3.0748 - accuracy: 0.0952 - 9ms/epoch - 9ms/step
Epoch 12/500
1/1 - 0s - loss: 3.0728 - accuracy: 0.0952 - 9ms/epoch - 9ms/step
Epoch 13/500
1/1 - 0s - loss: 3.0708 - accuracy: 0.0952 - 8ms

<keras.callbacks.History at 0x7fd5b951e050>

Running the example achieves a better fit on the source data. The added context has allowed
the model to disambiguate some of the examples. However, two lines of the text both start
with “Jack”, which may still be a problem for the network.

In [8]:
# evaluate model: generate four words after each seed word
for seed_text in ('Jack', 'Jill'):
	print(generate_seq(model, tokenizer, max_length-1, seed_text, 4))

Jack fell down and broke
Jill jill came tumbling after


This is a good example of how the line-based framing can produce better output when the seed is the start of a line, but poor output when the seed is only a partial line of input.
