We can use an intermediate between the one-word-in and the whole-sentence-in approaches
and pass in sub-sequences of words as input. This provides a trade-off between the two
framings, allowing new lines to be generated and generation to be picked up mid-line. We will
use 2 words as input to predict one word as output.

In [3]:
import numpy as np
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Extend ``seed_text`` with up to ``n_words`` words predicted by ``model``.

    On every step the text generated so far is integer-encoded, padded or
    truncated to ``max_length`` entries, and fed to the model; the word whose
    index receives the highest score is appended to the text.

    Args:
        model: object whose ``predict(x, verbose=0)`` returns one score per
            vocabulary index for the encoded input.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and a
            ``word_index`` mapping of word -> integer index.
        max_length: number of integers the model expects as input.
        seed_text: text used to start the generation.
        n_words: maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Generation stops early if the predicted index has no word
        in the vocabulary.
    """
    # Invert the vocabulary once; the mapping does not change between steps,
    # so there is no need to scan word_index for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length (pad_sequences defaults)
        encoded = pad_sequences([encoded], maxlen=max_length)
        # index of the highest-scoring word
        yhat = int(np.argmax(model.predict(encoded, verbose=0)))
        out_word = index_to_word.get(yhat)
        if out_word is None:
            # No word carries this index (e.g. the padding index 0). Appending
            # nothing would leave the encoded input unchanged, so every later
            # step would repeat the same prediction; stop instead of piling up
            # trailing spaces.
            break
        # append to input
        in_text += ' ' + out_word
    return in_text

# define the model
def define_model(vocab_size, max_length):
    """Build, compile and summarize the word-level language model.

    The network is an embedding layer (10 dimensions per word) feeding an
    LSTM with 50 units, followed by a softmax layer with one output per
    vocabulary index.

    Args:
        vocab_size: number of distinct word indices; used both as the
            embedding's input dimension and as the size of the output layer.
        max_length: length of a full training sequence. The embedding is
            declared with ``max_length - 1`` inputs.

    Returns:
        The compiled model. Its summary is printed as a side effect.
    """
    layers = [
        Embedding(vocab_size, 10, input_length=max_length - 1),
        LSTM(50),
        Dense(vocab_size, activation='softmax'),
    ]
    network = Sequential(layers)
    # compile network
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    # summarize defined model
    network.summary()
    # Optional diagram of the architecture (left disabled):
    # plot_model(network, to_file='model.png', show_shapes=True)
    return network

# source text: the nursery rhyme that serves as the entire training corpus.
# NOTE(review): the literal mixes "\n" escapes, real line breaks and tab
# indentation. This extra whitespace does not appear to produce tokens (the
# encoding printed by this cell has 25 entries, one per word of the rhyme),
# but confirm against the Tokenizer's filter settings before changing it.
data = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """
# Fit the tokenizer on the rhyme and turn the text into a flat list of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
print(encoded)
# Vocabulary size is the number of known words plus one extra slot
# (the printed word indices start at 1, so index 0 is not used by any word).
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# Slide a window of three consecutive words over the text:
# the first two are the model input, the third is the word to predict.
sequences = [encoded[end - 2:end + 1] for end in range(2, len(encoded))]
print('Total Sequences: %d' % len(sequences))
print(sequences)
# Every window has the same length, so no padding step is applied here
# (a pad_sequences call with padding='pre' was left disabled at this point).
max_length = max(len(window) for window in sequences)
print('Max Sequence Length: %d' % max_length)
# Split each window into input columns (all but the last) and target (the last),
# then one-hot encode the target over the vocabulary.
sequences = array(sequences)
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
print(X.shape)
print(y.shape)

[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]
Vocabulary Size: 22
Total Sequences: 23
[[2, 1, 3], [1, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, 8], [7, 8, 9], [8, 9, 10], [9, 10, 11], [10, 11, 12], [11, 12, 13], [12, 13, 2], [13, 2, 14], [2, 14, 15], [14, 15, 1], [15, 1, 16], [1, 16, 17], [16, 17, 18], [17, 18, 1], [18, 1, 3], [1, 3, 19], [3, 19, 20], [19, 20, 21]]
Max Sequence Length: 3
(23, 2)
(23, 22)


In [4]:
# define model: build and compile the network for this vocabulary.
# max_length is the full window length (inputs plus target); define_model
# subtracts one to get the number of input words.
model = define_model(vocab_size, max_length)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 10)             220       
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 22)                1122      
                                                                 
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________


In [6]:
# fit network: train on the input/target pairs for 500 epochs.
# verbose=2 logs one summary line per epoch, as seen in the captured output.
model.fit(X, y, epochs=500, verbose=2)

Epoch 1/500
1/1 - 7s - loss: 3.0901 - accuracy: 0.0870 - 7s/epoch - 7s/step
Epoch 2/500
1/1 - 0s - loss: 3.0892 - accuracy: 0.1739 - 10ms/epoch - 10ms/step
Epoch 3/500
1/1 - 0s - loss: 3.0883 - accuracy: 0.1304 - 9ms/epoch - 9ms/step
Epoch 4/500
1/1 - 0s - loss: 3.0873 - accuracy: 0.1304 - 9ms/epoch - 9ms/step
Epoch 5/500
1/1 - 0s - loss: 3.0864 - accuracy: 0.1304 - 8ms/epoch - 8ms/step
Epoch 6/500
1/1 - 0s - loss: 3.0855 - accuracy: 0.1304 - 8ms/epoch - 8ms/step
Epoch 7/500
1/1 - 0s - loss: 3.0845 - accuracy: 0.1304 - 7ms/epoch - 7ms/step
Epoch 8/500
1/1 - 0s - loss: 3.0835 - accuracy: 0.1304 - 8ms/epoch - 8ms/step
Epoch 9/500
1/1 - 0s - loss: 3.0825 - accuracy: 0.1304 - 8ms/epoch - 8ms/step
Epoch 10/500
1/1 - 0s - loss: 3.0815 - accuracy: 0.1304 - 9ms/epoch - 9ms/step
Epoch 11/500
1/1 - 0s - loss: 3.0804 - accuracy: 0.1304 - 8ms/epoch - 8ms/step
Epoch 12/500
1/1 - 0s - loss: 3.0794 - accuracy: 0.1304 - 7ms/epoch - 7ms/step
Epoch 13/500
1/1 - 0s - loss: 3.0783 - accuracy: 0.1304 - 7ms

<keras.callbacks.History at 0x7f5f60189590>

In [12]:
# evaluate model: generate continuations for several two-word seeds.
# The model consumes max_length-1 words, so that is the padding length used.
for seed_text, n_words in (
    ('Jack and', 5),
    ('And Jill', 3),
    ('fell down', 5),
    ('pail of', 5),
):
    print(generate_seq(model, tokenizer, max_length-1, seed_text, n_words))

Jack and jill went up the hill
And Jill went up the
fell down and broke his crown and
pail of water jack fell down and
