In [4]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
import warnings

In [38]:
# Sample corpus: the "Jack and Jill" nursery rhyme.
# NOTE: each line ends with an explicit "\n" escape *and* a real line break, and the
# continuation lines start with tab characters, so the string contains blank lines
# and leading tabs in addition to the words themselves.
text = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """

In [39]:
# Learn the word -> integer index mapping from the sample text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# One text in, one index sequence out: unpack that single sequence
(encoded,) = tokenizer.texts_to_sequences([text])
encoded

[2,
 1,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 2,
 14,
 15,
 1,
 16,
 17,
 18,
 1,
 3,
 19,
 20,
 21]

In [40]:
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	"""Greedily extend ``seed_text`` by ``n_words`` words predicted by ``model``.

	On every step the text produced so far is re-encoded with ``tokenizer``,
	left-padded to ``max_length`` tokens via ``pad_sequences`` (which also cuts
	longer inputs down to ``max_length``) and fed to the model. The class index
	with the highest score is mapped back to a word and appended to the text.

	Args:
		model: Trained model whose ``predict`` returns one row of class scores
			per input sequence.
		tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
			``word_index``.
		max_length: Number of tokens per model input.
		seed_text: Text to start from.
		n_words: Number of words to append.

	Returns:
		``seed_text`` followed by the generated words, each preceded by a single
		space. When the predicted index has no word in ``tokenizer.word_index``
		(e.g. index 0), an empty string is appended for that step.
	"""
	# Build the reverse lookup once instead of scanning word_index on every step
	index_to_word = {index: word for word, index in tokenizer.word_index.items()}
	in_text = seed_text

	for _ in range(n_words):
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict_classes() no longer exists in recent Keras releases; taking the
		# argmax of the predicted class scores is the equivalent operation
		scores = model.predict(encoded, verbose=0)
		yhat = int(scores.argmax(axis=-1)[0])
		in_text += ' ' + index_to_word.get(yhat, '')
	return in_text

In [41]:
# Vocabulary size: number of distinct words plus one, so that the largest
# word index is still a valid class / embedding row
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size , "\n")

# 2 words -> 1 word model: slide a 3-token window over the whole encoded text,
# so every sample is (word, next word, word to predict).
# (Building growing per-line prefixes instead is an alternative; it is not used here.)
sequences = [encoded[start:start + 3] for start in range(len(encoded) - 2)]

print('Total Sequences: %d' % len(sequences))

Vocabulary Size: 22 

Total Sequences: 23


In [42]:
# Inspect the raw index triples: two context words followed by the word to predict
sequences

[[2, 1, 3],
 [1, 3, 4],
 [3, 4, 5],
 [4, 5, 6],
 [5, 6, 7],
 [6, 7, 8],
 [7, 8, 9],
 [8, 9, 10],
 [9, 10, 11],
 [10, 11, 12],
 [11, 12, 13],
 [12, 13, 2],
 [13, 2, 14],
 [2, 14, 15],
 [14, 15, 1],
 [15, 1, 16],
 [1, 16, 17],
 [16, 17, 18],
 [17, 18, 1],
 [18, 1, 3],
 [1, 3, 19],
 [3, 19, 20],
 [19, 20, 21]]

In [43]:
# Left-pad every sequence to the length of the longest one

max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, padding='pre', maxlen=max_length)
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 3


In [44]:
# Inspect the padded result: now a NumPy array (every row already had length 3)
sequences

array([[ 2,  1,  3],
       [ 1,  3,  4],
       [ 3,  4,  5],
       [ 4,  5,  6],
       [ 5,  6,  7],
       [ 6,  7,  8],
       [ 7,  8,  9],
       [ 8,  9, 10],
       [ 9, 10, 11],
       [10, 11, 12],
       [11, 12, 13],
       [12, 13,  2],
       [13,  2, 14],
       [ 2, 14, 15],
       [14, 15,  1],
       [15,  1, 16],
       [ 1, 16, 17],
       [16, 17, 18],
       [17, 18,  1],
       [18,  1,  3],
       [ 1,  3, 19],
       [ 3, 19, 20],
       [19, 20, 21]])

In [45]:
# Split each row into model input (all columns but the last) and target (last column)

sequences = array(sequences)
X = sequences[:, :-1]
# One-hot encode the target word index over the whole vocabulary
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [46]:
# First training pair: the two input word indices and the one-hot encoded target
X[0] , y[0]

(array([2, 1]),
 array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.], dtype=float32))

In [47]:
# Network: embedding -> LSTM -> softmax over the vocabulary

model = Sequential([
	# Turn each of the (max_length - 1) input word indices into a 10-dimensional vector
	Embedding(vocab_size, 10, input_length=max_length-1),
	# Single recurrent layer with 50 units
	LSTM(50),
	# One output score per vocabulary index
	Dense(vocab_size, activation='softmax'),
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 2, 10)             220       
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_2 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Fit the network on the (X, y) pairs and keep only the per-epoch metrics dict

model.compile(
	optimizer='adam',
	loss='categorical_crossentropy',
	metrics=['accuracy'],
)
history = model.fit(
	X,
	y,
	epochs=500,
	verbose=2,
).history

Epoch 1/500
1/1 - 0s - loss: 3.0915 - accuracy: 0.0000e+00
Epoch 2/500
1/1 - 0s - loss: 3.0907 - accuracy: 0.0435
Epoch 3/500
1/1 - 0s - loss: 3.0898 - accuracy: 0.0435
Epoch 4/500
1/1 - 0s - loss: 3.0890 - accuracy: 0.0435
Epoch 5/500
1/1 - 0s - loss: 3.0881 - accuracy: 0.0435
Epoch 6/500
1/1 - 0s - loss: 3.0873 - accuracy: 0.0870
Epoch 7/500
1/1 - 0s - loss: 3.0864 - accuracy: 0.0870
Epoch 8/500
1/1 - 0s - loss: 3.0855 - accuracy: 0.0870
Epoch 9/500
1/1 - 0s - loss: 3.0846 - accuracy: 0.0870
Epoch 10/500
1/1 - 0s - loss: 3.0837 - accuracy: 0.1304
Epoch 11/500
1/1 - 0s - loss: 3.0828 - accuracy: 0.1304
Epoch 12/500
1/1 - 0s - loss: 3.0819 - accuracy: 0.1304
Epoch 13/500
1/1 - 0s - loss: 3.0809 - accuracy: 0.1304
Epoch 14/500
1/1 - 0s - loss: 3.0799 - accuracy: 0.1304
Epoch 15/500
1/1 - 0s - loss: 3.0789 - accuracy: 0.1304
Epoch 16/500
1/1 - 0s - loss: 3.0779 - accuracy: 0.1304
Epoch 17/500
1/1 - 0s - loss: 3.0768 - accuracy: 0.1304
Epoch 18/500
1/1 - 0s - loss: 3.0757 - accuracy: 0.13

In [49]:
# Generate text from a few two-word seeds to eyeball what the model learned
# NOTE: this filter silences every warning from here on in the kernel session
warnings.filterwarnings('ignore')
for seed_text, n_words in (('Jack and', 5), ('And Jill', 3), ('fell down', 5), ('pail of', 5)):
	print(generate_seq(model, tokenizer, max_length-1, seed_text, n_words))

Jack and jill came tumbling after after
And Jill came tumbling after
fell down and broke his crown and
pail of water jack fell down and
