In [7]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
# source text
data = """ Jack and Jill went up the hill\n To fetch a pail of water\n Jack fell down and broke his crown\n And Jill came tumbling after\n """
print (type(data), data)

<class 'str'>  Jack and Jill went up the hill
 To fetch a pail of water
 Jack fell down and broke his crown
 And Jill came tumbling after
 


In [21]:
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
print (type(encoded), len(encoded), encoded)

<class 'list'> 25 [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]


In [10]:
# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 22


In [18]:
#strip newlines, split on spaces, convert to lowercase, count unique words.
import re
data2 = re.sub('[\n]', '', data).lower()
print (data2)
print (len(set(data2.split(" "))))

 jack and jill went up the hill to fetch a pail of water jack fell down and broke his crown and jill came tumbling after 


22

In [19]:
# create word -> word sequences
sequences = list()
for i in range(1, len(encoded)):
	sequence = encoded[i-1:i+1]
	sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 24


In [23]:
# split into X and y elements
sequences = array(sequences)
X, y = sequences[:,0],sequences[:,1]
print (type(X), X.shape)
print (type(y), y.shape)

<class 'numpy.ndarray'> (24,)
<class 'numpy.ndarray'> (24,)


In [24]:
# one hot encode outputs
y = to_categorical(y, num_classes=vocab_size)

In [25]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             220       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X, y, epochs=500, verbose=2)

Epoch 1/500
 - 1s - loss: 3.0907 - acc: 0.0417
Epoch 2/500
 - 0s - loss: 3.0899 - acc: 0.0000e+00
Epoch 3/500
 - 0s - loss: 3.0891 - acc: 0.1667
Epoch 4/500
 - 0s - loss: 3.0883 - acc: 0.1250
Epoch 5/500
 - 0s - loss: 3.0875 - acc: 0.1250
Epoch 6/500
 - 0s - loss: 3.0867 - acc: 0.1250
Epoch 7/500
 - 0s - loss: 3.0859 - acc: 0.1250
Epoch 8/500
 - 0s - loss: 3.0851 - acc: 0.1250
Epoch 9/500
 - 0s - loss: 3.0843 - acc: 0.1250
Epoch 10/500
 - 0s - loss: 3.0834 - acc: 0.1250
Epoch 11/500
 - 0s - loss: 3.0826 - acc: 0.1250
Epoch 12/500
 - 0s - loss: 3.0817 - acc: 0.1250
Epoch 13/500
 - 0s - loss: 3.0809 - acc: 0.1250
Epoch 14/500
 - 0s - loss: 3.0800 - acc: 0.1250
Epoch 15/500
 - 0s - loss: 3.0791 - acc: 0.1250
Epoch 16/500
 - 0s - loss: 3.0782 - acc: 0.1250
Epoch 17/500
 - 0s - loss: 3.0772 - acc: 0.1250
Epoch 18/500
 - 0s - loss: 3.0763 - acc: 0.1250
Epoch 19/500
 - 0s - loss: 3.0753 - acc: 0.1250
Epoch 20/500
 - 0s - loss: 3.0743 - acc: 0.1250
Epoch 21/500
 - 0s - loss: 3.0733 - acc: 0.12

Epoch 171/500
 - 0s - loss: 1.9261 - acc: 0.5833
Epoch 172/500
 - 0s - loss: 1.9128 - acc: 0.5833
Epoch 173/500
 - 0s - loss: 1.8995 - acc: 0.5833
Epoch 174/500
 - 0s - loss: 1.8862 - acc: 0.5833
Epoch 175/500
 - 0s - loss: 1.8729 - acc: 0.5833
Epoch 176/500
 - 0s - loss: 1.8596 - acc: 0.5833
Epoch 177/500
 - 0s - loss: 1.8464 - acc: 0.5833
Epoch 178/500
 - 0s - loss: 1.8332 - acc: 0.5833
Epoch 179/500
 - 0s - loss: 1.8200 - acc: 0.5833
Epoch 180/500
 - 0s - loss: 1.8069 - acc: 0.5833
Epoch 181/500
 - 0s - loss: 1.7937 - acc: 0.5833
Epoch 182/500
 - 0s - loss: 1.7806 - acc: 0.6250
Epoch 183/500
 - 0s - loss: 1.7676 - acc: 0.6250
Epoch 184/500
 - 0s - loss: 1.7545 - acc: 0.6250
Epoch 185/500
 - 0s - loss: 1.7415 - acc: 0.6250
Epoch 186/500
 - 0s - loss: 1.7285 - acc: 0.6250
Epoch 187/500
 - 0s - loss: 1.7155 - acc: 0.6667
Epoch 188/500
 - 0s - loss: 1.7025 - acc: 0.6667
Epoch 189/500
 - 0s - loss: 1.6896 - acc: 0.6667
Epoch 190/500
 - 0s - loss: 1.6767 - acc: 0.6667
Epoch 191/500
 - 0s 

 - 0s - loss: 0.4562 - acc: 0.8750
Epoch 339/500
 - 0s - loss: 0.4529 - acc: 0.8750
Epoch 340/500
 - 0s - loss: 0.4497 - acc: 0.8750
Epoch 341/500
 - 0s - loss: 0.4465 - acc: 0.8750
Epoch 342/500
 - 0s - loss: 0.4433 - acc: 0.8750
Epoch 343/500
 - 0s - loss: 0.4402 - acc: 0.8750
Epoch 344/500
 - 0s - loss: 0.4371 - acc: 0.8750
Epoch 345/500
 - 0s - loss: 0.4341 - acc: 0.8750
Epoch 346/500
 - 0s - loss: 0.4311 - acc: 0.8750
Epoch 347/500
 - 0s - loss: 0.4281 - acc: 0.8750
Epoch 348/500
 - 0s - loss: 0.4252 - acc: 0.8750
Epoch 349/500
 - 0s - loss: 0.4223 - acc: 0.8750
Epoch 350/500
 - 0s - loss: 0.4194 - acc: 0.8750
Epoch 351/500
 - 0s - loss: 0.4166 - acc: 0.8750
Epoch 352/500
 - 0s - loss: 0.4138 - acc: 0.8750
Epoch 353/500
 - 0s - loss: 0.4110 - acc: 0.8750
Epoch 354/500
 - 0s - loss: 0.4083 - acc: 0.8750
Epoch 355/500
 - 0s - loss: 0.4056 - acc: 0.8750
Epoch 356/500
 - 0s - loss: 0.4030 - acc: 0.8750
Epoch 357/500
 - 0s - loss: 0.4003 - acc: 0.8750
Epoch 358/500
 - 0s - loss: 0.3977

<keras.callbacks.History at 0x7f325028ef28>

In [27]:
# evaluate
in_text = 'Jack'
print(in_text)
encoded = tokenizer.texts_to_sequences([in_text])[0]
encoded = array(encoded)
yhat = model.predict_classes(encoded, verbose=0)
for word, index in tokenizer.word_index.items():
	if index == yhat:
		print(word)

Jack
and
