In [1]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

Using TensorFlow backend.


In [2]:
# source text: the four lines of the rhyme form the entire training corpus
rhyme_lines = (
    "Jack and jill went up the hill",
    "To fetch a pail of water",
    "Jack fell down and broke his crown",
    "And Jill came tumbling after",
)
# lines are separated by a blank line; the text ends with a single newline
data = "\n\n".join(rhyme_lines) + "\n"

print(data)

Jack and jill went up the hill

To fetch a pail of water

Jack fell down and broke his crown

And Jill came tumbling after



In [3]:
# show the corpus wrapped in a one-element list, the form passed to the tokenizer in the next cell
[data]

['Jack and jill went up the hill\n\nTo fetch a pail of water\n\nJack fell down and broke his crown\n\nAnd Jill came tumbling after\n']

In [4]:
# integer encode the corpus: fit the vocabulary, then map every word to its id
corpus = [data]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# a single document was passed in, so take the first (only) encoded sequence
encoded = tokenizer.texts_to_sequences(corpus)[0]
print("Length of encoded: %d" % len(encoded))
print(encoded)

Length of encoded: 25
[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]


In [5]:
# retrieve vocabulary (word -> id mapping) and its size
vocabulary = tokenizer.word_index
# one extra slot on top of the number of known words
vocab_size = 1 + len(vocabulary)
print("Vocabulary Size : %d" % vocab_size)
print(vocabulary)

Vocabulary Size : 22
{'and': 1, 'jack': 2, 'jill': 3, 'went': 4, 'up': 5, 'the': 6, 'hill': 7, 'to': 8, 'fetch': 9, 'a': 10, 'pail': 11, 'of': 12, 'water': 13, 'fell': 14, 'down': 15, 'broke': 16, 'his': 17, 'crown': 18, 'came': 19, 'tumbling': 20, 'after': 21}


In [6]:
# encode 2 words -> 1 word: slide a 3-id window over the encoded corpus
sequences = [encoded[start:start + 3] for start in range(len(encoded) - 2)]

print('Total Sequences:%d' % len(sequences))
print("Sequences:", sequences)

Total Sequences:23
Sequences: [[2, 1, 3], [1, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, 8], [7, 8, 9], [8, 9, 10], [9, 10, 11], [10, 11, 12], [11, 12, 13], [12, 13, 2], [13, 2, 14], [2, 14, 15], [14, 15, 1], [15, 1, 16], [1, 16, 17], [16, 17, 18], [17, 18, 1], [18, 1, 3], [1, 3, 19], [3, 19, 20], [19, 20, 21]]


In [7]:
# pad sequences on the left up to the length of the longest window
max_length = max(len(window) for window in sequences)
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print("Max Sequence Length %d" % max_length)
print("Padded Sequences :", sequences)

Max Sequence Length 3
Padded Sequences : [[ 2  1  3]
 [ 1  3  4]
 [ 3  4  5]
 [ 4  5  6]
 [ 5  6  7]
 [ 6  7  8]
 [ 7  8  9]
 [ 8  9 10]
 [ 9 10 11]
 [10 11 12]
 [11 12 13]
 [12 13  2]
 [13  2 14]
 [ 2 14 15]
 [14 15  1]
 [15  1 16]
 [ 1 16 17]
 [16 17 18]
 [17 18  1]
 [18  1  3]
 [ 1  3 19]
 [ 3 19 20]
 [19 20 21]]


In [8]:
# make sure `sequences` is a NumPy array and display it; the next cell
# splits it into input and output elements with 2-D slicing
sequences = array(sequences)
sequences

array([[ 2,  1,  3],
       [ 1,  3,  4],
       [ 3,  4,  5],
       [ 4,  5,  6],
       [ 5,  6,  7],
       [ 6,  7,  8],
       [ 7,  8,  9],
       [ 8,  9, 10],
       [ 9, 10, 11],
       [10, 11, 12],
       [11, 12, 13],
       [12, 13,  2],
       [13,  2, 14],
       [ 2, 14, 15],
       [14, 15,  1],
       [15,  1, 16],
       [ 1, 16, 17],
       [16, 17, 18],
       [17, 18,  1],
       [18,  1,  3],
       [ 1,  3, 19],
       [ 3, 19, 20],
       [19, 20, 21]])

In [9]:
# every column but the last is the model input; the last column is the word to predict
X, y = sequences[:, :-1], sequences[:, -1]
# one-hot encode the target ids over the whole vocabulary
y = to_categorical(y, num_classes=vocab_size)

In [10]:
def define_model(vocab_size,max_length,embedding_dim = 10,lstm_units = 50):
    """Build and compile the next-word prediction network.

    The network is Embedding -> LSTM -> Dense(softmax) and is compiled with
    categorical cross-entropy, the Adam optimizer and an accuracy metric.
    A summary of the model is printed before it is returned.

    Args:
        vocab_size: number of word ids; used as the embedding input size
            and as the width of the softmax output.
        max_length: length of a full training sequence (context words plus
            the target word); the model input width is ``max_length - 1``.
        embedding_dim: size of each word embedding vector (default 10).
        lstm_units: number of units in the LSTM layer (default 50).

    Returns:
        The compiled model.
    """
    model = Sequential()
    # the last element of every sequence is the target, so it is not an input
    model.add(Embedding(vocab_size,embedding_dim,input_length = max_length-1))
    model.add(LSTM(lstm_units))
    model.add(Dense(vocab_size, activation = 'softmax'))
    # compile network
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    model.summary()
    return model
# build the network for this vocabulary and sequence length
model = define_model(vocab_size, max_length)
# train on every (context, next word) pair, logging one line per epoch
model.fit(x=X, y=y, epochs=500, verbose=2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 10)             220       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________






Epoch 1/500
 - 1s - loss: 3.0910 - accuracy: 0.0870
Epoch 2/500
 - 0s - loss: 3.0901 - accuracy: 0.1304
Epoch 3/500
 - 0s - loss: 3.0892 - accuracy: 0.0870
Epoch 4/500
 - 0s - loss: 3.0883 - accuracy: 0.1304
Epoch 5/500
 - 0s - loss: 3.0874 - accuracy: 0.1304
Epoch 6/500
 - 0s - loss: 3.0865 - accuracy: 0.1304
Epoch 7/500
 - 0s - loss: 3.0856 - accuracy: 0.1304
Epoch 8/500
 - 0s - loss: 3.0847 - accuracy: 0.1304
Epoch 9/500
 - 0s - loss: 3.0838 - accuracy: 0.1304
Epoch 10/500
 - 0s - loss: 3.0828 - accuracy: 0.1304
Epoch 11/500
 - 0s - loss: 3.0818 - accuracy: 0.1304
Epoch 12/500
 - 0s - loss: 3.0809 - accuracy: 0.1304
Epoch 13/500
 - 0s - loss: 3.0798 - accuracy: 0.1739
Epoch 14/500
 - 0s - loss: 3.0788 - accuracy: 0.1739
Epoch 15/500
 - 0s - loss: 3.0777 - accuracy: 0.1739
Epoch 16/500
 - 0s - loss: 3.0766 - accuracy: 0.1739
Epoch 17/500
 - 0s - loss: 3.0755 - accuracy: 0.1739
Epoch 18/500
 - 0s - loss: 3.0743 - accuracy: 0.1739
Epoch 19/500
 - 0s - loss: 3.0731 - accuracy: 0.1739
Ep

Epoch 155/500
 - 0s - loss: 1.2262 - accuracy: 0.7826
Epoch 156/500
 - 0s - loss: 1.2086 - accuracy: 0.7826
Epoch 157/500
 - 0s - loss: 1.1910 - accuracy: 0.8696
Epoch 158/500
 - 0s - loss: 1.1735 - accuracy: 0.8696
Epoch 159/500
 - 0s - loss: 1.1560 - accuracy: 0.9130
Epoch 160/500
 - 0s - loss: 1.1387 - accuracy: 0.9130
Epoch 161/500
 - 0s - loss: 1.1215 - accuracy: 0.9130
Epoch 162/500
 - 0s - loss: 1.1043 - accuracy: 0.9130
Epoch 163/500
 - 0s - loss: 1.0873 - accuracy: 0.9130
Epoch 164/500
 - 0s - loss: 1.0703 - accuracy: 0.9130
Epoch 165/500
 - 0s - loss: 1.0534 - accuracy: 0.9565
Epoch 166/500
 - 0s - loss: 1.0367 - accuracy: 0.9565
Epoch 167/500
 - 0s - loss: 1.0201 - accuracy: 0.9565
Epoch 168/500
 - 0s - loss: 1.0035 - accuracy: 0.9565
Epoch 169/500
 - 0s - loss: 0.9871 - accuracy: 0.9565
Epoch 170/500
 - 0s - loss: 0.9708 - accuracy: 0.9565
Epoch 171/500
 - 0s - loss: 0.9546 - accuracy: 0.9565
Epoch 172/500
 - 0s - loss: 0.9386 - accuracy: 0.9565
Epoch 173/500
 - 0s - loss: 

Epoch 307/500
 - 0s - loss: 0.1162 - accuracy: 0.9565
Epoch 308/500
 - 0s - loss: 0.1155 - accuracy: 0.9565
Epoch 309/500
 - 0s - loss: 0.1148 - accuracy: 0.9565
Epoch 310/500
 - 0s - loss: 0.1141 - accuracy: 0.9565
Epoch 311/500
 - 0s - loss: 0.1134 - accuracy: 0.9565
Epoch 312/500
 - 0s - loss: 0.1127 - accuracy: 0.9565
Epoch 313/500
 - 0s - loss: 0.1120 - accuracy: 0.9565
Epoch 314/500
 - 0s - loss: 0.1114 - accuracy: 0.9565
Epoch 315/500
 - 0s - loss: 0.1108 - accuracy: 0.9565
Epoch 316/500
 - 0s - loss: 0.1101 - accuracy: 0.9565
Epoch 317/500
 - 0s - loss: 0.1095 - accuracy: 0.9565
Epoch 318/500
 - 0s - loss: 0.1089 - accuracy: 0.9565
Epoch 319/500
 - 0s - loss: 0.1083 - accuracy: 0.9565
Epoch 320/500
 - 0s - loss: 0.1077 - accuracy: 0.9565
Epoch 321/500
 - 0s - loss: 0.1072 - accuracy: 0.9565
Epoch 322/500
 - 0s - loss: 0.1066 - accuracy: 0.9565
Epoch 323/500
 - 0s - loss: 0.1061 - accuracy: 0.9565
Epoch 324/500
 - 0s - loss: 0.1055 - accuracy: 0.9565
Epoch 325/500
 - 0s - loss: 

Epoch 459/500
 - 0s - loss: 0.0752 - accuracy: 0.9565
Epoch 460/500
 - 0s - loss: 0.0751 - accuracy: 0.9565
Epoch 461/500
 - 0s - loss: 0.0750 - accuracy: 0.9565
Epoch 462/500
 - 0s - loss: 0.0749 - accuracy: 0.9565
Epoch 463/500
 - 0s - loss: 0.0748 - accuracy: 0.9565
Epoch 464/500
 - 0s - loss: 0.0748 - accuracy: 0.9565
Epoch 465/500
 - 0s - loss: 0.0747 - accuracy: 0.9565
Epoch 466/500
 - 0s - loss: 0.0746 - accuracy: 0.9565
Epoch 467/500
 - 0s - loss: 0.0745 - accuracy: 0.9565
Epoch 468/500
 - 0s - loss: 0.0744 - accuracy: 0.9565
Epoch 469/500
 - 0s - loss: 0.0743 - accuracy: 0.9565
Epoch 470/500
 - 0s - loss: 0.0743 - accuracy: 0.9565
Epoch 471/500
 - 0s - loss: 0.0742 - accuracy: 0.9565
Epoch 472/500
 - 0s - loss: 0.0741 - accuracy: 0.9565
Epoch 473/500
 - 0s - loss: 0.0740 - accuracy: 0.9565
Epoch 474/500
 - 0s - loss: 0.0739 - accuracy: 0.9565
Epoch 475/500
 - 0s - loss: 0.0739 - accuracy: 0.9565
Epoch 476/500
 - 0s - loss: 0.0738 - accuracy: 0.9565
Epoch 477/500
 - 0s - loss: 

<keras.callbacks.callbacks.History at 0x23c3725f788>

In [11]:
# generate a sequence from a language model
def generate_seq(model,tokenizer,max_length,seed_text,n_words):
    """Extend ``seed_text`` with ``n_words`` greedily predicted words.

    At every step the text generated so far is re-encoded, padded on the
    left to ``max_length`` ids, and passed to the model; the id with the
    highest predicted probability is mapped back to its word and appended.
    The intermediate values of each step are printed.

    Args:
        model: trained model whose ``predict`` returns one row of class
            probabilities per input row.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: number of word ids passed to the model per step.
        seed_text: text to start from.
        n_words: number of words to generate.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. A predicted id with no entry in ``word_index`` contributes
        an empty word.
    """
    # invert the vocabulary once instead of scanning word_index on every step
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        print('Text To Sequence:',encoded)
        # pre_pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen = max_length, padding = 'pre')
        print('Padded Encoding :', encoded)
        # pick the most probable class per row; this replaces
        # Sequential.predict_classes, which newer Keras releases no longer provide
        yhat = model.predict(encoded,verbose = 0).argmax(axis = -1)
        print('Prediction:',yhat)
        # map predicted word index to word (single input row -> first entry)
        out_word = index_to_word.get(int(yhat[0]), '')
        # append to input
        in_text = in_text + ' ' + out_word
        print('Predicted Word:', out_word)
        print('Input Text:',in_text)
    return in_text

# evaluate model: seed with the opening two words and generate 5 more.
# max_length-1 matches the input width the model was built with, so only the
# last two words of the running text are fed to the model at each step.
print(generate_seq(model,tokenizer,max_length-1,'Jack and',5))

Text To Sequence: [2, 1]
Padded Encoding : [[2 1]]
Prediction: [3]
Predicted Word: jill
Input Text: Jack and jill
Text To Sequence: [2, 1, 3]
Padded Encoding : [[1 3]]
Prediction: [19]
Predicted Word: came
Input Text: Jack and jill came
Text To Sequence: [2, 1, 3, 19]
Padded Encoding : [[ 3 19]]
Prediction: [20]
Predicted Word: tumbling
Input Text: Jack and jill came tumbling
Text To Sequence: [2, 1, 3, 19, 20]
Padded Encoding : [[19 20]]
Prediction: [21]
Predicted Word: after
Input Text: Jack and jill came tumbling after
Text To Sequence: [2, 1, 3, 19, 20, 21]
Padded Encoding : [[20 21]]
Prediction: [21]
Predicted Word: after
Input Text: Jack and jill came tumbling after after
Jack and jill came tumbling after after


In [12]:
# seed with the first two words of the last line of the rhyme and generate 3 more
print(generate_seq(model, tokenizer, max_length-1, 'And Jill', 3))

Text To Sequence: [1, 3]
Padded Encoding : [[1 3]]
Prediction: [19]
Predicted Word: came
Input Text: And Jill came
Text To Sequence: [1, 3, 19]
Padded Encoding : [[ 3 19]]
Prediction: [20]
Predicted Word: tumbling
Input Text: And Jill came tumbling
Text To Sequence: [1, 3, 19, 20]
Padded Encoding : [[19 20]]
Prediction: [21]
Predicted Word: after
Input Text: And Jill came tumbling after
And Jill came tumbling after


In [13]:
# seed from the middle of the third line of the rhyme and generate 5 more words
print(generate_seq(model, tokenizer, max_length-1, 'fell down', 5))

Text To Sequence: [14, 15]
Padded Encoding : [[14 15]]
Prediction: [1]
Predicted Word: and
Input Text: fell down and
Text To Sequence: [14, 15, 1]
Padded Encoding : [[15  1]]
Prediction: [16]
Predicted Word: broke
Input Text: fell down and broke
Text To Sequence: [14, 15, 1, 16]
Padded Encoding : [[ 1 16]]
Prediction: [17]
Predicted Word: his
Input Text: fell down and broke his
Text To Sequence: [14, 15, 1, 16, 17]
Padded Encoding : [[16 17]]
Prediction: [18]
Predicted Word: crown
Input Text: fell down and broke his crown
Text To Sequence: [14, 15, 1, 16, 17, 18]
Padded Encoding : [[17 18]]
Prediction: [1]
Predicted Word: and
Input Text: fell down and broke his crown and
fell down and broke his crown and


In [14]:
# seed near the end of the second line; 5 generated words run on past that line's last word
print(generate_seq(model, tokenizer, max_length-1, 'pail of', 5))

Text To Sequence: [11, 12]
Padded Encoding : [[11 12]]
Prediction: [13]
Predicted Word: water
Input Text: pail of water
Text To Sequence: [11, 12, 13]
Padded Encoding : [[12 13]]
Prediction: [2]
Predicted Word: jack
Input Text: pail of water jack
Text To Sequence: [11, 12, 13, 2]
Padded Encoding : [[13  2]]
Prediction: [14]
Predicted Word: fell
Input Text: pail of water jack fell
Text To Sequence: [11, 12, 13, 2, 14]
Padded Encoding : [[ 2 14]]
Prediction: [15]
Predicted Word: down
Input Text: pail of water jack fell down
Text To Sequence: [11, 12, 13, 2, 14, 15]
Padded Encoding : [[14 15]]
Prediction: [1]
Predicted Word: and
Input Text: pail of water jack fell down and
pail of water jack fell down and
