#### import libraries and modules

In [28]:
from numpy import array

In [29]:
from keras.preprocessing.text import Tokenizer

In [30]:
from keras.utils import to_categorical

In [31]:
from keras.preprocessing.sequence import pad_sequences

In [32]:
from keras.models import Sequential

In [33]:
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

#### generate a sequence from language model

In [34]:
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    in_text = seed_text
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        yhat = model.predict_classes(encoded, verbose=0)
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        in_text += '' + out_word
    return in_text

In [35]:
data = """Jack and Jill went up the hill
To fetch a pail of water
Jack fell down and broke his crown
And Jill came tumbling after
"""

In [36]:
data

'Jack and Jill went up the hill\nTo fetch a pail of water\nJack fell down and broke his crown\nAnd Jill came tumbling after\n'

In [37]:
tokenizer = Tokenizer()

In [38]:
tokenizer.fit_on_texts([data])

In [39]:
tokenizer.word_index

{'a': 10,
 'after': 21,
 'and': 1,
 'broke': 16,
 'came': 19,
 'crown': 18,
 'down': 15,
 'fell': 14,
 'fetch': 9,
 'hill': 7,
 'his': 17,
 'jack': 2,
 'jill': 3,
 'of': 12,
 'pail': 11,
 'the': 6,
 'to': 8,
 'tumbling': 20,
 'up': 5,
 'water': 13,
 'went': 4}

In [40]:
tokenizer.word_counts

OrderedDict([('jack', 2),
             ('and', 3),
             ('jill', 2),
             ('went', 1),
             ('up', 1),
             ('the', 1),
             ('hill', 1),
             ('to', 1),
             ('fetch', 1),
             ('a', 1),
             ('pail', 1),
             ('of', 1),
             ('water', 1),
             ('fell', 1),
             ('down', 1),
             ('broke', 1),
             ('his', 1),
             ('crown', 1),
             ('came', 1),
             ('tumbling', 1),
             ('after', 1)])

In [41]:
tokenizer.word_docs

{'a': 1,
 'after': 1,
 'and': 1,
 'broke': 1,
 'came': 1,
 'crown': 1,
 'down': 1,
 'fell': 1,
 'fetch': 1,
 'hill': 1,
 'his': 1,
 'jack': 1,
 'jill': 1,
 'of': 1,
 'pail': 1,
 'the': 1,
 'to': 1,
 'tumbling': 1,
 'up': 1,
 'water': 1,
 'went': 1}

In [42]:
tokenizer.texts_to_sequences([data])

[[2,
  1,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  2,
  14,
  15,
  1,
  16,
  17,
  18,
  1,
  3,
  19,
  20,
  21]]

In [43]:
encoded = tokenizer.texts_to_sequences([data])[0]

In [44]:
encoded

[2,
 1,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 2,
 14,
 15,
 1,
 16,
 17,
 18,
 1,
 3,
 19,
 20,
 21]

In [45]:
vocab_size = len(tokenizer.word_index) + 1

In [46]:
sequences = list()

In [47]:
for i in range(2, len(encoded)):
    sequence = encoded[i-2:i+1]
    sequences.append(sequence)

In [48]:
sequence

[19, 20, 21]

In [49]:
sequences

[[2, 1, 3],
 [1, 3, 4],
 [3, 4, 5],
 [4, 5, 6],
 [5, 6, 7],
 [6, 7, 8],
 [7, 8, 9],
 [8, 9, 10],
 [9, 10, 11],
 [10, 11, 12],
 [11, 12, 13],
 [12, 13, 2],
 [13, 2, 14],
 [2, 14, 15],
 [14, 15, 1],
 [15, 1, 16],
 [1, 16, 17],
 [16, 17, 18],
 [17, 18, 1],
 [18, 1, 3],
 [1, 3, 19],
 [3, 19, 20],
 [19, 20, 21]]

In [50]:
max_length = max([len(seq) for seq in sequences])

In [51]:
max_length

3

In [52]:
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

In [53]:
sequences

array([[ 2,  1,  3],
       [ 1,  3,  4],
       [ 3,  4,  5],
       [ 4,  5,  6],
       [ 5,  6,  7],
       [ 6,  7,  8],
       [ 7,  8,  9],
       [ 8,  9, 10],
       [ 9, 10, 11],
       [10, 11, 12],
       [11, 12, 13],
       [12, 13,  2],
       [13,  2, 14],
       [ 2, 14, 15],
       [14, 15,  1],
       [15,  1, 16],
       [ 1, 16, 17],
       [16, 17, 18],
       [17, 18,  1],
       [18,  1,  3],
       [ 1,  3, 19],
       [ 3, 19, 20],
       [19, 20, 21]], dtype=int32)

In [54]:
sequences = array(sequences)

In [55]:
sequences

array([[ 2,  1,  3],
       [ 1,  3,  4],
       [ 3,  4,  5],
       [ 4,  5,  6],
       [ 5,  6,  7],
       [ 6,  7,  8],
       [ 7,  8,  9],
       [ 8,  9, 10],
       [ 9, 10, 11],
       [10, 11, 12],
       [11, 12, 13],
       [12, 13,  2],
       [13,  2, 14],
       [ 2, 14, 15],
       [14, 15,  1],
       [15,  1, 16],
       [ 1, 16, 17],
       [16, 17, 18],
       [17, 18,  1],
       [18,  1,  3],
       [ 1,  3, 19],
       [ 3, 19, 20],
       [19, 20, 21]], dtype=int32)

In [56]:
X, y = sequences[:,:-1], sequences[:,-1]

In [57]:
X

array([[ 2,  1],
       [ 1,  3],
       [ 3,  4],
       [ 4,  5],
       [ 5,  6],
       [ 6,  7],
       [ 7,  8],
       [ 8,  9],
       [ 9, 10],
       [10, 11],
       [11, 12],
       [12, 13],
       [13,  2],
       [ 2, 14],
       [14, 15],
       [15,  1],
       [ 1, 16],
       [16, 17],
       [17, 18],
       [18,  1],
       [ 1,  3],
       [ 3, 19],
       [19, 20]], dtype=int32)

In [58]:
y

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  2, 14, 15,  1, 16, 17,
       18,  1,  3, 19, 20, 21], dtype=int32)

In [59]:
y = to_categorical(y, num_classes=vocab_size)

In [60]:
y

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

#### define the model

In [61]:
model = Sequential()

In [62]:
model.add(Embedding(vocab_size, 10, input_length=max_length-1))

In [63]:
model.add(LSTM(50))

In [64]:
? LSTM()

In [65]:
model.add(Dense(vocab_size, activation='softmax'))

In [66]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 10)             220       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________


In [67]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [68]:
model.fit(X,y, epochs=500, verbose=2)

Epoch 1/500
 - 1s - loss: 3.0906 - acc: 0.0435
Epoch 2/500
 - 0s - loss: 3.0898 - acc: 0.0435
Epoch 3/500
 - 0s - loss: 3.0889 - acc: 0.1739
Epoch 4/500
 - 0s - loss: 3.0881 - acc: 0.1304
Epoch 5/500
 - 0s - loss: 3.0873 - acc: 0.1304
Epoch 6/500
 - 0s - loss: 3.0864 - acc: 0.1304
Epoch 7/500
 - 0s - loss: 3.0855 - acc: 0.1304
Epoch 8/500
 - 0s - loss: 3.0846 - acc: 0.1304
Epoch 9/500
 - 0s - loss: 3.0837 - acc: 0.1304
Epoch 10/500
 - 0s - loss: 3.0828 - acc: 0.1304
Epoch 11/500
 - 0s - loss: 3.0819 - acc: 0.1304
Epoch 12/500
 - 0s - loss: 3.0809 - acc: 0.1304
Epoch 13/500
 - 0s - loss: 3.0800 - acc: 0.1304
Epoch 14/500
 - 0s - loss: 3.0789 - acc: 0.1304
Epoch 15/500
 - 0s - loss: 3.0779 - acc: 0.1304
Epoch 16/500
 - 0s - loss: 3.0769 - acc: 0.1304
Epoch 17/500
 - 0s - loss: 3.0758 - acc: 0.1304
Epoch 18/500
 - 0s - loss: 3.0746 - acc: 0.1304
Epoch 19/500
 - 0s - loss: 3.0735 - acc: 0.1304
Epoch 20/500
 - 0s - loss: 3.0723 - acc: 0.1304
Epoch 21/500
 - 0s - loss: 3.0711 - acc: 0.1304
E

Epoch 171/500
 - 0s - loss: 0.7249 - acc: 0.9565
Epoch 172/500
 - 0s - loss: 0.7043 - acc: 0.9565
Epoch 173/500
 - 0s - loss: 0.6841 - acc: 0.9565
Epoch 174/500
 - 0s - loss: 0.6643 - acc: 0.9565
Epoch 175/500
 - 0s - loss: 0.6449 - acc: 0.9565
Epoch 176/500
 - 0s - loss: 0.6258 - acc: 0.9565
Epoch 177/500
 - 0s - loss: 0.6072 - acc: 0.9565
Epoch 178/500
 - 0s - loss: 0.5889 - acc: 0.9565
Epoch 179/500
 - 0s - loss: 0.5711 - acc: 0.9565
Epoch 180/500
 - 0s - loss: 0.5537 - acc: 0.9565
Epoch 181/500
 - 0s - loss: 0.5368 - acc: 0.9565
Epoch 182/500
 - 0s - loss: 0.5202 - acc: 0.9565
Epoch 183/500
 - 0s - loss: 0.5041 - acc: 0.9565
Epoch 184/500
 - 0s - loss: 0.4884 - acc: 0.9565
Epoch 185/500
 - 0s - loss: 0.4732 - acc: 0.9565
Epoch 186/500
 - 0s - loss: 0.4583 - acc: 0.9565
Epoch 187/500
 - 0s - loss: 0.4440 - acc: 0.9565
Epoch 188/500
 - 0s - loss: 0.4300 - acc: 0.9565
Epoch 189/500
 - 0s - loss: 0.4165 - acc: 0.9565
Epoch 190/500
 - 0s - loss: 0.4035 - acc: 0.9565
Epoch 191/500
 - 0s 

 - 0s - loss: 0.0820 - acc: 0.9565
Epoch 339/500
 - 0s - loss: 0.0818 - acc: 0.9565
Epoch 340/500
 - 0s - loss: 0.0816 - acc: 0.9565
Epoch 341/500
 - 0s - loss: 0.0814 - acc: 0.9565
Epoch 342/500
 - 0s - loss: 0.0813 - acc: 0.9565
Epoch 343/500
 - 0s - loss: 0.0811 - acc: 0.9565
Epoch 344/500
 - 0s - loss: 0.0809 - acc: 0.9565
Epoch 345/500
 - 0s - loss: 0.0807 - acc: 0.9565
Epoch 346/500
 - 0s - loss: 0.0806 - acc: 0.9565
Epoch 347/500
 - 0s - loss: 0.0804 - acc: 0.9565
Epoch 348/500
 - 0s - loss: 0.0802 - acc: 0.9565
Epoch 349/500
 - 0s - loss: 0.0801 - acc: 0.9565
Epoch 350/500
 - 0s - loss: 0.0799 - acc: 0.9565
Epoch 351/500
 - 0s - loss: 0.0797 - acc: 0.9565
Epoch 352/500
 - 0s - loss: 0.0796 - acc: 0.9565
Epoch 353/500
 - 0s - loss: 0.0794 - acc: 0.9565
Epoch 354/500
 - 0s - loss: 0.0793 - acc: 0.9565
Epoch 355/500
 - 0s - loss: 0.0791 - acc: 0.9565
Epoch 356/500
 - 0s - loss: 0.0790 - acc: 0.9565
Epoch 357/500
 - 0s - loss: 0.0788 - acc: 0.9565
Epoch 358/500
 - 0s - loss: 0.0787

<keras.callbacks.History at 0x7fe458ea3dd8>

In [69]:
print(generate_seq(model, tokenizer, max_length-1, 'Jack and', 5))

Jack andjilldowndowndowndown


In [70]:
print(generate_seq(model, tokenizer, max_length-1, 'And Jill', 3))

And Jillwentjilljill
