In [1]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.core import Dense, Activation
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq

Using TensorFlow backend.


In [40]:
book_file_path = 'book.txt'
text = open(book_file_path).read().lower()
tok = RegexpTokenizer(r'\w+')
words = tok.tokenize(text)

corpus length: 581887
109226
['into', 'his', 'own', 'delicate', 'and']


In [59]:
words = np.unique(words)
word_index = dict((c, i) for i, c in enumerate(words))


LENGTH = 5
previous_words = []
next_word = []
for i in range(len(words) - LENGTH):
#     print(words[i:i + WORD_LENGTH])
#     print(words[i + WORD_LENGTH])
    previous_words.append(words[i:i + LENGTH])
    next_word.append(words[i + LENGTH])



X = np.zeros((len(previous_words), LENGTH, len(words)), dtype=bool)
Y = np.zeros((len(next_word), len(words)), dtype=bool)
for i, list_of_prev_words in enumerate(previous_words):
    for j, word in enumerate(list_of_prev_words):
        X[i, j, word_index[word]] = True
    Y[i, word_index[next_word[i]]] = True

print(X[0][0])
print(len(X))

['the', 'adventures', 'of', 'sherlock', 'holmes']
by
[False False False ... False False False]
109221


In [60]:
model = Sequential()
model.add(LSTM(128, input_shape=(LENGTH, len(words))))
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dense(len(words)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
history = model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=2,
                    shuffle=True).history

# model.save('model.h5')
# pickle.dump(history, open("history.pickle", "wb"))
model = load_model('model.h5')
history = pickle.load(open("history.pickle", "rb"))

Train on 103759 samples, validate on 5462 samples
Epoch 1/2
Epoch 2/2


In [8]:
def prepare_input(txt):
    x = np.zeros((1, LENGTH, len(words)))
    for t, w in enumerate(txt.split()):
        print(w)
        x[0, t, word_index[w]] = 1
    return x


def sample(predictions, top_n=3):
    predictions = np.asarray(predictions).astype('float64')
    predictions = np.log(predictions)
    exp_predictions = np.exp(predictions)
    predictions = exp_predictions / np.sum(exp_predictions)

    return heapq.nlargest(top_n, range(len(predictions)), predictions.take)


def predict(txt, n=3):
    if text == "":
        return "0"
    x = prepare_input(txt)
    predictions = model.predict(x, verbose=0)[0]
    next_indices = sample(predictions, n)
    return [words[idx] for idx in next_indices]


In [38]:
q = "I am still learning English, so please speak slowly"
# print("correct sentence: ", q)
seq = " ".join(tok.tokenize(q.lower())[1:6])

print("Sequence: ", seq)
print("next possible words: ", predict(seq, 7))

Sequence:  am still learning english so
am
still
learning
english
so
next possible words:  ['that', 'i', 'a', 'you', 'the', 'as', 'and']
