In [31]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dropout
from keras.layers.core import Dense, Activation
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq

In [84]:
# Read the whole training corpus into memory and lower-case it. The `with`
# block closes the file as soon as it has been read instead of leaving the
# handle open for the rest of the kernel session.
with open('xab') as corpus_file:
    text = corpus_file.read().lower()  # only 2MB
print('corpus length:', len(text))

corpus length: 2000000


In [85]:
# Word-level tokenizer: r'\w+' matches runs of letters, digits and underscores,
# so punctuation and whitespace never become tokens.
tokenizer = RegexpTokenizer(r'\w+')
# Every token of the lower-cased corpus, as one flat list.
words = tokenizer.tokenize(text)

In [86]:
# Vocabulary and its word -> column lookup. The vocabulary is sorted so that
# every word gets the same index on every run: the iteration order of a set of
# strings changes between interpreter sessions, which would silently scramble
# the meaning of the one-hot columns for a model saved in one session and
# reloaded in another.
unique_words = sorted(set(words))
unique_word_index = {word: index for index, word in enumerate(unique_words)}

In [99]:
word_window = 5  # Number of words to use to predict the next one

# Slide a window of `word_window` tokens over the corpus one token at a time.
# Training data: the tokens inside each window.
sentences = [
    words[start:start + word_window]
    for start in range(len(words) - word_window)
]
# Training labels: the token that immediately follows each window.
next_words = [
    words[start + word_window]
    for start in range(len(words) - word_window)
]

In [88]:
# One-hot encode the training set.
#   X[sample, position, column] is True when the word at `position` of that
#   window is the vocabulary entry `column`.
#   Y[sample, column] is True for the word that follows the window.
X = np.zeros((len(sentences), word_window, len(unique_words)), dtype=bool)
Y = np.zeros((len(next_words), len(unique_words)), dtype=bool)
for row, (context, target) in enumerate(zip(sentences, next_words)):
    for position, token in enumerate(context):
        X[row, position, unique_word_index[token]] = True
    Y[row, unique_word_index[target]] = True

In [32]:
# Three stacked 128-unit LSTM layers with dropout after the first two, followed
# by a softmax over the whole vocabulary. Only the last LSTM collapses the
# sequence (return_sequences=False) so the Dense layer sees one vector per sample.
model = Sequential([
    LSTM(128, input_shape=(word_window, len(unique_words)), return_sequences=True),
    Dropout(0.2),
    LSTM(units=128, return_sequences=True),
    Dropout(0.2),
    LSTM(units=128, return_sequences=False),
    Dense(len(unique_words), activation='softmax'),
])

In [33]:
# Categorical cross-entropy matches the one-hot targets in Y.
model.compile(
    optimizer=RMSprop(lr=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train for 5 epochs with 5% of the samples held out for validation; only the
# per-epoch metrics dictionary is kept, not the History object itself.
history = model.fit(
    X,
    Y,
    batch_size=128,
    epochs=5,
    validation_split=0.05,
    shuffle=True,
).history

Train on 303937 samples, validate on 15997 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
# Persist the trained network and its training metrics. The `with` block
# guarantees the pickle file is flushed and closed before the cell finishes.
model.save('model1.h5')
with open("history_1.p", "wb") as history_file:
    pickle.dump(history, history_file)

In [37]:
# Restore the trained network and its training metrics from disk.
model = load_model('model1.h5')
# NOTE: unpickling can execute arbitrary code, so only load a history file
# that this notebook wrote itself. The `with` block closes the file afterwards.
with open("history_1.p", "rb") as history_file:
    history = pickle.load(history_file)

In [89]:
def prepare_input(text, verbose=True):
    """One-hot encode a whitespace-separated prompt for the model.

    Args:
        text: Prompt whose words are separated by whitespace. Only the last
            ``word_window`` words are encoded; earlier ones are dropped.
        verbose: When true (the default), print each encoded word, as this
            function has always done. Pass ``False`` to silence the output.

    Returns:
        A float array of shape ``(1, word_window, len(unique_words))``. Words
        are written from time step 0 onwards; unused trailing time steps and
        words that are missing from ``unique_word_index`` stay all-zero.
    """
    x = np.zeros((1, word_window, len(unique_words)))
    prompt_words = text.split()
    # Keep only the most recent words: writing more than `word_window` time
    # steps would index past the end of the array.
    recent_words = prompt_words[max(len(prompt_words) - word_window, 0):]
    for t, word in enumerate(recent_words):
        if verbose:
            print(word)
        column = unique_word_index.get(word)
        # An out-of-vocabulary word has no column; leave its time step empty
        # rather than aborting the whole prediction with a KeyError.
        if column is not None:
            x[0, t, column] = 1
    return x

In [90]:
def sample(preds, top_n=3):
    """Return the indices of the ``top_n`` largest entries of ``preds``.

    Despite the name, nothing is drawn at random: the result is a
    deterministic ranking.

    Args:
        preds: One-dimensional sequence or array of scores (for example the
            softmax output for a single sample).
        top_n: How many indices to return.

    Returns:
        A list of integer indices into ``preds``, ordered from the highest
        score to the lowest. Equal scores keep their original index order.
        The list is shorter than ``top_n`` when ``preds`` has fewer entries.
    """
    scores = np.asarray(preds).astype('float64')
    # The scores are ranked as they are. Taking log, then exp, then
    # renormalising cannot change the order, and log(0) raises a
    # divide-by-zero RuntimeWarning whenever a probability underflows to 0.
    return heapq.nlargest(top_n, range(len(scores)), scores.take)

In [91]:
def predict_completions(text, n=3):
    """Return the ``n`` words the model ranks highest as the next word.

    Args:
        text: Whitespace-separated prompt, already lower-cased and tokenized.
        n: Number of candidate words to return.

    Returns:
        A list of vocabulary words, best candidate first. For an empty
        ``text`` the string ``"0"`` is returned instead of a list.
    """
    if text == "":
        return "0"
    encoded_prompt = prepare_input(text)
    # predict() returns one row per sample; the batch holds a single prompt.
    probabilities = model.predict(encoded_prompt, verbose=0)[0]
    return [unique_words[index] for index in sample(probabilities, n)]

In [92]:
def get_predictions(text):
    """Print the five most likely next words for ``text``.

    The prompt is lower-cased and tokenized with the same tokenizer that
    built the vocabulary. Only the last ``word_window`` tokens are passed to
    the model, because the next word follows the end of the prompt, not its
    beginning.

    Args:
        text: Free-form prompt; any casing and punctuation are accepted.

    Returns:
        None. The candidate words are printed.
    """
    tokens = tokenizer.tokenize(text.lower())
    # Use the model's context size instead of a hard-coded 5, and keep the
    # most recent tokens when the prompt is longer than the window.
    seq = " ".join(tokens[max(len(tokens) - word_window, 0):])
    print("Next Possible Words: ", predict_completions(seq, 5))

In [93]:
# Five-word prompt, i.e. exactly one full context window (word_window = 5).
get_predictions("Thank you for letting me")

thank
you
for
letting
me
Next Possible Words:  ['if', 'you', 'and', 'for', 'in']


In [94]:
# Another five-word prompt; the five candidate next words are printed, not returned.
get_predictions("If you have any questions")

if
you
have
any
questions
Next Possible Words:  ['or', 'please', 'messagefrom', 'if', 'thank']


In [95]:
# Mixed-case prompt: get_predictions lower-cases it before tokenizing.
get_predictions("Hope we can connect with")

hope
we
can
connect
with
Next Possible Words:  ['a', 'you', 'original', 'email', 'your']


In [96]:
# Four-word prompt, one short of word_window: the final time step of the
# model input stays all-zero.
get_predictions("Let us schedule a")

let
us
schedule
a
Next Possible Words:  ['of', 'and', 'party', 'you', 'game']


In [97]:
# Prompt phrased as a request; its five words fill the context window.
get_predictions("Can you please send us")

can
you
please
send
us
Next Possible Words:  ['to', 'me', 'out', 'your', 'a']


In [98]:
# Prompt phrased as a question; its five words fill the context window.
get_predictions("What do you think about")

what
do
you
think
about
Next Possible Words:  ['you', 'the', 'beth', 'my', 'i']
