Objective

Using the TensorFlow and Keras libraries, this notebook trains a recurrent neural network (an LSTM) to predict the next word from the five words that precede it.

### Installing the Libraries

In [None]:
!pip install numpy
!pip install tensorflow
!pip install keras
!pip install nltk



### Importing Libraries

In [None]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.core import Dense, Activation
from tensorflow.keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq

### Loading Dataset

In [None]:
# Plain-text training corpus, expected next to the notebook.
path = '1661-0.txt'
# Read through a context manager so the file handle is closed right away, and
# decode explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm if the file changes.
with open(path, encoding='utf-8') as corpus_file:
    # Lower-case everything so the vocabulary is case-insensitive.
    text = corpus_file.read().lower()
print('corpus length:', len(text))


corpus length: 584553


### Tokenizing the text into an ordered list of words, dropping punctuation and special characters

In [None]:
# Regex tokenizer that keeps runs of word characters; everything else
# (punctuation, whitespace) acts as a separator and is dropped.
tokenizer = RegexpTokenizer(pattern=r'\w+')
# Ordered list of every word token in the corpus.
words = tokenizer.tokenize(text)


In [None]:
# Sorted array of the distinct tokens: this is the model's vocabulary.
unique_words = np.unique(words)
# Reverse lookup: token -> its position in `unique_words` (the one-hot column).
unique_word_index = {token: position for position, token in enumerate(unique_words)}

### Feature engineering

In [None]:
# Number of context words fed to the model for each prediction.
WORD_LENGTH = 5
# Slide a window over the corpus: each sample is WORD_LENGTH consecutive words
# (the context) paired with the single word that follows them (the target).
window_starts = range(len(words) - WORD_LENGTH)
prev_words = [words[start:start + WORD_LENGTH] for start in window_starts]
next_words = [words[start + WORD_LENGTH] for start in window_starts]
# Show the first context/target pair as a sanity check.
print(prev_words[0])
print(next_words[0])

['project', 'gutenberg', 's', 'the', 'adventures']
of


### One-Hot encoding

In [None]:
# One-hot tensors:
#   X[sample, position, vocab_index] marks the word at each context position.
#   Y[sample, vocab_index] marks the target (next) word.
# bool dtype keeps the memory footprint of these very sparse arrays down.
vocab_size = len(unique_words)
X = np.zeros((len(prev_words), WORD_LENGTH, vocab_size), dtype=bool)
Y = np.zeros((len(next_words), vocab_size), dtype=bool)
for sample_idx, (context, target) in enumerate(zip(prev_words, next_words)):
    for position, token in enumerate(context):
        X[sample_idx, position, unique_word_index[token]] = 1
    Y[sample_idx, unique_word_index[target]] = 1

In [None]:
# One-hot row for the first word of the first training sample.
print(X[0, 0])


[False False False ... False False False]


### Building the model


In [None]:
# Single-layer LSTM over the WORD_LENGTH one-hot context vectors, followed by
# a dense projection back to vocabulary size and a softmax, so the output is a
# probability distribution over the next word.
vocab_size = len(unique_words)
model = Sequential([
    LSTM(128, input_shape=(WORD_LENGTH, vocab_size)),
    Dense(vocab_size),
    Activation('softmax'),
])

### Training


In [None]:
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and rejected by current Keras releases.
optimizer = RMSprop(learning_rate=0.01)
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# Hold out 5% of the samples for validation; keep only the per-epoch metrics
# dict (`.history`) so it can be pickled and plotted later.
history = model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=10, shuffle=True).history



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Persist the trained model and its training metrics.
model.save('keras_next_word_model.h5')
# Context managers guarantee the pickle file is flushed and closed.
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)

# Reload both artefacts so the rest of the notebook works from what is on disk.
model = load_model('keras_next_word_model.h5')
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

In [None]:
# Display the per-epoch metrics dict (training/validation loss and accuracy).
history


{'loss': [6.702389717102051,
  6.480857849121094,
  6.4400129318237305,
  6.377123832702637,
  6.292407035827637,
  6.202435493469238,
  6.126860618591309,
  6.066046237945557,
  6.013283729553223,
  5.967902183532715],
 'accuracy': [0.05287252366542816,
  0.053142376244068146,
  0.05357607454061508,
  0.06236567348241806,
  0.07102034240961075,
  0.08064842224121094,
  0.08869592100381851,
  0.09303289651870728,
  0.09784211218357086,
  0.10199597477912903],
 'val_loss': [7.280484676361084,
  7.213881015777588,
  7.185855865478516,
  7.131904125213623,
  7.114859104156494,
  7.035543918609619,
  6.996490001678467,
  6.955252647399902,
  6.941208839416504,
  6.907465934753418],
 'val_accuracy': [0.05382643640041351,
  0.05382643640041351,
  0.053643353283405304,
  0.06829000264406204,
  0.06902233511209488,
  0.07305016368627548,
  0.0774441584944725,
  0.07781032472848892,
  0.08073965460062027,
  0.08311973512172699]}

### Evaluation


In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(history['accuracy'])
plt.plot(history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# The second curve comes from `validation_split`, not a separate test set, so
# label it accordingly. The trailing ';' hides the Legend repr in the output.
plt.legend(['train', 'validation'], loc='upper left');

<matplotlib.legend.Legend at 0x7d8e34a677f0>

In [None]:

# Training vs. validation loss per epoch.
plt.plot(history['loss'])
plt.plot(history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# The second curve comes from `validation_split`, not a separate test set, so
# label it accordingly. The trailing ';' hides the Legend repr in the output.
plt.legend(['train', 'validation'], loc='upper left');

<matplotlib.legend.Legend at 0x7d8e34ed4c40>

### Prediction


In [None]:
def prepare_input(text):
    """One-hot encode a whitespace-separated word sequence for the model.

    Args:
        text: Lower-cased string of space-separated words.

    Returns:
        Float array of shape (1, WORD_LENGTH, len(unique_words)). Row ``t``
        has a single 1 at the vocabulary index of the ``t``-th word.

    Notes:
        * Only the last WORD_LENGTH words are used, so longer inputs no
          longer raise IndexError.
        * Words missing from the training vocabulary leave their row all-zero
          instead of raising KeyError.
        * Each word is echoed to stdout as it is encoded.
    """
    x = np.zeros((1, WORD_LENGTH, len(unique_words)))
    # Keep the most recent words; they are the context for the next word.
    context = text.split()[-WORD_LENGTH:]
    for t, word in enumerate(context):
        print(word)
        column = unique_word_index.get(word)
        if column is not None:
            x[0, t, column] = 1
    return x
prepare_input("It is not a lack".lower())

it
is
not
a
lack


array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [None]:

def sample(preds, top_n=3):
    """Return the indices of the ``top_n`` highest-scoring predictions.

    Args:
        preds: 1-D sequence/array of scores (e.g. softmax probabilities).
        top_n: How many indices to return; if it exceeds ``len(preds)`` all
            indices are returned.

    Returns:
        List of integer indices into ``preds``, best first. Ties keep their
        original order.
    """
    preds = np.asarray(preds).astype('float64')
    # The former log -> exp -> renormalise round-trip was an identity
    # transform that could not change the ranking, and it raised a
    # divide-by-zero RuntimeWarning whenever a probability was exactly 0.
    # Ranking the raw scores gives the same result without the warning.
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [None]:
def predict_completions(text, n=3):
    """Suggest the ``n`` most likely next words for ``text``.

    Args:
        text: Lower-cased, space-separated context words.
        n: Number of candidate words to return.

    Returns:
        List of the ``n`` vocabulary entries with the highest predicted
        probability, best first. For an empty string the sentinel string
        ``"0"`` is returned instead of a list.
    """
    if text == "":
        return "0"
    encoded = prepare_input(text)
    # predict() returns one row per input sample; there is only one here.
    probabilities = model.predict(encoded, verbose=0)[0]
    return [unique_words[index] for index in sample(probabilities, n)]

In [None]:
# Demo: feed the first WORD_LENGTH words of a known sentence to the model and
# compare its suggestions with the word that really follows.
q =  "There is nothing more deceptive than an obvious fact"
print("correct sentence: ",q)
# Slice with WORD_LENGTH rather than a hard-coded 5 so the demo stays in sync
# with the window size the model was trained on.
seq = " ".join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))


correct sentence:  There is nothing more deceptive than an obvious fact
Sequence:  there is nothing more deceptive
there
is
nothing
more
deceptive
next possible words:  ['to', 'and', 'i', 'the', 'in']
