In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.core import Dense, Activation
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq
import regex
# Fix the NumPy and TensorFlow global random seeds to the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read the
# corpus from, and save the model to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load data
# Read the whole corpus into memory and lower-case it.
path = "/content/drive/MyDrive/1661-0.txt"
# The context manager closes the file handle deterministically, and the
# explicit encoding keeps decoding independent of the platform default.
# NOTE(review): if the file starts with a UTF-8 BOM, "utf-8-sig" would strip
# it, but that changes the vocabulary size - confirm before switching.
with open(path, encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()

In [None]:
# Data preprocessing
# Build the character vocabulary (sorted, so ids are stable between runs) and
# the two lookup tables mapping characters to integer ids and back.
character = sorted(set(text))
indices_char = dict(enumerate(character))
char_indices = {ch: idx for idx, ch in indices_char.items()}

print(f'unique chars: {len(character)}')

unique chars: 73


In [None]:
# Slide a window of `seq_len` characters over the text, advancing `step`
# characters each time; the character right after each window is the target.
seq_len = 40
step = 3
window_starts = range(0, len(text) - seq_len, step)
sentences = [text[start: start + seq_len] for start in window_starts]
next_chars = [text[start + seq_len] for start in window_starts]
print(f'num training examples: {len(sentences)}')

num training examples: 193950


In [None]:
# Generating our features and labels (one-hot encoding)
# X[i, t, c] is True when character id c sits at position t of window i;
# y[i, c] is True when character id c is the one that follows window i.
# The builtin `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in later releases.
X = np.zeros((len(sentences), seq_len, len(character)), dtype=bool)
y = np.zeros((len(sentences), len(character)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

# Spot-check one window and one target; the notebook only echoes the last
# expression of a cell.
sentences[124]
next_chars[100]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = np.zeros((len(sentences), seq_len, len(character)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sentences), len(character)), dtype=np.bool)


'e'

In [None]:
# One-hot row for the first character of the first training window
X[0, 0]

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True])

In [None]:
# Axes are (number of windows, seq_len, vocabulary size), as allocated above.
X.shape

(193950, 40, 73)

In [None]:
# Print the one-hot row of the first character of the first window.
print(X[0, 0])

[ True False False ... False False False]


In [None]:
# Creating the model
# One LSTM layer reads the one-hot window; a dense layer followed by softmax
# yields one probability per character in the vocabulary.
vocab_size = len(character)
model = Sequential([
    LSTM(256, input_shape=(seq_len, vocab_size)),
    Dense(vocab_size),
    Activation('softmax'),
])
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_13 (LSTM)              (None, 256)               337920    
                                                                 
 dense_9 (Dense)             (None, 73)                18761     
                                                                 
 activation_9 (Activation)   (None, 73)                0         
                                                                 
Total params: 356,681
Trainable params: 356,681
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training our model
# Stop when validation loss has not improved for 25 epochs and roll back to
# the best weights seen.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=25,
    restore_best_weights=True,
)
optimizer = keras.optimizers.Adam(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
fit_result = model.fit(
    X,
    y,
    batch_size=128,
    epochs=50,
    validation_split=0.01,
    shuffle=True,
    callbacks=[early_stopping_cb],
)
# Keep only the per-epoch metrics dictionary.
history = fit_result.history

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50


In [None]:
# Model evaluation
# NOTE: X and y are the arrays passed to model.fit (including the slice held
# out through validation_split), so these are training-set metrics, not a
# held-out test score; the labels say so explicitly.
loss_and_acc = model.evaluate(X, y)
print("Training Loss", loss_and_acc[0])
print("Training Accuracy", loss_and_acc[1])

Test Loss 1.2740947008132935
Test Accuracy 0.6089301109313965


In [None]:
def prepare_input(text):
    """One-hot encode `text` into a single-sample model input.

    Args:
        text: String to encode. Only its last `seq_len` characters are used.

    Returns:
        Float array of shape (1, seq_len, len(character)). Row `t` holds 1.0
        in the column of the t-th character. Rows past the end of a short
        `text`, and rows for characters absent from `char_indices`, stay zero.
    """
    x = np.zeros((1, seq_len, len(character)))
    # Keep only the most recent characters: a longer string would otherwise
    # index past the second axis and raise IndexError.
    for t, char in enumerate(text[-seq_len:]):
        index = char_indices.get(char)
        # Characters that never occurred in the corpus have no column; leave
        # the row empty instead of raising KeyError.
        if index is not None:
            x[0, t, index] = 1.

    return x

In [None]:
#functions to get next probable characters
def sample(preds, top_n=3):
    """Return the indices of the `top_n` largest entries of `preds`.

    Args:
        preds: 1-D sequence of scores (here: softmax probabilities).
        top_n: Number of indices to return.

    Returns:
        List of integer indices ordered from highest to lowest score; equal
        scores keep their original (lowest index first) order.
    """
    preds = np.asarray(preds).astype('float64')
    # The former log -> exp -> renormalise round trip left the ranking
    # unchanged and only triggered a RuntimeWarning for zero probabilities,
    # so the scores are ranked directly.
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [None]:
def predict_completion(text, max_length=100):
    """Greedily extend `text` one character at a time until a word ends.

    Each step encodes the current window with `prepare_input`, takes the
    single most probable next character, appends it and drops the first
    character of the window.

    Args:
        text: Seed string used as the initial window.
        max_length: Upper bound on the number of generated characters. It
            guards against looping forever when the model never predicts a
            space.

    Returns:
        The generated characters. The string ends with ' ' unless the
        `max_length` cap was reached first.
    """
    completion = ''
    while len(completion) < max_length:
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_char = indices_char[next_index]
        text = text[1:] + next_char
        completion += next_char

        # A space marks the end of the word. The earlier extra length check
        # was always true, so this is the only real stop condition.
        if next_char == ' ':
            break
    return completion
def predict_completions(text, n=3):
    """Return `n` candidate continuations of `text`, most probable first.

    The `n` most probable next characters are each used as the start of a
    candidate, which `predict_completion` then extends.

    Args:
        text: Seed string used as the model input window.
        n: Number of candidates to produce.

    Returns:
        List of `n` strings, each one first character plus its completion.
    """
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    completions = []
    for idx in sample(preds, n):
        first_char = indices_char[idx]
        # Slide the window forward by one character before completing.
        rest = predict_completion(text[1:] + first_char)
        completions.append(first_char + rest)
    return completions

In [None]:
# Prediction
# Sample sentences; the prediction cell seeds the model with the start of each.
quotes = [
    "it is nothing but correct prediction by the model and got average accuracy.",
    "They work well for natural language processing tasks such as next word predi.",
    "The sequences are then padded to ensure they have the same length.",
    "After training, the accuracy of the model on the training data can be evaluated.",
    "Model was trained with long short term memory rnn.",
]

In [None]:
# Persist the trained model to Google Drive as a .h5 file.
model.save("/content/drive/MyDrive/nextword.h5")

In [None]:
# Reload the saved model from Drive, rebinding the in-memory `model`.
model = load_model("/content/drive/MyDrive/nextword.h5")

In [None]:
# Print the layer summary of the current `model`.
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_13 (LSTM)              (None, 256)               337920    
                                                                 
 dense_9 (Dense)             (None, 73)                18761     
                                                                 
 activation_9 (Activation)   (None, 73)                0         
                                                                 
Total params: 356,681
Trainable params: 356,681
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Seed the model with the first `seq_len` characters of each quote and print
# the five most likely completions. Slicing with `seq_len` instead of a
# literal keeps the seed length tied to the model's input window.
for q in quotes:
    seq = q[:seq_len].lower()
    print(seq)
    print(predict_completions(seq, 5))
    print()

it is nothing but correct prediction by 
['the ', 'a ', 'some ', 'my ', 'his ']

they work well for natural language proc
['ised ', 'essible ', 'less ', 'onted ', 'ate ']

the sequences are then padded to ensure 
['the ', 'and ', 'which ', 'of ', 'in ']

after training, the accuracy of the mode
[' that ', 'n ', 'd ', '. ', 'r ']

model was trained with long short term m
['an ', 'e ', 'ore ', 'inutes ', 'y ']

