# Next Word Prediction:

### Importing The Required Libraries:

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [7]:
"""
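    Dataset (original example): http://www.gutenberg.org/cache/epub/5200/pg5200.txt
    Remove all the unnecessary data (the text before and after the actual book)
    and save the result as a cleaned file, e.g. Metamorphosis-clean.
    NOTE: this notebook loads ../data/praire-clean.txt instead, so the dataset
    link above may not match the text used here -- TODO confirm the source.
    The first and last lines of the cleaned file should match the output printed below.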

"""


# Read the cleaned corpus into a list of lines (each keeps its trailing
# newline). The context manager closes the file handle as soon as reading is
# done; the bare open() used before never closed it.
with open("../data/praire-clean.txt", "r", encoding = "utf8") as file:
    lines = file.readlines()

# Sanity check: show where the loaded text starts and ends.
print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])

The First Line:  I opened my eyes and saw a pea-green world all around me. Then I heard

The Last Line:  just what had kept him away so long!


In [8]:
# Optional: uncomment to keep only the first 1500 lines (smaller corpus for quicker experiments).
#lines=lines[:1500]

### Cleaning the data:

In [9]:
# Concatenate all lines into one string, separated by single spaces.
# One join is sufficient: the previous `for i in lines` loop recomputed the
# exact same join once per line (quadratic work, identical result). An empty
# `lines` list still yields "" as before.
data = ' '.join(lines)

# Newline stripping is left disabled; the later `data.split()` already splits
# on any whitespace, including the embedded '\n' characters.
#data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:360]

'I opened my eyes and saw a pea-green world all around me. Then I heard\n the doctor say: "Give \'er another whiff or two." His voice sounded\n far-away, as though he were speaking through the Simplon Tunnel, and\n not merely through his teeth, within twelve inches of my nose.\n \n I took my whiff or two. I gulped at that chloroform like a thirsty\n Bedouin at a wad'

In [10]:
import string

# Translation table mapping the code point of every ASCII punctuation
# character to the code point of a space.
translator = dict.fromkeys(map(ord, string.punctuation), ord(' '))

# Apply the table to the joined text.
# NOTE(review): the result lives in `new_data`, but the later cells visible in
# this notebook keep reading `data`, so this punctuation-free copy is only
# displayed here -- confirm whether it was meant to be used downstream.
new_data = data.translate(translator)

new_data[:500]

'I opened my eyes and saw a pea green world all around me  Then I heard\n the doctor say   Give  er another whiff or two   His voice sounded\n far away  as though he were speaking through the Simplon Tunnel  and\n not merely through his teeth  within twelve inches of my nose \n \n I took my whiff or two  I gulped at that chloroform like a thirsty\n Bedouin at a wadi spring  I went down into the pea green emptiness\n again  and forgot about the Kelly pad and the recurring waves of pain\n that came bigger '

In [11]:
# Keep only the first occurrence of every whitespace-separated token, in the
# order the tokens first appear. dict.fromkeys de-duplicates in linear time
# while preserving insertion order, replacing the quadratic
# `if i not in z` membership scan over a growing list.
# NOTE(review): after this step each distinct token appears once, so `data`
# is no longer running text -- confirm this is intended for next-word training.
z = list(dict.fromkeys(data.split()))

data = ' '.join(z)
data[:500]

'I opened my eyes and saw a pea-green world all around me. Then heard the doctor say: "Give \'er another whiff or two." His voice sounded far-away, as though he were speaking through Simplon Tunnel, not merely his teeth, within twelve inches of nose. took two. gulped at that chloroform like thirsty Bedouin wadi-spring. went down into emptiness again, forgot about Kelly pad recurring waves pain came bigger tried to sweep racked old body breakers ribs stranded schooner. hateful metallic clink steel '

### Tokenization:

In [12]:
# Fit a word-level tokenizer on the prepared text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function.
# A context manager guarantees the pickle file is flushed and closed; the
# previous `pickle.dump(tokenizer, open(...))` never closed its handle.
with open('../model/tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Encode the text as one flat list of integer word ids.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[31, 2618, 403, 404, 1, 657, 24, 2619, 87, 88]

In [13]:
# Number of classes for the embedding and output layers: one per word known to
# the tokenizer, plus one because Keras' Tokenizer starts word ids at 1 and
# leaves index 0 reserved.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

8595


In [14]:
# Pair every token id with the id that follows it: [current_word, next_word].
sequences = [
    [current_id, next_id]
    for current_id, next_id in zip(sequence_data, sequence_data[1:])
]

print("The Length of sequences are: ", len(sequences))

# Convert the list of pairs to an array for the slicing and training steps.
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  14737


array([[  31, 2618],
       [2618,  403],
       [ 403,  404],
       [ 404,    1],
       [   1,  657],
       [ 657,   24],
       [  24, 2619],
       [2619,   87],
       [  87,   88],
       [  88,  146]])

In [15]:
# Split each [current, next] pair: the first id is the model input (X),
# the second id is the word the model should predict (y).
X = np.array([pair[0] for pair in sequences])
y = np.array([pair[1] for pair in sequences])

In [16]:
# Preview the first five inputs next to the targets they should predict;
# each target is the input that follows it in the sequence.
first_inputs, first_targets = X[:5], y[:5]
print("The Data is: ", first_inputs)
print("The responses are: ", first_targets)

The Data is:  [  31 2618  403  404    1]
The responses are:  [2618  403  404    1  657]


In [17]:
# One-hot encode the integer targets so they match the softmax output and the
# categorical cross-entropy loss.
# The rank guard makes this cell safe to re-run: `y` is overwritten in place,
# and calling to_categorical on the already-encoded 2-D matrix would encode
# it a second time and produce a wrong, far larger target array.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Creating the Model:

In [18]:
# Next-word classifier: one input token id -> embedding -> two stacked LSTMs
# -> dense hidden layer -> softmax over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    # return_sequences=True keeps the time axis so the second LSTM receives a sequence.
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

2022-12-07 16:39:23.343625: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [19]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             85950     
                                                                 
 lstm (LSTM)                 (None, 1, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 8595)              8603595   
                                                                 
Total params: 21,738,545
Trainable params: 21,738,545
Non-trainable params: 0
_________________________________________________________________


### Plot The Model:

In [20]:
from tensorflow import keras
# Import plot_model from the public tf.keras path. The previous
# `from keras.utils.vis_utils import plot_model` reached into a private module
# of the standalone `keras` package (absent in newer releases) and the name it
# imported was never used.
from tensorflow.keras.utils import plot_model

# Render the architecture diagram to model.png (requires pydot and graphviz).
plot_model(model, to_file='model.png', show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


### Callbacks:

In [21]:
from tensorflow.keras.callbacks import (
    ModelCheckpoint,
    ReduceLROnPlateau,
    TensorBoard,
)

# Write the model to disk, keeping only the version with the best training loss.
checkpoint = ModelCheckpoint(
    "../model/nextword1.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

# Scale the learning rate by `factor` when the training loss plateaus for
# `patience` epochs, with `min_lr` as the lower bound.
reduce = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)

# Log training metrics for TensorBoard under this directory.
logdir = 'logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

### Compile The Model:

In [22]:
# Configure training. The optimizer takes `learning_rate`; the old `lr`
# keyword is a deprecated alias that newer Keras optimizers warn about or reject.
model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)



### Fit The Model:

In [None]:
# Train on the (input word id, one-hot next word) pairs. The callbacks save
# the best-loss checkpoint, lower the learning rate on plateaus and write
# TensorBoard logs.
model.fit(
    X,
    y,
    epochs=150,
    batch_size=1000,
    callbacks=[checkpoint, reduce, tensorboard_Visualization],
)

In [25]:
# serialize model to JSON
from tensorflow.keras.models import Sequential, model_from_json
model_json = model.to_json()
with open("../model/nextword.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("../model/nextword.h5")
print("Saved model to disk")

# later...

# load json and create model
# Read through a context manager so the handle is closed even if read()
# raises, matching how the file is written above (previously it was opened
# bare and closed by hand).
with open('../model/nextword.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("../model/nextword.h5")
print("Loaded model from disk")

Saved model to disk
Loaded model from disk
