# Next Word Prediction:

### Importing The Required Libraries:

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

2022-12-09 02:13:19.258809: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the cleaned corpus into memory, one list entry per line of the file.
# The context manager closes the handle as soon as reading is done; a bare
# open() would keep it open for as long as the kernel lives.
with open("../data/praire-clean.txt", "r", encoding="utf8") as file:
    lines = list(file)

### Cleaning the data:

In [3]:
# Merge every line of the corpus into one space-separated string.
# A single join is sufficient: looping over `lines` and re-joining the whole
# list on every iteration produces the same string at quadratic cost.
data = ' '.join(lines)

# Remove line-break characters and the byte-order mark (U+FEFF).
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')

In [4]:
import string

# Translation table that sends every ASCII punctuation character to one space.
translator = str.maketrans({symbol: ' ' for symbol in string.punctuation})
new_data = data.translate(translator)

# NOTE(review): the later cells visible in this notebook keep reading `data`,
# not `new_data` -- confirm whether the punctuation-free text was meant to be used.

# Preview the first 500 characters of the punctuation-free text.
new_data[:500]

'I opened my eyes and saw a pea green world all around me  Then I heard the doctor say   Give  er another whiff or two   His voice sounded far away  as though he were speaking through the Simplon Tunnel  and not merely through his teeth  within twelve inches of my nose   I took my whiff or two  I gulped at that chloroform like a thirsty Bedouin at a wadi spring  I went down into the pea green emptiness again  and forgot about the Kelly pad and the recurring waves of pain that came bigger and bigg'

In [5]:
# Keep only the first occurrence of each whitespace-separated token, in order
# of first appearance. dict.fromkeys() preserves insertion order and checks
# membership in constant time, so this avoids the quadratic cost of testing
# `token not in list` for every token.
z = list(dict.fromkeys(data.split()))

# NOTE(review): after de-duplication, neighbouring tokens are no longer
# guaranteed to be neighbouring words of the original text -- confirm this is
# the intended input for building (current word, next word) pairs.
data = ' '.join(z)
data[:500]

'I opened my eyes and saw a pea-green world all around me. Then heard the doctor say: "Give \'er another whiff or two." His voice sounded far-away, as though he were speaking through Simplon Tunnel, not merely his teeth, within twelve inches of nose. took two. gulped at that chloroform like thirsty Bedouin wadi-spring. went down into emptiness again, forgot about Kelly pad recurring waves pain came bigger tried to sweep racked old body breakers ribs stranded schooner. hateful metallic clink steel '

### Tokenization:

In [7]:
# Build the word -> integer id vocabulary from the prepared text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Saving the tokenizer for the predict function. The context manager makes
# sure the pickle file is flushed and closed right after the dump.
with open('../model/tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Encode the text as a flat list of integer ids and preview the first ten.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[31, 2618, 403, 404, 1, 657, 24, 2619, 87, 88]

In [8]:
# Keras' Tokenizer numbers words starting from 1, so one extra slot is added
# to cover index 0 when sizing the embedding and output layers.
distinct_words = len(tokenizer.word_index)
vocab_size = distinct_words + 1
print(vocab_size)

8595


In [9]:
# Pair every token id with the id that immediately follows it:
# [[t0, t1], [t1, t2], ...] -- one two-element list per adjacent pair.
sequences = [
    [current_id, next_id]
    for current_id, next_id in zip(sequence_data, sequence_data[1:])
]

In [10]:
# Split each [current, next] pair into the model input (first element)
# and the prediction target (second element).
X = np.array([pair[0] for pair in sequences])
y = np.array([pair[1] for pair in sequences])

In [11]:
# Show the first five inputs next to the first five targets as a sanity check.
for caption, values in (("The Data is: ", X), ("The responses are: ", y)):
    print(caption, values[:5])

The Data is:  [  31 2618  403  404    1]
The responses are:  [2618  403  404    1  657]


In [None]:
# One-hot encode the integer targets so they match the softmax output layer.
# The cell overwrites `y` with its own encoding, so only encode while `y` is
# still the 1-D vector of word ids; re-running the cell is then harmless
# instead of one-hot encoding an already one-hot matrix.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)

### Creating the Model:

In [None]:
# Network layout, declared in one place:
#   token id -> 10-dim embedding -> two stacked 1000-unit LSTMs
#            -> 1000-unit ReLU layer -> softmax over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=1),  # each sample is a single token id
    LSTM(1000, return_sequences=True),          # pass the full sequence on to the next LSTM
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),    # one probability per vocabulary entry
])

In [None]:
# Print the layer-by-layer overview of the network built above.
model.summary()

### Callbacks:

In [21]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard

# Save the model to disk, keeping only the version with the best training loss.
checkpoint = ModelCheckpoint(
    "../model/nextword1.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

# When the loss stalls for 3 epochs, scale the learning rate by 0.2,
# never going below 0.0001.
reduce = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)

# Directory that receives the TensorBoard event logs.
logdir = 'logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

### Compile The Model:

In [None]:
# Configure training: cross-entropy against the one-hot targets, Adam optimizer.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

### Fit The Model:

In [None]:
# Train for 150 epochs in batches of 1000 samples, with checkpointing,
# learning-rate reduction and TensorBoard logging attached.
training_callbacks = [checkpoint, reduce, tensorboard_Visualization]
model.fit(
    X,
    y,
    epochs=150,
    batch_size=1000,
    callbacks=training_callbacks,
)

In [25]:
# Persist the trained network in two parts: architecture (JSON) and weights (HDF5).
from tensorflow.keras.models import Sequential, model_from_json

# Part 1: the layer configuration, serialized as a JSON string.
model_json = model.to_json()
with open("../model/nextword.json", "w") as architecture_file:
    architecture_file.write(model_json)

# Part 2: the learned parameters, stored separately in HDF5 format.
model.save_weights("../model/nextword.h5")

print("Saved model to disk")

Saved model to disk
Loaded model from disk


### Code References:
https://www.kaggle.com/code/ysthehurricane/next-word-prediction-bi-lstm-tutorial-easy-way <br>
https://www.analyticsvidhya.com/blog/2021/08/predict-the-next-word-of-your-text-using-long-short-term-memory-lstm/ <br>
https://github.com/Bharath-K3/Next-Word-Prediction-with-NLP-and-Deep-Learning <br>