# Next Word Prediction:

### Importing The Required Libraries:

In [1]:
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [2]:

sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))


In [3]:
file = open("corpus-zare.txt", "r", encoding = "utf8")
lines = []

for i in file:
    lines.append(i)
    
print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])

The First Line:  فصل اول: مقدمه

The Last Line:  



### Cleaning the data:

In [4]:
data = ""

for i in lines:
    data = ' '. join(lines)
    
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:360]

'فصل اول: مقدمه پیشرفت هوش مصنوعی در زمینه\u200cهای مختلف در سال\u200cهای اخیر باعث شده تا آحاد مردم نگاه ویژه\u200cای به آن داشته باشند و مطالبه این موضوع که مسایل مختلفی را با استفاده از الگوریتم\u200cهای هوش مصنوعی  بتوانند حل کنند مورد دور از ذهنی به شمار نمی\u200cآید. بسیاری از این مسایل مواردی هستند که در گذشته ما وجود داشته\u200cاند و در زمان خود به دلیل کمبود امکانات، فقدان تکنولو'

In [5]:
import string

translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
new_data = data.translate(translator)

new_data[:500]

'فصل اول  مقدمه پیشرفت هوش مصنوعی در زمینه\u200cهای مختلف در سال\u200cهای اخیر باعث شده تا آحاد مردم نگاه ویژه\u200cای به آن داشته باشند و مطالبه این موضوع که مسایل مختلفی را با استفاده از الگوریتم\u200cهای هوش مصنوعی  بتوانند حل کنند مورد دور از ذهنی به شمار نمی\u200cآید  بسیاری از این مسایل مواردی هستند که در گذشته ما وجود داشته\u200cاند و در زمان خود به دلیل کمبود امکانات، فقدان تکنولوژی\u200cهای لازم یا نبودن شرایط، این مسایل حل نشده\u200cاند که به صورت یک موضوع قابل بحث در دنیا مطرح می\u200cباشند  یکی از این مسایل که در حوزه هنر و تکنو'

In [6]:
z = []

for i in data.split():
    if i not in z:
        z.append(i)
        
data = ' '.join(z)
data[:500]

'فصل اول: مقدمه پیشرفت هوش مصنوعی در زمینه\u200cهای مختلف سال\u200cهای اخیر باعث شده تا آحاد مردم نگاه ویژه\u200cای به آن داشته باشند و مطالبه این موضوع که مسایل مختلفی را با استفاده از الگوریتم\u200cهای بتوانند حل کنند مورد دور ذهنی شمار نمی\u200cآید. بسیاری مواردی هستند گذشته ما وجود داشته\u200cاند زمان خود دلیل کمبود امکانات، فقدان تکنولوژی\u200cهای لازم یا نبودن شرایط، نشده\u200cاند صورت یک قابل بحث دنیا مطرح می\u200cباشند. یکی حوزه هنر تکنولوژی دارد مربوط زمانی است اولین دوربین\u200cهای عکاسی اختراع کارگیری شدند، امکان ثبت تصاویر رنگی نداشت'

In [7]:
file1 = open("corpus-zare-cleaned.txt","w", encoding = "utf8")
file1.write(data)
file1.close() #to change file access modes

### Tokenization:

In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function.
pickle.dump(tokenizer, open('tokenizer1-zare.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[138, 45, 139, 140, 141, 142, 143, 144, 145, 146]

In [18]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

1874


In [19]:
sequences = []

for i in range(1, len(sequence_data)):
    words = sequence_data[i-1:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  2220


array([[138,  45],
       [ 45, 139],
       [139, 140],
       [140, 141],
       [141, 142],
       [142, 143],
       [143, 144],
       [144, 145],
       [145, 146],
       [146, 147]])

In [20]:
X = []
y = []

for i in sequences:
    X.append(i[0])
    y.append(i[1])
    
X = np.array(X)
y = np.array(y)

In [21]:
print("The Data is: ", X[:5])
print("The responses are: ", y[:5])

The Data is:  [138  45 139 140 141]
The responses are:  [ 45 139 140 141 142]


In [22]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Creating the Model:

In [23]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [24]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             18740     
_________________________________________________________________
lstm_2 (LSTM)                (None, 1, 1000)           4044000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_3 (Dense)              (None, 1874)              1875874   
Total params: 14,943,614
Trainable params: 14,943,614
Non-trainable params: 0
_________________________________________________________________


### Plot The Model:

In [25]:

from tensorflow import keras
from keras.utils.vis_utils import plot_model

keras.utils.plot_model(model, to_file='model.png', show_layer_names=True)

ImportError: cannot import name 'get_config' from 'tensorflow.python.eager.context' (C:\Users\Platinum\ana3\envs\arioo5\lib\site-packages\tensorflow\python\eager\context.py)

### Callbacks:

In [26]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

checkpoint = ModelCheckpoint("nextword-zare.h5", monitor='loss', verbose=1,
    save_best_only=True, mode='auto')

reduce = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose = 1)

logdir='logsnextword2'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

### Compile The Model:

In [27]:
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

### Fit The Model:

In [28]:
model.fit(X, y, epochs=150, batch_size=64, callbacks=[checkpoint, reduce, tensorboard_Visualization])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/150
Epoch 00001: loss improved from inf to 7.54099, saving model to nextword-zare.h5
Epoch 2/150
Epoch 00002: loss improved from 7.54099 to 7.53300, saving model to nextword-zare.h5
Epoch 3/150
Epoch 00003: loss improved from 7.53300 to 7.45608, saving model to nextword-zare.h5
Epoch 4/150
Epoch 00004: loss improved from 7.45608 to 7.29967, saving model to nextword-zare.h5
Epoch 5/150
Epoch 00005: loss improved from 7.29967 to 7.03756, saving model to nextword-zare.h5
Epoch 6/150
Epoch 00006: loss improved from 7.03756 to 6.85536, saving model to nextword-zare.h5
Epoch 7/150
Epoch 00007: loss improved from 6.85536 to 6.72362, saving model to nextword-zare.h5
Epoch 8/150
Epoch 00008: loss improved from 6.72362 to 6.64347, saving model to nextword-zare.h5
Epoch 9/150
Epoch 00009: loss improved from 6.64347 to 6.56849, saving model to nextword-zare.h5
Epoch 10/150
Epoch 00010: loss improv

Epoch 34/150
Epoch 00034: loss improved from 5.42356 to 5.41915, saving model to nextword-zare.h5
Epoch 35/150
Epoch 00035: loss improved from 5.41915 to 5.39171, saving model to nextword-zare.h5
Epoch 36/150
Epoch 00036: loss improved from 5.39171 to 5.34892, saving model to nextword-zare.h5
Epoch 37/150
Epoch 00037: loss improved from 5.34892 to 5.29960, saving model to nextword-zare.h5
Epoch 38/150
Epoch 00038: loss improved from 5.29960 to 5.29228, saving model to nextword-zare.h5
Epoch 39/150
Epoch 00039: loss improved from 5.29228 to 5.22899, saving model to nextword-zare.h5
Epoch 40/150
Epoch 00040: loss improved from 5.22899 to 5.16769, saving model to nextword-zare.h5
Epoch 41/150
Epoch 00041: loss improved from 5.16769 to 5.05678, saving model to nextword-zare.h5
Epoch 42/150
Epoch 00042: loss improved from 5.05678 to 4.91307, saving model to nextword-zare.h5
Epoch 43/150
Epoch 00043: loss improved from 4.91307 to 4.77030, saving model to nextword-zare.h5
Epoch 44/150
Epoch 0

Epoch 68/150
Epoch 00068: loss improved from 3.43050 to 3.38864, saving model to nextword-zare.h5
Epoch 69/150
Epoch 00069: loss improved from 3.38864 to 3.33205, saving model to nextword-zare.h5
Epoch 70/150
Epoch 00070: loss improved from 3.33205 to 3.31985, saving model to nextword-zare.h5
Epoch 71/150
Epoch 00071: loss improved from 3.31985 to 3.28571, saving model to nextword-zare.h5
Epoch 72/150
Epoch 00072: loss improved from 3.28571 to 3.26632, saving model to nextword-zare.h5
Epoch 73/150
Epoch 00073: loss did not improve from 3.26632
Epoch 74/150
Epoch 00074: loss did not improve from 3.26632
Epoch 75/150
Epoch 00075: loss improved from 3.26632 to 3.23522, saving model to nextword-zare.h5
Epoch 76/150
Epoch 00076: loss improved from 3.23522 to 3.13263, saving model to nextword-zare.h5
Epoch 77/150
Epoch 00077: loss improved from 3.13263 to 3.10061, saving model to nextword-zare.h5
Epoch 78/150
Epoch 00078: loss improved from 3.10061 to 3.09933, saving model to nextword-zare.h

Epoch 00103: loss improved from 2.31891 to 2.26465, saving model to nextword-zare.h5
Epoch 104/150
Epoch 00104: loss improved from 2.26465 to 2.21035, saving model to nextword-zare.h5
Epoch 105/150
Epoch 00105: loss improved from 2.21035 to 2.19355, saving model to nextword-zare.h5
Epoch 106/150
Epoch 00106: loss improved from 2.19355 to 2.15361, saving model to nextword-zare.h5
Epoch 107/150
Epoch 00107: loss improved from 2.15361 to 2.09294, saving model to nextword-zare.h5
Epoch 108/150
Epoch 00108: loss improved from 2.09294 to 2.07204, saving model to nextword-zare.h5
Epoch 109/150
Epoch 00109: loss improved from 2.07204 to 2.02086, saving model to nextword-zare.h5
Epoch 110/150
Epoch 00110: loss improved from 2.02086 to 2.01275, saving model to nextword-zare.h5
Epoch 111/150
Epoch 00111: loss improved from 2.01275 to 1.99065, saving model to nextword-zare.h5
Epoch 112/150
Epoch 00112: loss improved from 1.99065 to 1.88313, saving model to nextword-zare.h5
Epoch 113/150
Epoch 0011

Epoch 138/150
Epoch 00138: loss did not improve from 1.22490
Epoch 139/150
Epoch 00139: loss did not improve from 1.22490

Epoch 00139: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 140/150
Epoch 00140: loss improved from 1.22490 to 0.91718, saving model to nextword-zare.h5
Epoch 141/150
Epoch 00141: loss improved from 0.91718 to 0.72323, saving model to nextword-zare.h5
Epoch 142/150
Epoch 00142: loss improved from 0.72323 to 0.65469, saving model to nextword-zare.h5
Epoch 143/150
Epoch 00143: loss improved from 0.65469 to 0.62200, saving model to nextword-zare.h5
Epoch 144/150
Epoch 00144: loss improved from 0.62200 to 0.60670, saving model to nextword-zare.h5
Epoch 145/150
Epoch 00145: loss improved from 0.60670 to 0.58961, saving model to nextword-zare.h5
Epoch 146/150
Epoch 00146: loss improved from 0.58961 to 0.57825, saving model to nextword-zare.h5
Epoch 147/150
Epoch 00147: loss improved from 0.57825 to 0.56978, saving model to nextword-zare.h5
Epoc

<tensorflow.python.keras.callbacks.History at 0x20da447a248>

### Graph:

In [17]:
# https://stackoverflow.com/questions/26649716/how-to-show-pil-image-in-ipython-notebook
# tensorboard --logdir="./logsnextword1"
# http://DESKTOP-U3TSCVT:6006/

#from IPython.display import Image 
#pil_img = Image(filename='graph1.png')
#display(pil_img)

## Observation:
### We are able to develop a decent next word prediction model and are able to get a declining loss and an overall decent performance.

In [29]:
def Predict_Next_Words(model, tokenizer, text):
    """
        In this function we are using the tokenizer and models trained
        and we are creating the sequence of the text entered and then
        using our model to predict and return the the predicted word.
    
    """
    for i in range(3):
        print(text)
        sequence = tokenizer.texts_to_sequences([text])[0]
        sequence = np.array(sequence)
        
        preds = model.predict(sequence)
        preds=np.argmax(preds,axis=1)
        predicted_word = ""
        
        for key, value in tokenizer.word_index.items():
            if value == preds:
                predicted_word = key
                break
        
        print(predicted_word)
        return predicted_word

In [None]:
while(True):

    text = input("Enter your line: ")
    
    if text == "stop":
        print("Ending The Program.....")
        break
    
    else:
        try:
            text = text.split(" ")
            text = text[-1]

            text = ''.join(text)
            Predict_Next_Words(model, tokenizer, text)
            
        except:
            continue

Enter your line: رنگی
رنگی
انبار
Enter your line: تصاویر
تصاویر
رنگی
Enter your line: هوش
هوش
مصنوعی
Enter your line: سیاه
سیاه
سفید
