## importing required libraries

In [12]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer #used for text tokenization.
from tensorflow.keras.layers import Embedding,LSTM,Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical #used to converting class vectors
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import pandas as pd
import os

In [13]:
file= pd.read_csv("text.txt",sep='\t')  # this use when we read tab seperated file

In [14]:
file.head()

Unnamed: 0,The Project Gutenberg eBook of Pride and Prejudice
0,This ebook is for the use of anyone anywhere i...
1,most other parts of the world at no cost and w...
2,"whatsoever. You may copy it, give it away or r..."
3,of the Project Gutenberg License included with...
4,"at If you are not located in the United States,"


## load and pre-procces the data

In [15]:
file= open('text.txt','r',encoding ='utf8')
#store file in list
lines=[]
for i in file:
    lines.append(i)
#lines
# now we convert it into string
data=""
for i in lines:
    data= " ".join(lines)
#data

In [16]:
# now we replace unnecessary stuff with space
data =data.replace('\n','').replace('\r','').replace('\ufeff','').replace('“','').replace('”','')
data
#now again we split our data then to string so we can remove unnecessary space
data = data.split()
data="  ".join(data)
len(data)

864077

## Apply Tokenization


##### fit_on_texts :  Updates internal vocabulary based on a list of texts. This method creates the vocabulary index based on word frequency. So if you give it something like, "The cat sat on the mat." It will create a dictionary s.t. word_index["the"] = 1; word_index["cat"] = 2 it is word -> index dictionary so every word gets a unique integer value. 0 is reserved for padding. So lower integer means more frequent word (often the first few are stop words because they appear a lot).

##### 
texts_to_sequences:  Transforms each text in texts to a sequence of integers. So it basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary. Nothing more, nothing less, certainly no magic involved.

In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])  #update the internal vocabulary of the tokenizer
#saveing the tokenizer for predict function
pickle.dump(tokenizer,open('token.pkl','wb'))
# to get the first sequence (assuming there's only one sequence in the list).
sequence_data = tokenizer.texts_to_sequences([data])[0]
len(sequence_data)
# hence we notic the that len is decreases  because in ext file so manay words are 
# repeated and each unique word get one numeric representaion
#print(sequence_data)

131175

In [18]:
vocab_size =len(tokenizer.word_index) + 1
vocab_size

7254

In [19]:
sequences =[]
# since we are training for the pridiction of next_words after sequence of 3 words
for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
sequences = np.array(sequences)
# lets print first 10 sequences
sequences[:10]
# as we can below in 2d array the first 3 element will be input and 4 will be the output

array([[   1,  181,  403, 1000],
       [ 181,  403, 1000,    3],
       [ 403, 1000,    3,  298],
       [1000,    3,  298,    4],
       [   3,  298,    4,  946],
       [ 298,    4,  946,   41],
       [   4,  946,   41, 1000],
       [ 946,   41, 1000,   23],
       [  41, 1000,   23,   21],
       [1000,   23,   21,    1]])

In [20]:
# now we seperate input and output
x=[]
y=[]

for i in sequences:
    x.append(i[0:3])
    y.append(i[3])
x=np.array(x)
y=np.array(y)

In [21]:
len(y)

131172

In [22]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [23]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test =train_test_split(x,y,test_size=0.2, random_state=42)

## Createing the model

In [24]:
#input_length =3
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             72540     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 7254)              7261254   
                                                                 
Total params: 20382794 (77.75 MB)
Trainable params: 20382794 (77.75 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Training  the model

In [15]:
# we will use ModelCheckpoint which save our model and wait at some intervals 

from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("next_words.h5",monitor="loss",verbose=1,save_best_only =True)
model.compile(
    loss="categorical_crossentropy",
    optimizer="Adam",
    metrics=['accuracy'])
model.fit(x_train,y_train,batch_size=64,callbacks=[checkpoint],epochs=50)

Epoch 1/50


Epoch 1: loss improved from inf to 6.31369, saving model to next_words.h5


  saving_api.save_model(


Epoch 2/50
Epoch 2: loss improved from 6.31369 to 5.68695, saving model to next_words.h5
Epoch 3/50
Epoch 3: loss improved from 5.68695 to 5.35237, saving model to next_words.h5
Epoch 4/50
Epoch 4: loss improved from 5.35237 to 5.12519, saving model to next_words.h5
Epoch 5/50
Epoch 5: loss improved from 5.12519 to 4.91728, saving model to next_words.h5
Epoch 6/50
Epoch 6: loss improved from 4.91728 to 4.70690, saving model to next_words.h5
Epoch 7/50
Epoch 7: loss improved from 4.70690 to 4.48965, saving model to next_words.h5
Epoch 8/50
Epoch 8: loss improved from 4.48965 to 4.26960, saving model to next_words.h5
Epoch 9/50
Epoch 9: loss improved from 4.26960 to 4.03836, saving model to next_words.h5
Epoch 10/50
Epoch 10: loss improved from 4.03836 to 3.80225, saving model to next_words.h5
Epoch 11/50
Epoch 11: loss improved from 3.80225 to 3.56370, saving model to next_words.h5
Epoch 12/50
Epoch 12: loss improved from 3.56370 to 3.32702, saving model to next_words.h5
Epoch 13/50
Epo

<keras.src.callbacks.History at 0x1d671599710>

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle
 
# Load the model and tokenizer
model = load_model('next_words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))
 
def Predict_Next_Words(model, tokenizer, text):
 
  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)
  preds = np.argmax(model.predict(sequence))
  predicted_word = ""
   
  for key, value in tokenizer.word_index.items():
      if value == preds:
          predicted_word = key
          break
   
  print(predicted_word)
  return predicted_word
while(True):
  text = input("Enter your line: ")
   
  if text == "0":
      print("Execution completed.....")
      break
   
  else:
      try:
          text = text.split(" ")
          text = text[-3:]
          print(text)
         
          Predict_Next_Words(model, tokenizer, text)
           
      except Exception as e:
        print("Error occurred: ",e)
        continue

Enter your line:  The Project Gutenberg eBook of


['Gutenberg', 'eBook', 'of']
pride


Enter your line:  i am your friend 


['your', 'friend', '']
to


Enter your line:  how can you abuse your own


['abuse', 'your', 'own']
but


Enter your line:  He could not help seeing that you were about five times as


['five', 'times', 'as']
she
