In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [2]:
# Read the whole corpus. The context manager guarantees the file handle is
# closed even if reading fails.
with open("mytext.txt", "r", encoding="utf8") as file:
    lines = file.readlines()

# Join the lines into one string (one join is enough; it does not need to be
# repeated per line).
data = ' '.join(lines)

# Remove line breaks, the byte-order mark and curly quotes. They are deleted
# outright (replaced by ''), which is safe because the ' ' inserted by the
# join above already keeps words on adjacent lines apart.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“', '').replace('”', '')

# Collapse any run of whitespace into a single space
data = ' '.join(data.split())
data[:500]

'Chapter 1 It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters. My dear Mr. Bennet, said his lady to him one day, have you heard that Netherfield Park is let at last? Mr. Benn'

In [3]:
# Number of characters in the cleaned corpus string
len(data)

4398

In [4]:
# Build the word -> integer index mapping from the cleaned corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Persist the fitted tokenizer so the prediction code can reload the exact
# same mapping; the `with` block flushes and closes the file deterministically.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the corpus as one flat list of word indices
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[122, 123, 9, 10, 4, 76, 124, 125, 8, 4, 43, 44, 12, 77, 2]

In [5]:
# Number of word indices in the encoded corpus
len(sequence_data)

854

In [6]:
# Keras Tokenizer word indices start at 1 (index 0 is reserved), so one extra
# slot is added; this value sizes both the Embedding input and the softmax output.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

324


In [7]:
# Slide a 4-token window over the encoded corpus: each window holds
# 3 context words followed by the word to predict.
sequences = [
    sequence_data[end - 3:end + 1]
    for end in range(3, len(sequence_data))
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]


The Length of sequences are:  851


array([[122, 123,   9,  10],
       [123,   9,  10,   4],
       [  9,  10,   4,  76],
       [ 10,   4,  76, 124],
       [  4,  76, 124, 125],
       [ 76, 124, 125,   8],
       [124, 125,   8,   4],
       [125,   8,   4,  43],
       [  8,   4,  43,  44],
       [  4,  43,  44,  12]])

In [8]:
# Split every 4-token window into its 3-token context (X) and the
# following token (y).
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [9]:
# Sanity check: show the first 10 context windows and their next-word targets
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[122 123   9]
 [123   9  10]
 [  9  10   4]
 [ 10   4  76]
 [  4  76 124]
 [ 76 124 125]
 [124 125   8]
 [125   8   4]
 [  8   4  43]
 [  4  43  44]]
Response:  [ 10   4  76 124 125   8   4  43  44  12]


In [10]:
# One-hot encode the target word indices (one column per vocabulary entry) to
# match the softmax output trained with categorical cross-entropy.
# NOTE(review): this overwrites `y` in place, so the cell is not idempotent -
# re-running it without first re-running the cell that builds `y` would encode
# the already one-hot array again.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [11]:
# Next-word classifier: 3 word indices -> 10-dimensional embeddings ->
# two stacked LSTMs -> dense ReLU layer -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000, return_sequences=True),  # emit every timestep for the next LSTM
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [12]:
# Print each layer's output shape and parameter count
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3, 10)             3240      
_________________________________________________________________
lstm (LSTM)                  (None, 3, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 324)               324324    
Total params: 13,376,564
Trainable params: 13,376,564
Non-trainable params: 0
_________________________________________________________________


In [13]:
from tensorflow import keras
# Import plot_model from tf.keras, the same package the model was built with.
# The previous import came from the standalone `keras` package via an internal
# module path (`keras.utils.vis_utils`) and the imported name was never used.
from tensorflow.keras.utils import plot_model

# Render the architecture diagram to plot.png (requires pydot and graphviz)
plot_model(model, to_file='plot.png', show_layer_names=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [14]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the model to next_words.h5 only when the training loss improves.
checkpoint = ModelCheckpoint(
    "next_words.h5",
    monitor='loss',
    save_best_only=True,
    verbose=1,
)

# learning_rate=0.01; 0.001 is noted as an alternative (presumably tried earlier).
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss="categorical_crossentropy",
)

# NOTE(review): the recorded output reports "Epoch x/70", so it was produced
# with a different `epochs` value than the 100 set here.
model.fit(
    X,
    y,
    batch_size=64,
    epochs=100,
    callbacks=[checkpoint],
)

Epoch 1/70

Epoch 00001: loss improved from inf to 6.32772, saving model to next_words.h5
Epoch 2/70

Epoch 00002: loss did not improve from 6.32772
Epoch 3/70

Epoch 00003: loss improved from 6.32772 to 5.36210, saving model to next_words.h5
Epoch 4/70

Epoch 00004: loss improved from 5.36210 to 5.25866, saving model to next_words.h5
Epoch 5/70

Epoch 00005: loss improved from 5.25866 to 5.18577, saving model to next_words.h5
Epoch 6/70

Epoch 00006: loss improved from 5.18577 to 5.08615, saving model to next_words.h5
Epoch 7/70

Epoch 00007: loss improved from 5.08615 to 5.06809, saving model to next_words.h5
Epoch 8/70

Epoch 00008: loss improved from 5.06809 to 4.97977, saving model to next_words.h5
Epoch 9/70

Epoch 00009: loss improved from 4.97977 to 4.82975, saving model to next_words.h5
Epoch 10/70

Epoch 00010: loss improved from 4.82975 to 4.69551, saving model to next_words.h5
Epoch 11/70

Epoch 00011: loss improved from 4.69551 to 4.54486, saving model to next_words.h5
Epo

Epoch 52/70

Epoch 00052: loss improved from 0.51789 to 0.48941, saving model to next_words.h5
Epoch 53/70

Epoch 00053: loss improved from 0.48941 to 0.40078, saving model to next_words.h5
Epoch 54/70

Epoch 00054: loss improved from 0.40078 to 0.35228, saving model to next_words.h5
Epoch 55/70

Epoch 00055: loss improved from 0.35228 to 0.33181, saving model to next_words.h5
Epoch 56/70

Epoch 00056: loss improved from 0.33181 to 0.29569, saving model to next_words.h5
Epoch 57/70

Epoch 00057: loss did not improve from 0.29569
Epoch 58/70

Epoch 00058: loss did not improve from 0.29569
Epoch 59/70

Epoch 00059: loss did not improve from 0.29569
Epoch 60/70

Epoch 00060: loss did not improve from 0.29569
Epoch 61/70

Epoch 00061: loss did not improve from 0.29569
Epoch 62/70

Epoch 00062: loss did not improve from 0.29569
Epoch 63/70

Epoch 00063: loss did not improve from 0.29569
Epoch 64/70

Epoch 00064: loss did not improve from 0.29569
Epoch 65/70

Epoch 00065: loss did not improv

<keras.callbacks.History at 0x2871a32e7c0>

In [15]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the best checkpoint written during training
model = load_model('next_words.h5')

# Reload the fitted tokenizer; the `with` block closes the file handle.
# SECURITY: pickle.load can execute arbitrary code - only load a token.pkl
# that this notebook produced itself, never one from an untrusted source.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text, seq_len=3):
  """Predict the word that follows `text`, print it and return it.

  Args:
    model: Trained model exposing `predict(array)`; it is called with a
      batch of one sequence of word indices.
    tokenizer: Fitted tokenizer exposing `texts_to_sequences` and
      `word_index`.
    text: The context, either a string or a list of words.
    seq_len: Maximum number of trailing word indices fed to the model
      (defaults to 3, the context length used for training).

  Returns:
    The predicted word, or "" when none of the input words are in the
    tokenizer vocabulary or the predicted index has no word.
  """
  # The Keras Tokenizer treats a list as already-tokenized words and skips its
  # punctuation filtering, so a word like 'sure!' would be dropped as unknown.
  # Joining into a string lets the tokenizer clean the input the same way it
  # cleaned the training text.
  if not isinstance(text, str):
    text = " ".join(text)

  token_ids = tokenizer.texts_to_sequences([text])[0][-seq_len:]
  if not token_ids:
    # Nothing to feed the model: every word was out of vocabulary.
    print("No known words in the input; cannot predict the next word.")
    return ""

  sequence = np.array([token_ids])
  preds = np.argmax(model.predict(sequence))

  # Reverse lookup: find the word whose index equals the predicted class.
  predicted_word = next(
      (word for word, index in tokenizer.word_index.items() if index == preds),
      "",
  )

  print(predicted_word)
  return predicted_word

In [16]:
# Interactive prompt: keep predicting until the user enters "0".
while True:
    text = input("Enter your line: ")

    if text == "0":
        print("Execution completed.....")
        break

    # split() without arguments ignores repeated, leading and trailing
    # whitespace, so no empty-string "words" reach the model.
    words = text.split()
    if not words:
        print("Please enter at least one word.")
        continue

    try:
        # The model was trained on 3-word contexts, so keep the last 3 words.
        text = words[-3:]
        print(text)

        Predict_Next_Words(model, tokenizer, text)

    except Exception as e:
        # Report the problem and keep prompting instead of ending the session.
        print("Error occurred: ", e)

Enter your line: However little known the feelings or views
['feelings', 'or', 'views']
of
Enter your line: feelings or views of such a man may be
['man', 'may', 'be']
on
Enter your line: Do not you want to know who has
['know', 'who', 'has']
taken
Enter your line: and I have no objection to
['no', 'objection', 'to']
hearing
Enter your line: I have no objection to hearing
['objection', 'to', 'hearing']
it
Enter your line: my dear, to be sure! A single
['sure!', 'A', 'single']
chooses
Enter your line: You must know that I am thinking of
['am', 'thinking', 'of']
his
Enter your line: 0
Execution completed.....
