In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

import pickle 
import numpy as np
import os

from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [2]:
from google.colab import files
uploaded = files.upload()

Saving dracula1.txt to dracula1.txt


# **Load and Preprocess**

In [3]:
file = open("dracula1.txt", 'r', encoding ='utf8')

# store file in list
lines =[]
for i in file:
  lines.append(i)

# convert list to string
data = " "
for i in lines:
  data = ' '.join(lines)

# replace unnecessary stuff with space
data  = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('"', '').replace('"', '')

# remove unnecessary sapce
data = data.split()
data = ' '.join(data)
data[:500]


'Excuse me, I said, but the answer is here. I laid my hand on the type-written matter. When our sane and learned lunatic made that very Left Munich at 8:35 P. M., on 1st May, arriving at Vienna early next morning; should have arrived at 6:46, but train was an hour late. Buda-Pesth seems a wonderful place, from the glimpse which I got of it from the train and the little I could walk through the streets. I feared to go very far from the station, as we had arrived late and would start as near the co'

In [None]:
len(data)

12068

# **Tokenization**

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

#saving tokenizer for predict function
pickle.dump(tokenizer, open('token.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[262, 38, 2, 35, 19, 1, 263, 26, 48, 2, 264, 10, 162, 15, 1]

In [5]:
len(sequence_data)

2316

In [14]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

767


In [10]:
sequences = []

for i in range(3, len(sequence_data)):
  words = sequence_data[i-3: i+1]
  sequences.append(words)

print("length of sequences are ", len(sequences))
sequences = np.array(sequences)
sequences[:10] 


length of sequences are  2313


array([[262,  38,   2,  35],
       [ 38,   2,  35,  19],
       [  2,  35,  19,   1],
       [ 35,  19,   1, 263],
       [ 19,   1, 263,  26],
       [  1, 263,  26,  48],
       [263,  26,  48,   2],
       [ 26,  48,   2, 264],
       [ 48,   2, 264,  10],
       [  2, 264,  10, 162]])

In [11]:
X = []
y = []

for i in sequences:
  X.append(i[0:3])
  y.append(i[3])

X = np.array(X)
y = np.array(y)  



In [12]:
print("Data ", X[:10])
print( "Response ", y[:10])

Data  [[262  38   2]
 [ 38   2  35]
 [  2  35  19]
 [ 35  19   1]
 [ 19   1 263]
 [  1 263  26]
 [263  26  48]
 [ 26  48   2]
 [ 48   2 264]
 [  2 264  10]]
Response  [ 35  19   1 263  26  48   2 264  10 162]


In [15]:
y = to_categorical(y, num_classes = vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [17]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length = 3))
model.add(LSTM(500, return_sequences = True))
model.add(LSTM(500))
model.add(Dense(500, activation = "relu"))
model.add(Dense(vocab_size, activation = "softmax"))

In [18]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 3, 10)             7670      
                                                                 
 lstm_2 (LSTM)               (None, 3, 500)            1022000   
                                                                 
 lstm_3 (LSTM)               (None, 500)               2002000   
                                                                 
 dense_2 (Dense)             (None, 500)               250500    
                                                                 
 dense_3 (Dense)             (None, 767)               384267    
                                                                 
Total params: 3,666,437
Trainable params: 3,666,437
Non-trainable params: 0
_________________________________________________________________


In [19]:
from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint("words.h5", monitor = 'loss', verbose =1, save_best_only = True)

model.compile(loss = 'categorical_crossentropy', optimizer = Adam(learning_rate = 0.001))

model.fit(X, y, epochs = 70, batch_size = 64, callbacks =[checkpoint])

Epoch 1/70
Epoch 1: loss improved from inf to 6.25778, saving model to words.h5
Epoch 2/70
Epoch 2: loss improved from 6.25778 to 5.79588, saving model to words.h5
Epoch 3/70
Epoch 3: loss improved from 5.79588 to 5.74576, saving model to words.h5
Epoch 4/70
Epoch 4: loss improved from 5.74576 to 5.70076, saving model to words.h5
Epoch 5/70
Epoch 5: loss improved from 5.70076 to 5.64297, saving model to words.h5
Epoch 6/70
Epoch 6: loss improved from 5.64297 to 5.61449, saving model to words.h5
Epoch 7/70
Epoch 7: loss improved from 5.61449 to 5.56427, saving model to words.h5
Epoch 8/70
Epoch 8: loss improved from 5.56427 to 5.51359, saving model to words.h5
Epoch 9/70
Epoch 9: loss improved from 5.51359 to 5.45645, saving model to words.h5
Epoch 10/70
Epoch 10: loss improved from 5.45645 to 5.40664, saving model to words.h5
Epoch 11/70
Epoch 11: loss improved from 5.40664 to 5.35074, saving model to words.h5
Epoch 12/70
Epoch 12: loss improved from 5.35074 to 5.29497, saving model to

<keras.callbacks.History at 0x7f2a3a5f7210>

In [20]:
from tensorflow.keras.models import load_model
import numpy as np 
import pickle

# load model and tokenize
model = load_model('words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))

def Predict_next_words(model, tokenizer, text):

  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)
  preds = np.argmax(model.predict(sequence))
  predicted_word = " "

  for key, value in tokenizer.word_index.items():
    if value == preds:
      predicted_word = key
      break

  print(predicted_word)
  return predicted_word  

In [21]:
while(True):
  text = input("Enter your line: ")

  if text == '0':
    print("Execution completed ...")
    break

  else:
    try:
      text = text.split(" ")
      text = text[-3:]
      print(text) 

      Predict_next_words(model, tokenizer, text)

    except Exception as e:
      print("Error_occured: " , e)

      continue

Enter your line: Van Helsing smiled in 
['smiled', 'in', '']
Error_occured:  in user code:

    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1751, in predict_step
        return self(x, training=False)
    File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/input_spec.py", line 264, in assert_input_compatibility
     