In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import pickle

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.utils.vis_utils import plot_model

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from google.colab import files
uploaded = files.upload()

Saving sherlockholmes.txt to sherlockholmes.txt


**1. Building the Dataset from .txt**

In [3]:
file = open("sherlockholmes.txt", "r", encoding = "utf8")
old_lines = []
for i in file:
    old_lines.append(i)

lines = []
z = len(old_lines)
for i in range(0,len(old_lines)):
  if i>57 and i<z-373:
    lines.append(old_lines[i])

data = ""
for i in lines:
  data = ' '. join(lines) 

data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','').replace('I.','').replace('_','')

data = data.split()
data = ' '.join(data)
print("Total Characters in Dataset = " + str(len(data)))
print(data[0:500])

Total Characters in Dataset = 553406
A SCANDAL IN BOHEMIA To Sherlock Holmes she is always the woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex. It was not that he felt any emotion akin to love for Irene Adler. All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a


**2. Vectorization Method - Bag of Words**

In [15]:
vectorizer = TfidfVectorizer()
vectorizer = vectorizer.fit([data])
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))
sequence_data = vectorizer.transform([data]).toarray()[0]
vocab = vectorizer.vocabulary_
vocab_size = len(vectorizer.vocabulary_.keys())

print("No. of Words in our Vocabulary = " + str(vocab_size))

sequences = []

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
    
print("No. of 3-word sequences are =", len(sequences))
sequences = np.array(sequences)

No. of Words in our Vocabulary = 7863
No. of 3-word sequences are = 7860
[[0.00053091 0.00010618 0.00042472 0.00084945]
 [0.00010618 0.00042472 0.00084945 0.00021236]
 [0.00042472 0.00084945 0.00021236 0.00010618]
 ...
 [0.00074327 0.00010618 0.00021236 0.00010618]
 [0.00010618 0.00021236 0.00010618 0.00010618]
 [0.00021236 0.00010618 0.00010618 0.00010618]]


In [None]:
#building training sets
X = []
y = []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])
    
X = np.array(X)
y = np.array(y)

y = to_categorical(y, num_classes=vocab_size)

**3. Building and Training Model**

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=3))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(vocab_size, activation='softmax'))
adam = Adam(lr=0.001)
model.summary()

In [None]:
plot_model(model, to_file='plot.png', show_layer_names=True)

In [None]:
checkpoint = ModelCheckpoint("final_model0.h5", monitor='loss', verbose=1, save_best_only=True)
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
history = model.fit(X, y, epochs=20, batch_size=64, callbacks=[checkpoint])

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0.1, 1])
plt.legend(loc='lower right')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.ylim([0.1, 1])
plt.legend(loc='lower right')
plt.show()

**4. Predicting the word.**

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer
model = load_model('drive/MyDrive/Lab/SEM7/ROSPLab/next_words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))

def Predict_Next_Words(model, tokenizer, text):

  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)
  preds = np.argmax(model.predict(sequence))
  predicted_word = ""
  
  for key, value in tokenizer.word_index.items():
      if value == preds:
          predicted_word = key
          break
  
  print(predicted_word)
  return predicted_word

text = input("Enter your line: ")
  
if text == "0":
    print("Execution completed.....")
else:
    text = text.split(" ")
    text = text[-3:]
    print(text)
        
Predict_Next_Words(model, tokenizer, text)