In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

In [29]:
# Use a PDF-specific library like PyPDF2 to read PDF files
import PyPDF2

# Open the PDF file in binary mode ('rb')
with open('sherlock_holmes.pdf', 'rb') as pdf_file:
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    
    # Initialize an empty string to store text
    text = ""
    
    # Extract text from each page
    for page_num in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()
        
    # Now 'text' contains the readable text from the PDF
    # You can print a sample to verify
    print(text[:10000])  # Print first 500 characters

                        THE ADVENTURES OF SHERLOCK HOLMES
                               Arthur Conan Doyle
                                Table of contents
               A Scandal in Bohemia
               The Red-Headed League
               A Case of Identity
               The Boscombe Valley Mystery
               The Five Orange Pips
               The Man with the Twisted Lip
               The Adventure of the Blue Carbuncle
               The Adventure of the Speckled Band
               The Adventure of the Engineer's Thumb
               The Adventure of the Noble Bachelor
               The Adventure of the Beryl Coronet
               The Adventure of the Copper Beeches
                              A SCANDAL IN BOHEMIA
                                Table of contents
                                     Chapter 1
                                     Chapter 2
                                     Chapter 3
          CHAPTER I
     To Sherlock Holmes she is always the wo

# Data Preprocessing

The tokenizer builds a vocabulary from the text and assigns integer indices to words. The total_words variable represents the vocabulary size including padding, which is required for defining the embedding layer.

In [None]:
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([text])
total_words = len(mytokenizer.word_index) + 1

In [4]:
mytokenizer.word_index

{'the': 1,
 'and': 2,
 'i': 3,
 'to': 4,
 'of': 5,
 'a': 6,
 'in': 7,
 'that': 8,
 'it': 9,
 'he': 10,
 'you': 11,
 'was': 12,
 'his': 13,
 'is': 14,
 'my': 15,
 'have': 16,
 'as': 17,
 'with': 18,
 'had': 19,
 'which': 20,
 'at': 21,
 'for': 22,
 'but': 23,
 'me': 24,
 'not': 25,
 'be': 26,
 'we': 27,
 'from': 28,
 'there': 29,
 'this': 30,
 'said': 31,
 'upon': 32,
 'so': 33,
 'holmes': 34,
 'him': 35,
 'her': 36,
 'she': 37,
 "'": 38,
 'very': 39,
 'your': 40,
 'been': 41,
 'all': 42,
 'on': 43,
 'no': 44,
 'what': 45,
 'one': 46,
 'then': 47,
 'were': 48,
 'by': 49,
 'are': 50,
 'an': 51,
 'would': 52,
 'out': 53,
 'when': 54,
 'up': 55,
 'man': 56,
 'could': 57,
 'has': 58,
 'do': 59,
 'into': 60,
 'mr': 61,
 'who': 62,
 'little': 63,
 'will': 64,
 'if': 65,
 'some': 66,
 'now': 67,
 'see': 68,
 'down': 69,
 'should': 70,
 'our': 71,
 'or': 72,
 'they': 73,
 'may': 74,
 'well': 75,
 'am': 76,
 'us': 77,
 'over': 78,
 'more': 79,
 'think': 80,
 'room': 81,
 'know': 82,
 'shall': 83

This code generates n-gram sequences from tokenized text to create training samples for next-word prediction using an LSTM model.

In [5]:
my_input_seq = []

for line in text.split('\n'):
    token_list = mytokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        my_n_gram_seq = token_list[:i+1]
        my_input_seq.append(my_n_gram_seq)

In [6]:
my_input_seq

[[1, 1561],
 [1, 1561, 5],
 [1, 1561, 5, 129],
 [1, 1561, 5, 129, 34],
 [647, 4501],
 [647, 4501, 4502],
 [226, 5],
 [226, 5, 1562],
 [6, 827],
 [6, 827, 7],
 [6, 827, 7, 871],
 [1, 234],
 [1, 234, 462],
 [1, 234, 462, 648],
 [6, 110],
 [6, 110, 5],
 [6, 110, 5, 2073],
 [1, 678],
 [1, 678, 1360],
 [1, 678, 1360, 499],
 [1, 294],
 [1, 294, 949],
 [1, 294, 949, 872],
 [1, 56],
 [1, 56, 18],
 [1, 56, 18, 1],
 [1, 56, 18, 1, 1014],
 [1, 56, 18, 1, 1014, 873],
 [1, 577],
 [1, 577, 5],
 [1, 577, 5, 1],
 [1, 577, 5, 1, 423],
 [1, 577, 5, 1, 423, 1361],
 [1, 577],
 [1, 577, 5],
 [1, 577, 5, 1],
 [1, 577, 5, 1, 1786],
 [1, 577, 5, 1, 1786, 763],
 [1, 577],
 [1, 577, 5],
 [1, 577, 5, 1],
 [1, 577, 5, 1, 3186],
 [1, 577, 5, 1, 3186, 649],
 [1, 577],
 [1, 577, 5],
 [1, 577, 5, 1],
 [1, 577, 5, 1, 764],
 [1, 577, 5, 1, 764, 1246],
 [1, 577],
 [1, 577, 5],
 [1, 577, 5, 1],
 [1, 577, 5, 1, 2074],
 [1, 577, 5, 1, 2074, 394],
 [1, 577],
 [1, 577, 5],
 [1, 577, 5, 1],
 [1, 577, 5, 1, 828],
 [1, 577, 5, 

In [7]:
max_seq_len = max([len(seq) for seq in my_input_seq])
input_seq = np.array(pad_sequences(my_input_seq,maxlen = max_seq_len,padding = 'pre'))

In [8]:
input_seq[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    1, 1561],
      dtype=int32)

In [9]:
x = input_seq[:,:-1]
y = input_seq[:,-1]

In [10]:
x[0].shape

(31,)

In [11]:
y[0]

np.int32(1561)

In [12]:
y = np.array(tf.keras.utils.to_categorical(y,num_classes = total_words))

In [13]:
y[1]

array([0., 0., 0., ..., 0., 0., 0.], shape=(8207,))

# Model Building

TF-IDF and Bag-of-Words ignore word order and produce sparse vectors, which are unsuitable for sequence modeling. Since next-word prediction requires sequential context, we use an embedding layer with LSTM to preserve word order and learn dense semantic representations.

In [17]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_seq_len - 1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
model.build(input_shape=(None, max_seq_len - 1))
model.summary()

In [18]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x,y,epochs = 50,verbose = 1)

Epoch 1/50
[1m3016/3016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 30ms/step - accuracy: 0.0763 - loss: 6.2385
Epoch 2/50
[1m3016/3016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 29ms/step - accuracy: 0.1242 - loss: 5.5126
Epoch 3/50
[1m3016/3016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 30ms/step - accuracy: 0.1481 - loss: 5.1282
Epoch 4/50
[1m3016/3016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 29ms/step - accuracy: 0.1662 - loss: 4.7986
Epoch 5/50
[1m3016/3016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 28ms/step - accuracy: 0.1824 - loss: 4.4941
Epoch 6/50
[1m3016/3016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 27ms/step - accuracy: 0.2017 - loss: 4.2100
Epoch 7/50
[1m3016/3016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 28ms/step - accuracy: 0.2257 - loss: 3.9420
Epoch 8/50
[1m3016/3016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 28ms/step - accuracy: 0.2546 - loss: 3.6888
Epoch 9/

<keras.src.callbacks.history.History at 0x1280d4800>

# Prediction

In [2]:
input_text = "Once upon a time when"
input_seq = mytokenizer.texts_to_sequences([input_text])
input_seq = pad_sequences(input_seq, maxlen=max_seq_len - 1)
predicted_word_index = model.predict(input_seq)
predicted_word = mytokenizer.index_word[np.argmax(predicted_word_index)]

NameError: name 'mytokenizer' is not defined

In [24]:
predicted_word

'she'

In [32]:
input_text = "Sherlock Holmes"
predict_next_words = 10

for _ in range(predict_next_words):
    token_list = mytokenizer.texts_to_sequences([input_text])
    token_list = pad_sequences(token_list, maxlen=max_seq_len - 1)
    predicted_word_index = model.predict(token_list)
    predicted_word = mytokenizer.index_word[np.argmax(predicted_word_index)]
    input_text += " " + predicted_word

print(input_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Sherlock Holmes had been leaning back in his chair with his eyes


In [33]:
model.save("next_word_model.keras")

In [34]:
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(mytokenizer, f)

In [35]:
import json

with open("config.json", "w") as f:
    json.dump({"max_seq_len": max_seq_len}, f)