In [25]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout

In [5]:
df = pd.read_table(
    "../data/data.txt",
    header=None,
    names=["text"],
    encoding="utf-8"
)

df

Unnamed: 0,text
0,THE ADVENTURES OF SHER...
1,Arthur Conan Doyle
2,Table of contents
3,A Scandal in Bohemia
4,The Red-Headed League
...,...
9561,warranties of merchantablity or fitness f...
9562,This text was formatted from various free...
9563,See http://sherlock-holm.es for an electr...
9564,additional information about it.


In [6]:
df['text'] = df['text'].str.strip()

In [7]:
texts = df['text'].drop_duplicates().astype(str).tolist()


In [8]:
document = '\n'.join(texts).lower()


In [9]:
sentences = document.split('\n')

In [11]:
# Tokenizer with oov_token

tokenizer = Tokenizer(oov_token='<unkn>')

In [12]:
tokenizer.fit_on_texts(sentences)


In [13]:
tokenizer.word_index


{'<unkn>': 1,
 'the': 2,
 'and': 3,
 'i': 4,
 'to': 5,
 'of': 6,
 'a': 7,
 'in': 8,
 'that': 9,
 'it': 10,
 'he': 11,
 'you': 12,
 'was': 13,
 'his': 14,
 'is': 15,
 'my': 16,
 'have': 17,
 'as': 18,
 'with': 19,
 'had': 20,
 'which': 21,
 'at': 22,
 'for': 23,
 'but': 24,
 'me': 25,
 'not': 26,
 'be': 27,
 'we': 28,
 'from': 29,
 'there': 30,
 'this': 31,
 'said': 32,
 'upon': 33,
 'so': 34,
 'holmes': 35,
 'him': 36,
 'her': 37,
 'she': 38,
 "'": 39,
 'very': 40,
 'your': 41,
 'been': 42,
 'all': 43,
 'on': 44,
 'what': 45,
 'no': 46,
 'one': 47,
 'then': 48,
 'were': 49,
 'by': 50,
 'are': 51,
 'an': 52,
 'would': 53,
 'out': 54,
 'when': 55,
 'up': 56,
 'man': 57,
 'could': 58,
 'has': 59,
 'do': 60,
 'into': 61,
 'mr': 62,
 'who': 63,
 'little': 64,
 'will': 65,
 'if': 66,
 'some': 67,
 'now': 68,
 'see': 69,
 'down': 70,
 'should': 71,
 'our': 72,
 'or': 73,
 'they': 74,
 'may': 75,
 'well': 76,
 'am': 77,
 'us': 78,
 'over': 79,
 'more': 80,
 'think': 81,
 'room': 82,
 'know': 8

In [15]:
# vocab size

vocab_size = len(tokenizer.word_index) + 1
print('Vocab Size:' , vocab_size)

Vocab Size: 8201


In [16]:
sequences = tokenizer.texts_to_sequences(sentences)
# Build N-gram sequences

training_sequences = []

for seq in sequences:
  # print(seq)
  for i in range(1, len(seq)):
    training_sequences.append(seq[:i+1])


In [17]:
# Max length

max_len = max([len(seq) for seq in sequences])

print('Max_Length:', max_len)

Max_Length: 18


In [18]:
# Pad Sequences

padded_sequences = pad_sequences(training_sequences, maxlen= max_len, padding='pre')

padded_sequences


array([[   0,    0,    0, ...,    0,    2, 1562],
       [   0,    0,    0, ...,    2, 1562,    6],
       [   0,    0,    0, ..., 1562,    6,  130],
       ...,
       [   0,    0,    0, ...,    2, 8199, 8200],
       [   0,    0,    0, ..., 8199, 8200, 3187],
       [   0,    0,    0, ..., 8200, 3187, 3186]], dtype=int32)

In [19]:
X = padded_sequences[:, :-1]

y = padded_sequences[:, -1]

In [20]:
# Build LSTM Model

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim= 100),
    LSTM(150),
    Dense(vocab_size, activation='softmax')
])

model.build(input_shape=(None, max_len))

model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

model.summary()

2026-01-27 04:54:09.849393: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2026-01-27 04:54:09.849518: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2026-01-27 04:54:09.849532: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2026-01-27 04:54:09.849566: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2026-01-27 04:54:09.849582: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         820100    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 8201)              1238351   
                                                                 
Total params: 2209051 (8.43 MB)
Trainable params: 2209051 (8.43 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
history = model.fit(X, y, epochs=30, batch_size=32)


Epoch 1/30


2026-01-27 04:54:21.485796: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [22]:
def predict_next_word(model, tokenizer, text, max_len):

    # Convert text → token sequence
    token_seq = tokenizer.texts_to_sequences([text])

    # Pad sequence (same as training)
    padded_seq = pad_sequences(
        token_seq,
        maxlen=max_len - 1,
        padding='pre'
    )

    # Predict probabilities
    pred = model.predict(padded_seq, verbose=0)

    # Get index of highest probability
    index = np.argmax(pred)

    # Convert index → word
    next_word = "<unkn>"
    for word, idx in tokenizer.word_index.items():
        if idx == index:
            next_word = word
            break

    # Append predicted word
    text = text + " " + next_word

    return text


In [23]:
def generate_next_word(model, tokenizer, seed_text, max_len, max_words=10):

    text = seed_text

    for _ in range(max_words):
        # Convert text → tokens
        token_seq = tokenizer.texts_to_sequences([text])

        # Pad same as training
        padded_seq = pad_sequences(
            token_seq,
            maxlen=max_len - 1,
            padding="pre"
        )

        # Predict
        pred = model.predict(padded_seq, verbose=0)
        index = np.argmax(pred)

        # Convert index → word
        next_word = "<unkn>"
        for word, idx in tokenizer.word_index.items():
            if idx == index:
                next_word = word
                break

        text = text + " " + next_word

    return text


In [31]:
generate_next_word(
    model,
    tokenizer,
    "WHy are you replying me so dump",
    max_len,
    max_words=10
)

'WHy are you replying me so dump as possible and here you see but you see it'