In [1]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

[nltk_data] Downloading package gutenberg to /home/abhi/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
import pandas as pd
data = gutenberg.raw('shakespeare-macbeth.txt')
with open('macbeth.txt', 'w') as f:
    f.write(data)

In [3]:
## Data preprocessing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

2025-12-10 16:52:09.804957: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-10 16:52:09.848976: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-10 16:52:10.973122: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
with open('macbeth.txt', 'r') as f:
    text = f.read().lower()

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

total_words

3553

In [5]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'i': 5,
 'a': 6,
 'that': 7,
 'my': 8,
 'you': 9,
 'in': 10,
 'is': 11,
 'not': 12,
 'it': 13,
 'with': 14,
 'his': 15,
 'be': 16,
 'macb': 17,
 'your': 18,
 'our': 19,
 'haue': 20,
 'but': 21,
 'me': 22,
 'he': 23,
 'for': 24,
 'what': 25,
 'this': 26,
 'all': 27,
 'so': 28,
 'him': 29,
 'as': 30,
 'thou': 31,
 'we': 32,
 'enter': 33,
 'which': 34,
 'are': 35,
 'will': 36,
 'they': 37,
 'shall': 38,
 'no': 39,
 'then': 40,
 'macbeth': 41,
 'their': 42,
 'thee': 43,
 'vpon': 44,
 'on': 45,
 'macd': 46,
 'from': 47,
 'yet': 48,
 'thy': 49,
 'vs': 50,
 'come': 51,
 'king': 52,
 'now': 53,
 'at': 54,
 'hath': 55,
 'more': 56,
 'by': 57,
 'good': 58,
 'rosse': 59,
 'them': 60,
 'lady': 61,
 'would': 62,
 'time': 63,
 'was': 64,
 'do': 65,
 'who': 66,
 'like': 67,
 'her': 68,
 'if': 69,
 'should': 70,
 'did': 71,
 'when': 72,
 'there': 73,
 'say': 74,
 'were': 75,
 'where': 76,
 'doe': 77,
 'lord': 78,
 'make': 79,
 'or': 80,
 '1': 81,
 'must': 82,

In [6]:
#Create input sequences
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

input_sequences

[[1, 885],
 [1, 885, 4],
 [1, 885, 4, 41],
 [1, 885, 4, 41, 57],
 [1, 885, 4, 41, 57, 1388],
 [1, 885, 4, 41, 57, 1388, 1389],
 [1, 885, 4, 41, 57, 1388, 1389, 1390],
 [418, 1391],
 [418, 1391, 1392],
 [418, 1391, 1392, 419],
 [270, 2],
 [270, 2, 886],
 [270, 2, 886, 33],
 [270, 2, 886, 33, 196],
 [270, 2, 886, 33, 196, 298],
 [81, 72],
 [81, 72, 38],
 [81, 72, 38, 32],
 [81, 72, 38, 32, 196],
 [81, 72, 38, 32, 196, 336],
 [81, 72, 38, 32, 196, 336, 131],
 [10, 270],
 [10, 270, 886],
 [10, 270, 886, 80],
 [10, 270, 886, 80, 10],
 [10, 270, 886, 80, 10, 1393],
 [128, 72],
 [128, 72, 1],
 [128, 72, 1, 1394],
 [128, 72, 1, 1394, 1395],
 [128, 72, 1, 1394, 1395, 84],
 [72, 1],
 [72, 1, 1396],
 [72, 1, 1396, 365],
 [72, 1, 1396, 365, 2],
 [72, 1, 1396, 365, 2, 887],
 [135, 7],
 [135, 7, 36],
 [135, 7, 36, 16],
 [135, 7, 36, 16, 172],
 [135, 7, 36, 16, 172, 1],
 [135, 7, 36, 16, 172, 1, 299],
 [135, 7, 36, 16, 172, 1, 299, 4],
 [135, 7, 36, 16, 172, 1, 299, 4, 666],
 [81, 76],
 [81, 76, 1],


In [7]:
##apply pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0,    1,  885],
       [   0,    0,    0, ...,    1,  885,    4],
       [   0,    0,    0, ...,  885,    4,   41],
       ...,
       [   0,    0,    0, ..., 3552,    1,  885],
       [   0,    0,    0, ...,    1,  885,    4],
       [   0,    0,    0, ...,  885,    4,   41]],
      shape=(15245, 14), dtype=int32)

In [8]:
## Split into train and test sets
# Force CPU to avoid GPU JIT issues
# Add this as the first cell in the notebook, before any TensorFlow imports
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import tensorflow as tf
tf.config.set_visible_devices([], 'GPU')  # Explicitly disable GPU
X,y = input_sequences[:,:-1], input_sequences[:,-1]

2025-12-10 16:52:11.667267: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2025-12-10 16:52:11.667475: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:160] env: CUDA_VISIBLE_DEVICES="-1"
2025-12-10 16:52:11.667483: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] CUDA_VISIBLE_DEVICES is set to -1 - this hides all GPUs from CUDA
2025-12-10 16:52:11.667487: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:171] verbose logging is disabled. Rerun with verbose logging (usually --v=1 or --vmodule=cuda_diagnostics=1) to get more diagnostic output from this module
2025-12-10 16:52:11.667490: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:176] retrieving CUDA diagnostic information for host: CHENWJYHD6Q3
2025-12-10 16:52:11.667493: I external/local_xla/xla/stream_executor/cuda/

In [9]:
X

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  885],
       [   0,    0,    0, ...,    1,  885,    4],
       ...,
       [   0,    0,    0, ...,    0, 3552,    1],
       [   0,    0,    0, ..., 3552,    1,  885],
       [   0,    0,    0, ...,    1,  885,    4]],
      shape=(15245, 13), dtype=int32)

In [10]:
y

array([885,   4,  41, ..., 885,   4,  41], shape=(15245,), dtype=int32)

In [11]:
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(15245, 3553))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
#Train LSTM 

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input

# Build the model
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Embedding(total_words, 100))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

#Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [17]:
#Train the LSTM model
history = model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), verbose=1)

Epoch 1/100
[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 32ms/step - accuracy: 0.2141 - loss: 3.8657 - val_accuracy: 0.0469 - val_loss: 9.4006
Epoch 2/100
[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 32ms/step - accuracy: 0.2268 - loss: 3.7931 - val_accuracy: 0.0485 - val_loss: 9.5475
Epoch 3/100
[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 32ms/step - accuracy: 0.2396 - loss: 3.7188 - val_accuracy: 0.0495 - val_loss: 9.6334
Epoch 4/100
[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 32ms/step - accuracy: 0.2514 - loss: 3.6567 - val_accuracy: 0.0489 - val_loss: 9.7326
Epoch 5/100
[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 32ms/step - accuracy: 0.2613 - loss: 3.5803 - val_accuracy: 0.0449 - val_loss: 9.8426
Epoch 6/100
[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 32ms/step - accuracy: 0.2698 - loss: 3.5264 - val_accuracy: 0.0469 - val_loss: 9.9407
Epoch 7/10

In [18]:
#Function to predict next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)    
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [19]:
input_text = "to be or not to"
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Input Text: '{input_text}' -> Predicted Next Word: '{next_word}'")

Input Text: 'to be or not to' -> Predicted Next Word: 'him'


In [20]:
#Save the model
model.save('lstm_macbeth_model.h5')

#save tokenizer
import pickle
with open('tokenizer_macbeth.pkl', 'wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)



In [21]:
input_text = "that doe cling"
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Input Text: '{input_text}' -> Predicted Next Word: '{next_word}'")

Input Text: 'that doe cling' -> Predicted Next Word: 'or'


In [23]:
#GRU Model
from tensorflow.keras.layers import GRU

# Build the model
model_gru = Sequential()
model_gru.add(Input(shape=(X_train.shape[1],)))
model_gru.add(Embedding(total_words, 100))
model_gru.add(GRU(150, return_sequences=True))
model_gru.add(Dropout(0.2))
model_gru.add(GRU(100))
model_gru.add(Dense(total_words, activation='softmax'))

#Compile the model
model_gru.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model_gru.summary()