In [10]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd

# Load the dataset
data = gutenberg.raw('shakespeare-hamlet.txt')
with open('hamlet.txt', 'w') as file:
    file.write(data)

[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## Load the dataset
with open('hamlet.txt', 'r') as f:
    text = f.read().lower()

# Tokenize the text and creating indexes for words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index)+1
total_words

2025-03-21 08:09:32.200495: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 08:09:33.072248: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 08:09:33.690234: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742544574.210324   48513 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742544574.463626   48513 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-21 08:09:36.826861: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

4818

In [2]:
## Create the input sequences
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0] # taking the tokens of the words
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [3]:
## Pad sequences
max_sequence = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence, padding='pre'))

In [4]:
input_sequences.shape

(25732, 14)

In [5]:
## Create predictors and label
import tensorflow as tf
x, y = input_sequences[:, :-1], input_sequences[:, -1]

In [6]:
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [7]:
## Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

In [8]:
## Train LSTM RNN

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input,  LSTM, Dense, Dropout
from tensorflow.keras import backend as K

# Clear the session
K.clear_session()

# Define the model
model = Sequential()
model.add(Input(shape=(max_sequence,)))
model.add(Embedding(total_words, 100, input_length=max_sequence))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

2025-03-21 08:09:50.936931: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [9]:
## Train the model
history = model.fit(x_train, y_train, epochs=50, validation_data=(x_test, y_test), verbose=1)

Epoch 1/50


2025-03-21 08:09:52.041325: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 396714120 exceeds 10% of free system memory.


[1m642/644[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 32ms/step - accuracy: 0.0294 - loss: 7.1543

2025-03-21 08:10:15.761548: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 99192984 exceeds 10% of free system memory.


[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 37ms/step - accuracy: 0.0294 - loss: 7.1531 - val_accuracy: 0.0315 - val_loss: 6.7364
Epoch 2/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 35ms/step - accuracy: 0.0379 - loss: 6.4464 - val_accuracy: 0.0392 - val_loss: 6.8329
Epoch 3/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 33ms/step - accuracy: 0.0458 - loss: 6.2956 - val_accuracy: 0.0488 - val_loss: 6.8543
Epoch 4/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 34ms/step - accuracy: 0.0535 - loss: 6.1611 - val_accuracy: 0.0478 - val_loss: 6.8865
Epoch 5/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 32ms/step - accuracy: 0.0591 - loss: 6.0361 - val_accuracy: 0.0484 - val_loss: 6.9442
Epoch 6/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 33ms/step - accuracy: 0.0590 - loss: 5.9134 - val_accuracy: 0.0560 - val_loss: 6.9593
Epoch 7/50
[1m644/644[0m 

In [16]:
reversed_word_index = {value : key for key, value in tokenizer.word_index.items()}

In [19]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):] # Ensure the sequence length mathces the max_lenght
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)[0]
    # for word, index in tokenizer.word_index.items():
    #     if index == predicted_word_index:
    #         return word
    return reversed_word_index.get(predicted_word_index, None)

In [30]:
input_text = 'it was the best of times and it was the'
print(f"Input : {input_text}")
max_sequence_len = model.input_shape[1]+1
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Next word prediction : {next_word}")

Input : it was the best of times and it was the
Next word prediction : maiesty


In [32]:
model.save('next_word_lstm.h5')
model.save('next_word_lstm.keras')

import pickle
# save the tokenizer
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

