In [1]:
import nltk
# Fetch the 'gutenberg' NLTK data package; the Hamlet text is read from it via gutenberg.raw() in a later cell.
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
# Pull the complete text of Hamlet out of the NLTK Gutenberg corpus.
data = gutenberg.raw(fileids='shakespeare-hamlet.txt')

In [3]:
# Save the raw play text to hamlet.txt; the next cell reloads it from this file.
with open('hamlet.txt', mode='w') as file:
  file.writelines([data])

In [4]:
# Reload the saved play and lower-case it before it is tokenized.
with open('hamlet.txt') as file:
  text = file.read()
text = text.lower()

In [5]:
# Fit the tokenizer on the whole play (passed as a single document) to build its word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=[text])

In [6]:
# Number of distinct words plus one; the extra slot covers index 0, which the
# padded sequences further down use as filler.
total_words = 1 + len(tokenizer.word_index)
total_words

4818

In [7]:
# For each line of the play, collect every prefix of its token ids that has at
# least two tokens. In each prefix the last id is the word to predict and the
# ids before it are the context.
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(text.split('\n'))
    for end in range(2, len(token_ids) + 1)
]

In [8]:
# Length of the longest n-gram; every sequence is padded to this width below.
max_sequence_len = max(map(len, input_sequences))

In [9]:
# Display the longest n-gram length computed in the previous cell.
max_sequence_len

14

In [10]:
# Left-pad every n-gram with zeros up to max_sequence_len so that all rows have
# the same width and the target word always sits in the last column.
input_sequences = np.array(
    pad_sequences(
        sequences=input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [11]:
import tensorflow as tf

# Context = every column except the last; target = the last column of each row.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [12]:
# One-hot encode the target word ids (the last column of the padded sequences)
# over the whole vocabulary. The targets are taken from `input_sequences`
# rather than from `y` itself so that re-running this cell is idempotent:
# feeding an already one-hot `y` back into to_categorical would silently
# produce a 3-D array.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes = total_words)

In [13]:
# Hold out 20% of the n-grams for validation. A fixed random_state makes the
# split, and therefore the reported validation metrics, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

In [14]:
# Train the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


In [15]:
# Network: word embedding -> LSTM emitting a full sequence -> dropout ->
# second LSTM -> softmax over the whole vocabulary.
model = Sequential([
    Embedding(total_words, 100),
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])

In [16]:
# Targets are one-hot vectors, hence categorical cross-entropy; accuracy is
# tracked alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [17]:
# Train for 100 epochs, evaluating on the held-out split after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 12ms/step - accuracy: 0.0275 - loss: 7.1494 - val_accuracy: 0.0356 - val_loss: 6.6996
Epoch 2/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - accuracy: 0.0352 - loss: 6.4404 - val_accuracy: 0.0455 - val_loss: 6.7856
Epoch 3/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.0420 - loss: 6.3426 - val_accuracy: 0.0519 - val_loss: 6.8522
Epoch 4/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.0496 - loss: 6.1844 - val_accuracy: 0.0517 - val_loss: 6.8818
Epoch 5/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.0526 - loss: 6.0718 - val_accuracy: 0.0542 - val_loss: 6.8969
Epoch 6/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.0643 - loss: 5.9079 - val_accuracy: 0.0591 - val_loss: 6.9044
Epoch 7/100

In [18]:
# Write the trained network to disk in the native .keras format.
model.save(filepath='next_word_lstm.keras')

In [23]:
import pickle

# Serialize the fitted tokenizer next to the model so the same word index can
# be reused when predicting.
with open('tokenizer.pickle', mode='wb') as handle:
  handle.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))

In [21]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
  """Return the word the model considers most likely to follow `text`.

  Args:
    model: Trained model; `model.predict` is called on a single row of
      `max_sequence_len - 1` token ids.
    tokenizer: Fitted tokenizer providing `texts_to_sequences` and the
      `index_word` reverse lookup.
    text: Seed text whose next word should be predicted.
    max_sequence_len: Width of the padded training sequences including the
      target word, so the context window is one token shorter.

  Returns:
    The predicted word, or None when the highest-scoring index has no entry
    in the tokenizer's vocabulary (for example the padding index 0).
  """
  token_list = tokenizer.texts_to_sequences([text])[0]
  # Keep only the most recent tokens when the seed is longer than the context window.
  if len(token_list) >= max_sequence_len:
    token_list = token_list[-(max_sequence_len - 1):]
  token_list = pad_sequences([token_list], maxlen = max_sequence_len - 1, padding = 'pre')
  predicted = model.predict(token_list, verbose = 0)
  # predicted holds one row of scores; reduce the arg-max to a plain int so it
  # can be used as a dictionary key instead of comparing an int to an array.
  predicted_word_index = int(np.argmax(predicted, axis = 1)[0])
  # Direct reverse lookup replaces a linear scan over the whole vocabulary.
  return tokenizer.index_word.get(predicted_word_index)

In [25]:
# Try the model on a sample phrase. The context width is read back from the
# model's input shape, and one is added for the target word.
input_text = "let beleefe take hold of"
print(f"Input text: {input_text}")
max_sequence_len = 1 + model.input_shape[1]
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next word: {next_word}")

Input text: let beleefe take hold of
Next word: him
