In [12]:
import nltk
from nltk.corpus import gutenberg
nltk.download('gutenberg')
nltk.download('punkt')

# Load the Hamlet text
#text = gutenberg.raw('shakespeare-hamlet.txt')
text = gutenberg.raw('austen-sense.txt')[:10000]

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
#Data processing
# Basic cleanup \r character takes the cursor to the beginning of the line, not to the next line.
text = text.lower().replace('\n', ' ').replace('\r', ' ')
tokenizer = Tokenizer()
#Neural Networks work on Numbers not words. Hence, each word in a sentence is assigned an index number.
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1
print("Total unique words:", total_words)

Total unique words: 581


In [14]:
input_sequences = []
# say the sentence was "to be or not to be". tokenizer.fit_to_texts() gave the word index dictionary {to:1,be:2,or:3,not:4}.
# Now, Tokenizer.texts_to_sequences will give a list of the index for the text [1,2,3,4,1,2]
token_list = tokenizer.texts_to_sequences([text])[0]

# Create sequences like [w1, w2, w3], ... At least 3 words should be there so that there are minimum 2 words to predict the third.
for i in range(2, len(token_list)):
    n_gram_sequence = token_list[:i+1]
    input_sequences.append(n_gram_sequence)

In [15]:
# Pad the sequences to have same length
max_seq_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
#Input_sequence has the datatype array here.

In [16]:
# Split into input (X) and output (label y)
X = input_sequences[:, :-1] #All rows and all columns except the last column
labels = input_sequences[:, -1] #Only the last column
y = to_categorical(labels, num_classes=total_words) #Convert the words into classes.

In [17]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(total_words, 100, input_length=max_seq_len-1),
    tf.keras.layers.LSTM(150),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [19]:
from tensorflow.keras.callbacks import EarlyStopping

# Define EarlyStopping
early_stop = EarlyStopping(
    monitor='val_loss',       # Watch validation loss
    patience=5,               # Wait 5 epochs for improvement
    restore_best_weights=True  # Restore weights from the best epoch
)

In [20]:
# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    verbose=1,
    callbacks=[early_stop]
)


Epoch 1/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 2s/step - accuracy: 0.0230 - loss: 6.2249 - val_accuracy: 0.0343 - val_loss: 5.9445
Epoch 2/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 2s/step - accuracy: 0.0352 - loss: 5.4905 - val_accuracy: 0.0371 - val_loss: 6.1372
Epoch 3/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 2s/step - accuracy: 0.0450 - loss: 5.3628 - val_accuracy: 0.0343 - val_loss: 6.2370
Epoch 4/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 2s/step - accuracy: 0.0462 - loss: 5.3387 - val_accuracy: 0.0400 - val_loss: 6.2569
Epoch 5/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 2s/step - accuracy: 0.0644 - loss: 5.2131 - val_accuracy: 0.0343 - val_loss: 6.1710
Epoch 6/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 2s/step - accuracy: 0.0473 - loss: 5.1126 - val_accuracy: 0.0600 - val_loss: 6.2028


In [21]:
# Save the model
model.save('next_word_model.h5')
# Save the tokenizer into a pickle file
import pickle
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

