In [1]:
import numpy as np
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
from nltk.corpus import stopwords

### Data Preprocessing
- Remove all stop words to just train on the words
- Tokenize the text into words.
- Create a vocabulary of unique words and map each word to an integer.
- Convert the text into sequences of integers representing words.
- Split the text into overlapping sequences of fixed length (e.g., 5 words per sequence).
- For each sequence, the target will be the next word in the sequence.	-> 20

In [1]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in STOPWORDS])

In [19]:
def data_processing():
    with open('Shakespeare_plays_dataset/t.txt', 'r') as file:
        text = file.read().lower()
    new_text = remove_stopwords(text)
    return new_text
cleaned_text = data_processing()

In [20]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([cleaned_text])       
total_words = len(tokenizer.word_index) + 1  # Total no,of unique words
word_index = tokenizer.word_index            # Results a dictionary where each word is mapped to a unique index (starting from 1)

In [21]:
# Create sequences of words (n-grams)
input_sequences = []
for line in cleaned_text.split('.'):  # Splitting by sentences
    token_list = tokenizer.texts_to_sequences([line])[0]
    
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# print(input_sequences)

In [22]:
max_sequence_len = 5
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

### Selection of Suitable Model Architecture and Training the Model with suitable metrics	


- `X (Features/Words)`: This contains all but the last word in each input sequence. In a sequence of words, X serves as the input for the model, which the model will learn from.
- `y (Targets)`: This contains the last word in each input sequence, which serves as the target or the expected output for the model to predict.

By setting up X and y this way, you enable the model to learn the relationship between the context (the preceding words) and the prediction (the next word). <br>

**One-Hot Encoding**: This converts the target labels (which are integers representing words) into a binary matrix. Each row corresponds to a word, where a 1 indicates the presence of the word at that index, and 0s elsewhere.

In [10]:
import pandas as pd 

# For each sequence, the target will be the next word in the sequence
X, y = input_sequences[:, :-1], input_sequences[:, -1]  # Features and targets
y = pd.get_dummies(y).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print("Training data shape:", X_train.shape)
# print("Training labels shape:", y_train.shape)
# print("Testing data shape:", X_test.shape)
# print("Testing labels shape:", y_test.shape)

In [15]:
print(y_train.shape)
print(y_test.shape)
print(total_words)

(111726, 13886)
(27932, 13886)
13993


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# Define the model
# A Sequential model is initialized, which allows us to build the neural network layer by layer.
model = Sequential()

# Add embedding layer (converts word indices to dense vectors)
model.add(Embedding(input_dim = total_words, output_dim = 300, input_length=max_sequence_len - 1))

# Add LSTM layer
model.add(LSTM(150, return_sequences=False))
model.add(Dropout(0.3))  # Prevent overfitting by randomly droping 30% of the neurons 

# Add Dense layer with softmax for next word prediction
# Dense layer - number of neurons equal to the total number of unique words
model.add(Dense(y_test.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Add early stopping to stop training when validation accuracy stops improving
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)


# Training the model
history = model.fit(X_train, y_train, epochs=35, batch_size=25, validation_data=(X_test, y_test), verbose=1)

Epoch 1/35
[1m4470/4470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 72ms/step - accuracy: 0.0166 - loss: 8.0998 - val_accuracy: 0.0215 - val_loss: 7.8308
Epoch 2/35
[1m4470/4470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m306s[0m 69ms/step - accuracy: 0.0267 - loss: 7.5603 - val_accuracy: 0.0308 - val_loss: 7.8269
Epoch 3/35
[1m4470/4470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m310s[0m 69ms/step - accuracy: 0.0407 - loss: 7.2058 - val_accuracy: 0.0373 - val_loss: 7.8764
Epoch 4/35
[1m4470/4470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 72ms/step - accuracy: 0.0573 - loss: 6.7147 - val_accuracy: 0.0407 - val_loss: 8.0222
Epoch 5/35
[1m4470/4470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 76ms/step - accuracy: 0.0915 - loss: 6.1347 - val_accuracy: 0.0405 - val_loss: 8.2651
Epoch 6/35
[1m4470/4470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 69ms/step - accuracy: 0.1369 - loss: 5.5389 - val_accuracy: 0.0392 - val_loss: 8.521

- Saving the model for later use and creating pickle files to link the model to the User Interface

In [59]:
model.save('lstm_model.h5')



In [61]:
from tensorflow.keras.models import load_model
model = load_model('lstm_model.h5')



In [63]:
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [68]:
def predict_next_word(model, tokenizer, seed_text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)
    predicted = np.argmax(predicted_probs, axis=-1)
    
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            return word
    return ""


seed_text = "Hello, my name could revenge you"

next_word = predict_next_word(model, tokenizer, seed_text, max_sequence_len=6)
print(f"Next word: {next_word}")


Next word: yet


### Bonus: Implement the LSTM model in such a way that it predict next few words (3) instead of just one word

In [65]:
def predict_next_words(model, tokenizer, seed_text, max_sequence_len, n_words=3):
    result = []
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted = np.argmax(predicted_probs, axis=-1)
        
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                next_word = word
                break
        result.append(next_word)
        seed_text += " " + next_word
    return result

# Example usage
next_words = predict_next_words(model, tokenizer, "Hello, my name", max_sequence_len=6, n_words=3)
print("Next words:", next_words)


Next words: ['could', 'revenge', 'you']
