In [1]:
!pip install nltk



In [1]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd
import numpy as np

data = gutenberg.raw('shakespeare-hamlet.txt')

with open('hamlet.txt', 'w') as file:
    file.write(data)




[nltk_data] Downloading package gutenberg to C:\Users\ankush kumar
[nltk_data]     singh\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

with open('hamlet.txt', 'r') as file:
    text = file.read().lower()

Tokenizerizer = Tokenizer()
Tokenizerizer.fit_on_texts([text])
total_words = len(Tokenizerizer.word_index) + 1

total_words

4818

In [6]:
# create sequence fromthe corpus of  the text 
input_sequences = []
for line in text.split('\n'):
    token_list = Tokenizerizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

        

In [8]:
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [7]:
input_sequences = pad_sequences(input_sequences, maxlen=50, padding='pre')      

In [9]:
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [35]:
import tensorflow as tf
x,y = input_sequences[:,:-1], input_sequences[:,-1]

In [36]:
max_len = max([len(x) for x in input_sequences])
max_len

50

In [37]:
x


array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       ...,
       [   0,    0,    0, ...,  687,    4,   45],
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4]], dtype=int32)

In [38]:
y

array([ 687,    4,   45, ..., 1047,    4,  193], dtype=int32)

In [39]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [40]:
# now the main part where we will train the LSTM model 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional,Dropout

# define the model as per the model architecture thet we want to use to train our LSTM model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_len))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))


In [41]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [42]:
model.build((None, x.shape[1]))
model.summary()

In [46]:
history=model.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test),verbose=1)


Epoch 1/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 98ms/step - accuracy: 0.4103 - loss: 2.6821 - val_accuracy: 0.0507 - val_loss: 11.4692
Epoch 2/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 116ms/step - accuracy: 0.4204 - loss: 2.6358 - val_accuracy: 0.0507 - val_loss: 11.5927
Epoch 3/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 82ms/step - accuracy: 0.4260 - loss: 2.6112 - val_accuracy: 0.0536 - val_loss: 11.6459
Epoch 4/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 119ms/step - accuracy: 0.4281 - loss: 2.5787 - val_accuracy: 0.0513 - val_loss: 11.7026
Epoch 5/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 58ms/step - accuracy: 0.4383 - loss: 2.5467 - val_accuracy: 0.0503 - val_loss: 11.8175
Epoch 6/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 103ms/step - accuracy: 0.4460 - loss: 2.5117 - val_accuracy: 0.0511 - val_loss: 11.8569
Epoch 7

In [47]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [48]:
# function to predict the next word according to the exact target we want to achieve


def predict_next_word(model, tokenizer, text_sequence, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text_sequence])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [None]:
# text->1
input_text = "to be or not to"
print(input_text)
max_seq = model.input_shape[1]+1
next1_word = predict_next_word(model, Tokenizerizer, input_text, max_seq)
print(f"Input Text: {input_text}\nPredicted Next Word: {next1_word}")

to be or not to
Input Text: to be or not to
Predicted Next Word: hold


In [53]:
# save the trained LSTM model

model.save('next_word_lstm_model.h5')

# save in the file
import pickle
with open('tokenizer_hamlet.pickle', 'wb') as handle:
    pickle.dump(Tokenizerizer, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [54]:
# text->2 
input_text = "prevention is better than"
print(input_text)
max_seq = model.input_shape[1]+1
next1_word = predict_next_word(model, Tokenizerizer, input_text, max_seq)

print(f"Input Text: {input_text}\nPredicted Next Word: {next1_word}")


prevention is better than
Input Text: prevention is better than
Predicted Next Word: a


In [55]:
# text->3
input_text = "all that glitters is not"
print(input_text)
max_seq = model.input_shape[1]+1
next1_word = predict_next_word(model, Tokenizerizer, input_text, max_seq)
print(f"Input Text: {input_text}\nPredicted Next Word: {next1_word}")


all that glitters is not
Input Text: all that glitters is not
Predicted Next Word: beteene
