## Data Collection-

In [1]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import  pandas as pd

data=gutenberg.raw('shakespeare-hamlet.txt')

with open('hamlet.txt','w') as file:
    file.write(data)

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/anshuljethani/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


## Data Preprocessing

In [40]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


with open('metamorphosis_clean.txt','r') as file:
    text=file.read().lower()

## Tokenize the text-creating indexes for words
tokenizer=Tokenizer()
tokenizer.fit_on_texts([text])
total_words=len(tokenizer.word_index)+1
total_words


2618

In [41]:
tokenizer.word_index

{'the': 1,
 'to': 2,
 'and': 3,
 'he': 4,
 'his': 5,
 'of': 6,
 'was': 7,
 'it': 8,
 'had': 9,
 'in': 10,
 'that': 11,
 'a': 12,
 'as': 13,
 'gregor': 14,
 'with': 15,
 'she': 16,
 'him': 17,
 'her': 18,
 'would': 19,
 'not': 20,
 'but': 21,
 'at': 22,
 'for': 23,
 'they': 24,
 'on': 25,
 'all': 26,
 'room': 27,
 'from': 28,
 'could': 29,
 'be': 30,
 'out': 31,
 'have': 32,
 'if': 33,
 'there': 34,
 'been': 35,
 "gregor's": 36,
 'so': 37,
 'father': 38,
 'sister': 39,
 'this': 40,
 'now': 41,
 'himself': 42,
 'door': 43,
 'then': 44,
 'back': 45,
 'mother': 46,
 'up': 47,
 'even': 48,
 'into': 49,
 'no': 50,
 'did': 51,
 'more': 52,
 'one': 53,
 'their': 54,
 'when': 55,
 'were': 56,
 'what': 57,
 'about': 58,
 'them': 59,
 'way': 60,
 'only': 61,
 'time': 62,
 'i': 63,
 'by': 64,
 'than': 65,
 'you': 66,
 'just': 67,
 'said': 68,
 'little': 69,
 'any': 70,
 'do': 71,
 'get': 72,
 'other': 73,
 'still': 74,
 'first': 75,
 'or': 76,
 'made': 77,
 'go': 78,
 'some': 79,
 'while': 80,
 's

#### Creating input-output sequences

In [42]:
input_sequences=[]
for line in text.split('\n'):
    token_list=tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence=token_list[:i+1]
        input_sequences.append(n_gram_sequence)


In [43]:
input_sequences

[[1313, 139],
 [1313, 139, 55],
 [1313, 139, 55, 14],
 [1313, 139, 55, 14, 93],
 [1313, 139, 55, 14, 93, 935],
 [1313, 139, 55, 14, 93, 935, 28],
 [1313, 139, 55, 14, 93, 935, 28, 1314],
 [1313, 139, 55, 14, 93, 935, 28, 1314, 936],
 [1313, 139, 55, 14, 93, 935, 28, 1314, 936, 4],
 [1313, 139, 55, 14, 93, 935, 28, 1314, 936, 4, 241],
 [42, 1315],
 [42, 1315, 10],
 [42, 1315, 10, 5],
 [42, 1315, 10, 5, 112],
 [42, 1315, 10, 5, 112, 49],
 [42, 1315, 10, 5, 112, 49, 12],
 [42, 1315, 10, 5, 112, 49, 12, 721],
 [42, 1315, 10, 5, 112, 49, 12, 721, 1316],
 [42, 1315, 10, 5, 112, 49, 12, 721, 1316, 4],
 [42, 1315, 10, 5, 112, 49, 12, 721, 1316, 4, 159],
 [42, 1315, 10, 5, 112, 49, 12, 721, 1316, 4, 159, 25],
 [5, 1317],
 [5, 1317, 84],
 [5, 1317, 84, 45],
 [5, 1317, 84, 45, 3],
 [5, 1317, 84, 45, 3, 33],
 [5, 1317, 84, 45, 3, 33, 4],
 [5, 1317, 84, 45, 3, 33, 4, 447],
 [5, 1317, 84, 45, 3, 33, 4, 447, 5],
 [5, 1317, 84, 45, 3, 33, 4, 447, 5, 85],
 [5, 1317, 84, 45, 3, 33, 4, 447, 5, 85, 12],
 

#### Now applying pad sequencing to make every sentence is of the same length

In [44]:
max_sequence_len=max([len(x) for x in input_sequences])
max_sequence_len

17

In [45]:
input_sequences=np.array(pad_sequences(input_sequences,maxlen=max_sequence_len,padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0, 1313,  139],
       [   0,    0,    0, ..., 1313,  139,   55],
       [   0,    0,    0, ...,  139,   55,   14],
       ...,
       [   0,    0,    0, ...,  770,   31,   18],
       [   0,    0,    0, ...,   31,   18,  930],
       [   0,    0,    0, ...,   18,  930,  131]], dtype=int32)

#### Creating predictors and label

In [46]:
import tensorflow as tf

In [47]:
x,y = input_sequences[:,:-1], input_sequences[:,-1]

In [48]:
x

array([[   0,    0,    0, ...,    0,    0, 1313],
       [   0,    0,    0, ...,    0, 1313,  139],
       [   0,    0,    0, ..., 1313,  139,   55],
       ...,
       [   0,    0,    0, ...,    3,  770,   31],
       [   0,    0,    0, ...,  770,   31,   18],
       [   0,    0,    0, ...,   31,   18,  930]], dtype=int32)

In [49]:
y

array([139,  55,  14, ...,  18, 930, 131], dtype=int32)

In [50]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [51]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### Splitting the data into -> Train and test datasets

In [52]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [54]:
## Train our LSTM RNN

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU

# Parameters
vocab_size = 10000  # example vocabulary size
embedding_dim = 128  # example embedding dimension
input_length = 100  # example input length (sequence length)
lstm_units = 150  # example LSTM units

## Define the model
model=Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words,activation="softmax"))

# #Compile the model
model.compile(loss="categorical_crossentropy",optimizer='adam',metrics=['accuracy'])

model.build((None , input_length))

model.summary()

#### Defining early stopping

In [55]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [81]:
history=model.fit(x_train,y_train, epochs=150, batch_size=64,validation_data=(x_test,y_test),verbose=1)

Epoch 1/150
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 40ms/step - accuracy: 0.3523 - loss: 2.7960 - val_accuracy: 0.1183 - val_loss: 8.2472
Epoch 2/150
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 44ms/step - accuracy: 0.3601 - loss: 2.7524 - val_accuracy: 0.1171 - val_loss: 8.2696
Epoch 3/150
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 42ms/step - accuracy: 0.3645 - loss: 2.7393 - val_accuracy: 0.1185 - val_loss: 8.3462
Epoch 4/150
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.3698 - loss: 2.7029 - val_accuracy: 0.1203 - val_loss: 8.3770
Epoch 5/150
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 41ms/step - accuracy: 0.3697 - loss: 2.6882 - val_accuracy: 0.1190 - val_loss: 8.4165
Epoch 6/150
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 40ms/step - accuracy: 0.3738 - loss: 2.6818 - val_accuracy: 0.1141 - val_loss: 8.4567
Epoch 7/15

In [82]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]  # Ensure the sequence length matches max_sequence_len-1
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [92]:
input_text="Can we go in that "
print(f"Input text:{input_text}")
max_sequence_len=model.input_shape[1]+1
next_word=predict_next_word(model,tokenizer,input_text,max_sequence_len)
print(f"Next Word Prediction: {next_word}")

Input text:Can we go in that 
Next Word Prediction: way


In [93]:
## Save the model
model.save("next_word_lstm.h5")
## Save the tokenizer
import pickle
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

