In [23]:
## Data Collection
# Gutenberg corpus = a collection of classic books and plays used for NLP practice.(inside that there is a book called Shakespeare’s Hamlet)
# import nltk
# nltk.download('gutenberg')  #Downloads the Gutenberg corpus from NLTK’s online resources,You only need to run this once, and NLTK will save it locally.
# from nltk.corpus import gutenberg
# import  pandas as pd

# ## load the dataset
# data=gutenberg.raw('shakespeare-hamlet.txt')  #gives you the entire text of Hamlet as a single string.(including points,!,? all).

# ## save to a file
# with open('hamlet.txt','w') as file:
#     file.write(data)

In [24]:
## Data Preprocessing

import numpy as np
from keras.preprocessing.text import Tokenizer   #Tokenizer-->paragraph to sentence or word
from keras.preprocessing.sequence import pad_sequences  #pad_sequences-->making all sentence of same length
from sklearn.model_selection import train_test_split

##load the dataset
with open('hamlet.txt','r') as file:
    text=file.read().lower()

## Tokenize the text-creating indexes for words
tokenizer=Tokenizer()  
tokenizer.fit_on_texts([text])   #converting the word into number based on  assigns smaller index to more frequent word--> and it got stored in--> tokenizer.word_index
 
print(tokenizer.word_index)

tokenizer.word_index
total_words=len(tokenizer.word_index)+1   #index 0 is reserved for padding,and actual words start from index 1.
total_words



{'the': 1, 'and': 2, 'a': 3, 'to': 4, 'of': 5, 'in': 6, 'is': 7, 'with': 8, 'that': 9, 'he': 10, 'was': 11, 'her': 12, 'for': 13, 'his': 14, 'are': 15, 'on': 16, 'she': 17, 'from': 18, 'abhay': 19, 'as': 20, 'than': 21, 'every': 22, 'its': 23, 'india': 24, 'it': 25, 'about': 26, 'like': 27, 'but': 28, 'forest': 29, 'hidden': 30, 'courage': 31, 'world': 32, 'him': 33, 'wisdom': 34, 'were': 35, 'stories': 36, 'into': 37, 'people': 38, 'had': 39, 'very': 40, 'new': 41, 'their': 42, 'through': 43, 'animals': 44, 'who': 45, 'at': 46, 'learning': 47, 'can': 48, 'one': 49, 'by': 50, 'small': 51, 'not': 52, 'knowledge': 53, 'life': 54, 'light': 55, 'an': 56, 'many': 57, 'has': 58, 'also': 59, 'sky': 60, 'have': 61, 'ancient': 62, 'stars': 63, 'adventure': 64, 'village': 65, 'over': 66, 'water': 67, 'be': 68, 'magical': 69, 'selina': 70, 'night': 71, 'curiosity': 72, 'day': 73, 'elias': 74, 'time': 75, 'kindness': 76, 'arin': 77, 'kiran': 78, 'long': 79, 'children': 80, 'became': 81, 'tree': 82

3027

In [25]:
## create inoput sequences
input_sequences=[]  #empty Python list
for line in text.split('\n'):   #split the text into sentences based on "\n", wherever "\n-->newLine" comes then split the sentence
    token_list=tokenizer.texts_to_sequences([line])[0]    #Convert words into numbers using tokenizer.(it is a 2D array for getting 1D we are using [0] element 1 or 2d array)
    for i in range(1,len(token_list)):
        n_gram_sequence=token_list[:i+1]  #n-grams help the LSTM understand patterns in sequences, and predict the next word.
        input_sequences.append(n_gram_sequence)

        

In [26]:
input_sequences

[[207, 47],
 [207, 47, 7],
 [207, 47, 7, 3],
 [207, 47, 7, 3, 965],
 [207, 47, 7, 3, 965, 5],
 [207, 47, 7, 3, 965, 5, 382],
 [207, 47, 7, 3, 965, 5, 382, 47],
 [207, 47, 7, 3, 965, 5, 382, 47, 9],
 [207, 47, 7, 3, 965, 5, 382, 47, 9, 679],
 [207, 47, 7, 3, 965, 5, 382, 47, 9, 679, 454],
 [207, 47, 7, 3, 965, 5, 382, 47, 9, 679, 454, 455],
 [1514, 454],
 [1514, 454, 455],
 [1514, 454, 455, 15],
 [1514, 454, 455, 15, 95],
 [1514, 454, 455, 15, 95, 13],
 [1514, 454, 455, 15, 95, 13, 1515],
 [1514, 454, 455, 15, 95, 13, 1515, 274],
 [1514, 454, 455, 15, 95, 13, 1515, 274, 2],
 [1514, 454, 455, 15, 95, 13, 1515, 274, 2, 75],
 [1514, 454, 455, 15, 95, 13, 1515, 274, 2, 75, 1516],
 [1514, 454, 455, 15, 95, 13, 1515, 274, 2, 75, 1516, 966],
 [1517, 455],
 [1517, 455, 15],
 [1517, 455, 15, 113],
 [1517, 455, 15, 113, 13],
 [1517, 455, 15, 113, 13, 680],
 [1517, 455, 15, 113, 13, 680, 79],
 [1517, 455, 15, 113, 13, 680, 79, 1518],
 [1517, 455, 15, 113, 13, 680, 79, 1518, 1519],
 [1517, 455, 15,

In [27]:
## Pad Sequences
max_sequence_len=max([len(x) for x in input_sequences])
max_sequence_len

66

In [28]:
input_sequences=np.array(pad_sequences(input_sequences,maxlen=max_sequence_len,padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0,  207,   47],
       [   0,    0,    0, ...,  207,   47,    7],
       [   0,    0,    0, ...,   47,    7,    3],
       ...,
       [   0,    0,    0, ...,    1, 3026,   16],
       [   0,    0,    0, ..., 3026,   16,    1],
       [   0,    0,    0, ...,   16,    1,  964]])

In [29]:
##create predicitors and label
import tensorflow as tf
x,y=input_sequences[:,:-1],input_sequences[:,-1]   #[:,:-1]-->Take every word except last word and [:,-1]--> Take the last word only

In [30]:
x

array([[   0,    0,    0, ...,    0,    0,  207],
       [   0,    0,    0, ...,    0,  207,   47],
       [   0,    0,    0, ...,  207,   47,    7],
       ...,
       [   0,    0,    0, ...,    6,    1, 3026],
       [   0,    0,    0, ...,    1, 3026,   16],
       [   0,    0,    0, ..., 3026,   16,    1]])

In [31]:
y

array([ 47,   7,   3, ...,  16,   1, 964])

In [32]:
y=tf.keras.utils.to_categorical(y,num_classes=total_words)   # jaha jaha par bhi y  words hoga vocabolry ke size mai waha bass 1 rahega baki sari jghe 0 bana dega --->eg if y = [2, 4]  num_class=6(total vocabolary size) then[[0,0,1,0,0,0], # word index 2  and [0,0,0,0,1,0]    # word index 4]
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [33]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [34]:
# Define early stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [35]:
## Train our LSTM RNN

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU

## Define the model
model=Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))  #removing some features from the model(20 percent)
model.add(LSTM(100))
model.add(Dense(total_words,activation="softmax"))

# #Compile the model
model.compile(loss="categorical_crossentropy",optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 65, 100)           302700    
                                                                 
 lstm_2 (LSTM)               (None, 65, 150)           150600    
                                                                 
 dropout_1 (Dropout)         (None, 65, 150)           0         
                                                                 
 lstm_3 (LSTM)               (None, 100)               100400    
                                                                 
 dense_1 (Dense)             (None, 3027)              305727    
                                                                 
Total params: 859427 (3.28 MB)
Trainable params: 859427 (3.28 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [36]:
history=model.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test),verbose=1)  #Verbose = How much details to show on screen if verbose=1 (Shows training progress),if verbose=0(no output show)

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [37]:
# # GRU RNN
# # Define the model
# model=Sequential()
# model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
# model.add(GRU(150,return_sequences=True))
# model.add(Dropout(0.2))
# model.add(GRU(100))
# model.add(Dense(total_words,activation="softmax"))

# # #Compile the model
# model.compile(loss="categorical_crossentropy",optimizer='adam',metrics=['accuracy'])
# model.summary()

In [38]:
# # Train the model
# history=model.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test),verbose=1,callbacks=[early_stopping])


In [43]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]  #Converts the input text into a list of integers eg --->text = "I love" token_list = [1,2]  
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]  # Ensure the sequence length matches max_sequence_len-1
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)  #verbose=0 → turns off printing progress.
    predicted_word_index = np.argmax(predicted, axis=1)  #Finds the index of the word with the highest probability.
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [48]:
input_text="shark is a  "
print(f"Input text:{input_text}")
max_sequence_len=model.input_shape[1]+1
next_word=predict_next_word(model,tokenizer,input_text,max_sequence_len)
print(f"Next Word PRediction:{next_word}")

Input text:shark is a  
Next Word PRediction:fish


In [41]:
## Save the model
model.save("next_word_lstm.h5")
## Save the tokenizer
import pickle
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [50]:
input_text="Last night was a"
print(f"Input text:{input_text}")
max_sequence_len=model.input_shape[1]+1
next_word=predict_next_word(model,tokenizer,input_text,max_sequence_len)
print(f"Next Word PRediction:{next_word}")

Input text:Last night was a
Next Word PRediction:better
