<a href="https://colab.research.google.com/github/adityaxgoswami/LSTM-RNN/blob/main/LSTM_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# %%
import pandas as pd
import nltk
nltk.download('gutenberg')

# %%
from nltk.corpus import gutenberg
data=gutenberg.raw('shakespeare-hamlet.txt')

#saving the file
with open('hamlet.txt','w') as file:
    file.write(data)

# %% [markdown]
# DATA PREPROCESSING

# %%
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# %%
tokenizer=Tokenizer()

with open('hamlet.txt', 'r') as file:
    text = file.read().lower()

# %% [markdown]
# applying the tokenizer from tensorflow to the while data

# %%
tokenizer.fit_on_texts([text])

# %% [markdown]
# calculating the total len or count of the words using the .word_index method and adding 1 bc indexing start with 0

# %%
total_words=len(tokenizer.word_index)+1

# %%
print(total_words)

# %% [markdown]
# input sequences - we need to split the dataset into input and output sequences for making our model trains
#
#

# %%
input_sequence=[]
for line in text.split('\n'): #splits every line from the text dataset
    token_list=tokenizer.texts_to_sequences([line])[0] #convetes into seq of numerical tokens
    for i in range(len(token_list)):
        n_grams_sequence = token_list[:i+1]  # takes elements from 0 to index i
        input_sequence.append(n_grams_sequence)

# %%
input_sequence

# %% [markdown]
# NOW WE WANT TO APPLY PADDING TO THE INP_SEQ GENERATED TO MAKE THEM IN SAME LENGTH

# %%
#FOR PADDING WE NEED THE MAX LENGTH
max_length = max([len(x) for x in input_sequence])
max_length

# %%
input_sequence=np.array(pad_sequences(input_sequence,maxlen=max_length))
#convertign to numpy arrays becoz ml or dl models work best in numpy arry not list

# %%
import tensorflow as tf
x,y=input_sequence[:,:-1],input_sequence[:,-1]

# %%
y

# %% [markdown]
# the place where the y is present is only set to 1

# %%
y=tf.keras.utils.to_categorical(y,num_classes=total_words)

# %%
y

# %%
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.29)

# %%
#TRAIN OUT LSTM RNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,Dropout

# %%
model=Sequential()
model.add(Embedding(total_words,100,input_length=max_length-1)) #100 dim and input same as inputs
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2)) #drop 20% of the neurons
model.add(LSTM(100))
model.add(Dense(total_words,activation='softmax'))

#COMPILING THE MODEL
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

# %%
model.build((None,max_length))

# %%
model.summary()

# %%
x_train.shape

# %%
history = model.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test),verbose=1)

# %%


# %%


# %%





[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


4818




Epoch 1/50
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 65ms/step - accuracy: 0.0324 - loss: 7.0960 - val_accuracy: 0.0310 - val_loss: 6.6515
Epoch 2/50
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 65ms/step - accuracy: 0.0391 - loss: 6.3801 - val_accuracy: 0.0468 - val_loss: 6.6880
Epoch 3/50
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 66ms/step - accuracy: 0.0460 - loss: 6.2217 - val_accuracy: 0.0536 - val_loss: 6.7391
Epoch 4/50
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 74ms/step - accuracy: 0.0578 - loss: 6.0667 - val_accuracy: 0.0570 - val_loss: 6.7172
Epoch 5/50
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 65ms/step - accuracy: 0.0579 - loss: 5.9273 - val_accuracy: 0.0565 - val_loss: 6.7261
Epoch 6/50
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 63ms/step - accuracy: 0.0657 - loss: 5.7786 - val_accuracy: 0.0605 - val_loss: 6.7922
Epoch 7/50
[1m6

In [2]:
def predict_next_word(model,tokenizer,text,max_length):
    token_list=tokenizer.texts_to_sequences([text])[0]
    if len(token_list)>=max_length:
      token_list = token_list[-(max_length-1):]
    token_list=pad_sequences([token_list],maxlen=max_length-1)
    predicted=model.predict(token_list,verbose=0)
    predicted_word=np.argmax(predicted,axis=1)
    for word,index in tokenizer.word_index.items():
      if index==predicted_word:
        return word
    return None

In [4]:
input_text="To be or not to be"
print(f"Input text: {input_text}")
max_length=model.input_shape[1]+1
predicted_word=predict_next_word(model,tokenizer,input_text,max_length)
print(f"Predicted word: {predicted_word}")

Input text: To be or not to be
Predicted word: so


In [6]:
from typing import Protocol
import pickle
model.save("model.h5")

with open("tokenizer.pkl",'wb') as file:
  pickle.dump(tokenizer,file,protocol=pickle.HIGHEST_PROTOCOL)

