In [None]:
import nltk
import re
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd

# Load the raw text of Hamlet from the NLTK Gutenberg corpus.
data = gutenberg.raw('shakespeare-hamlet.txt')

# Save a local copy. The encoding is pinned so that the write and the read
# below agree on every platform instead of depending on the locale default.
with open('hamlet.txt', 'w', encoding='utf-8') as file:
  file.write(data)

# Read the copy back, lower-cased. `faqs` is the corpus the tokenizer and the
# sequence-building cell work from.
with open('hamlet.txt', 'r', encoding='utf-8') as file:
  faqs = file.read().lower()

# Letters-only variant of the corpus: every non-letter becomes a space, then
# runs of whitespace collapse to a single space.
# NOTE(review): no visible cell reads `cleaned_text`; the tokenizer is fitted
# on `faqs`. Kept so any later use outside this view keeps working.
cleaned_text = re.sub('[^a-zA-Z]', ' ', faqs)
cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
# Build the word -> integer index from the lower-cased corpus held in `faqs`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([faqs])

In [None]:
# Number of output classes: one per entry in the tokenizer's word index, plus
# one extra slot (the later recomputation of this value labels it as padding).
vocab_size=len(tokenizer.word_index)+1
# Show the value.
vocab_size

In [None]:
# Tokenize every '.'-delimited chunk of the corpus in a single call.
tokenized_chunks = tokenizer.texts_to_sequences(faqs.split('.'))

# For each chunk with at least two tokens, emit every prefix of length >= 2.
# The last token of a prefix is later used as the word to predict from the
# tokens before it.
input_sequences = [
  tokens[:end]
  for tokens in tokenized_chunks
  if len(tokens) >= 2
  for end in range(2, len(tokens) + 1)
]

In [None]:
# Preview only the first few sequences; displaying the whole list floods the
# cell output and bloats the saved notebook.
input_sequences[:5]

In [None]:
# Length of the longest n-gram sequence; every sequence is padded to this.
max_len = max(map(len, input_sequences))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad every sequence at the front ('pre') up to max_len, so all rows have the
# same length and each sequence's final token sits in the last column.
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [None]:
# Inspect the padded sequences.
padded_input_sequences

In [None]:
# Model inputs: every column of each padded row except the last one.
x = padded_input_sequences[:,:-1]

In [None]:
# Targets: the last column of each padded row (the word to predict).
y = padded_input_sequences[:,-1]

In [None]:
# Shape of the input matrix.
x.shape

In [None]:
# Shape of the target vector (still integer word indices at this point).
y.shape

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
# One-hot encode the targets over the whole vocabulary.
# The encoding reads the integer target column of padded_input_sequences
# instead of `y` itself, so re-running this cell gives the same result rather
# than one-hot encoding an already one-hot matrix.
y = to_categorical(padded_input_sequences[:, -1], num_classes=vocab_size)

In [None]:
# Shape of the targets after one-hot encoding.
y.shape

In [None]:
# Recomputes vocab_size with the same expression used earlier in the notebook,
# so the value is unchanged.
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
# Show the value.
vocab_size

In [None]:
# Hold out 20% of the samples for validation. The seed is fixed so the split
# (and therefore the reported validation metrics) is reproducible across runs.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,shuffle=True,random_state=42)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
# Stop training once val_loss has gone 5 epochs without improving, and restore
# the weights from the best epoch.
earlystoping=EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)

In [None]:
from tensorflow.keras import Input

# Input width comes from the data (padded length minus the target column)
# rather than a hard-coded 56, so the model always matches the shape of `x`.
sequence_length = max_len - 1

model = Sequential()
# An explicit Input layer replaces Embedding's `input_length` argument, which
# newer Keras releases deprecate.
model.add(Input(shape=(sequence_length,)))
model.add(Embedding(vocab_size, 100))
# First LSTM returns the full sequence so the second LSTM has a sequence to read.
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
# One softmax output per vocabulary index.
model.add(Dense(vocab_size, activation='softmax'))

In [None]:
# Categorical cross-entropy pairs with the one-hot targets built with
# to_categorical; accuracy is tracked as an extra metric.
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [None]:

# Print the model summary.
model.summary()

In [None]:
# Train on the training split only. Fitting on the full `x, y` would include
# the validation rows in training, making val_loss (and the early stopping
# that monitors it) meaningless.
model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=100,callbacks=[earlystoping])

In [None]:
import time
import numpy as np
# Seed passage for generation; the loop that follows tokenizes it and appends
# predicted words to it.
text = """
ar. Horatio saies, 'tis but our Fantasie,
And will not let beleefe take hold of him
Touching this dreaded sight, twice seene of vs,
Therefore I haue intreated him along
With vs, to watch the minutes of this Night,
That if againe this Apparition """

# Greedy generation: predict the most likely next word, append it to `text`,
# and feed the extended text back in, for up to 10 words.

# Model input width, derived from the training data instead of a hard-coded 56.
seq_len = max_len - 1

for i in range(10):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding (with the default truncation, only the last seq_len tokens are
  # kept when the text is longer than the model input)
  padded_token_text = pad_sequences([token_text], maxlen=seq_len, padding='pre')
  # predict (verbose=0 keeps the per-call progress bar out of the output)
  pos = int(np.argmax(model.predict(padded_token_text, verbose=0)))

  # Direct index -> word lookup instead of scanning the whole word_index.
  word = tokenizer.index_word.get(pos)
  if word is None:
    # The predicted index has no word (e.g. the padding index 0). The text
    # would not change, so further iterations would repeat this prediction.
    break

  text = text + " " + word
  print(text)
  time.sleep(2)

In [52]:
from google.colab import drive
# Mount Google Drive at /content/drive so the files below land in Drive.
drive.mount('/content/drive')

# Save to your Drive path
model.save('/content/drive/MyDrive/LSTM_Next_Word_Prediction_Model.keras')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# List the entries in the Drive folder.
os.listdir('/content/drive/MyDrive')


In [None]:
import pickle

# Persist the fitted tokenizer in the same Drive folder the model is saved to.
tokenizer_path = '/content/drive/MyDrive/tokenizer.pkl'
with open(tokenizer_path, 'wb') as file:
  pickle.dump(tokenizer, file)

In [53]:
import os

# Print each entry of the Drive folder on its own line.
drive_entries = os.listdir('/content/drive/MyDrive')
for entry_name in drive_entries:
  print(entry_name)


1661-0.txt
Colab Notebooks
LSTM_Next_Word_Prediction_Model.h5
tokenizer
tokenizer.pkl
LSTM_Next_Word_Prediction_Model.keras
