In [1]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/Abhuday/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


In [2]:
import pandas as pd
data=gutenberg.raw('shakespeare-hamlet.txt')

with open ('hamlet.txt','w') as file:
    file.write(data)

In [47]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

with open ('hamlet.txt','r') as file:
    text=file.read().lower()

In [50]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts([text])
tokenizer.word_index
total_words=len(tokenizer.word_index)+1

In [51]:
for i,line in enumerate(text.split('\n')):
    print(f"{line} line {i}")

[the tragedie of hamlet by william shakespeare 1599] line 0
 line 1
 line 2
actus primus. scoena prima. line 3
 line 4
enter barnardo and francisco two centinels. line 5
 line 6
  barnardo. who's there? line 7
  fran. nay answer me: stand & vnfold line 8
your selfe line 9
 line 10
   bar. long liue the king line 11
 line 12
   fran. barnardo? line 13
  bar. he line 14
 line 15
   fran. you come most carefully vpon your houre line 16
 line 17
   bar. 'tis now strook twelue, get thee to bed francisco line 18
 line 19
   fran. for this releefe much thankes: 'tis bitter cold, line 20
and i am sicke at heart line 21
 line 22
   barn. haue you had quiet guard? line 23
  fran. not a mouse stirring line 24
 line 25
   barn. well, goodnight. if you do meet horatio and line 26
marcellus, the riuals of my watch, bid them make hast. line 27
enter horatio and marcellus. line 28
 line 29
  fran. i thinke i heare them. stand: who's there? line 30
  hor. friends to this ground line 31
 line 32
   mar.

In [37]:
input_sequence=[]
for line in text.split('\n'):
    token_list=tokenizer.texts_to_sequences([line])[0]
    # print(token_list)
    for i in range(1,len(token_list)):
        n_gram_sequences=token_list[:i+1]
        # print(n_gram_sequences)
        input_sequence.append(n_gram_sequences)

In [41]:
max_sequence_len=max([len(i) for i in input_sequence])
max_sequence_len

14

In [43]:
input_sequence=pad_sequences(input_sequence,maxlen=max_sequence_len)
input_sequence

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [45]:
x,y=input_sequence[:,:-1],input_sequence[:,-1]
x

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       ...,
       [   0,    0,    0, ...,  687,    4,   45],
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4]], dtype=int32)

In [52]:
y=tf.keras.utils.to_categorical(y,num_classes=total_words)

In [54]:
len(y[0])

4818

In [69]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
earlystopping=EarlyStopping(monitor='val_loss',
                            patience=5,
                            restore_best_weights=True)

In [67]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Embedding,Dense,Dropout

model=Sequential()
model.add(Embedding(input_dim=total_words,
                    output_dim=100,
                    input_length=max_sequence_len))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(units=total_words,activation="softmax"))

model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])



In [68]:
model.summary()

In [70]:
history=model.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test))

Epoch 1/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 76ms/step - accuracy: 0.0290 - loss: 7.1256 - val_accuracy: 0.0328 - val_loss: 6.7738
Epoch 2/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 76ms/step - accuracy: 0.0386 - loss: 6.4307 - val_accuracy: 0.0389 - val_loss: 6.8550
Epoch 3/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 95ms/step - accuracy: 0.0470 - loss: 6.3038 - val_accuracy: 0.0439 - val_loss: 6.8861
Epoch 4/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 64ms/step - accuracy: 0.0504 - loss: 6.1577 - val_accuracy: 0.0480 - val_loss: 6.9078
Epoch 5/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 57ms/step - accuracy: 0.0580 - loss: 5.9871 - val_accuracy: 0.0499 - val_loss: 6.9479
Epoch 6/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 76ms/step - accuracy: 0.0621 - loss: 5.8480 - val_accuracy: 0.0573 - val_loss: 7.0179
Epoch 7/50
[1m6

In [78]:
def predict_next_word(model,tokenizer,text,max_sequence_len):
    token_list=tokenizer.texts_to_sequences([text])[0]
    if(len(token_list))>=max_sequence_len:
        token_list=token_list[-(max_sequence_len):]
    input_sequence=pad_sequences([token_list],maxlen=max_sequence_len)
    prediction=model.predict(input_sequence)
    predicted_word_index=np.argmax(prediction,axis=1)
    for word,index in tokenizer.word_index.items():
        if index==predicted_word_index:
            return word

In [81]:
text="The Souldiours Musicke, and the rites of"
max_sequence_len=model.input_shape[1]+1
ans=predict_next_word(model,tokenizer,text,max_sequence_len)
print(ans)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
warre


In [83]:
model.save("LSTM_NEXTWORD_PREDICTOR.h5")

import pickle

with open("tokenizer.pickle","wb") as handler:
    pickle.dump(tokenizer,handler,protocol=pickle.HIGHEST_PROTOCOL)

