In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense,LSTM,Embedding,Dropout
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard,EarlyStopping
from sklearn.model_selection import train_test_split

In [2]:
!pip install numpy
!pip install pandas
!pip install tensorboard
!pip install scikeras
!pip install nltk


Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting scikit-learn>=1.4.2 (from scikeras)
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, scikeras
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
Successfully installed scikeras-0.13.0 scikit-learn-1.5.1


# **Data Collection**

In [3]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
data = gutenberg.raw('shakespeare-hamlet.txt')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


In [4]:
with open('hamlet.txt', 'w') as f:
    f.write(data)

In [5]:
with open('hamlet.txt','r') as file:
    text = file.read().lower()
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) +1

In [6]:
total_words

4818

In [7]:
## Create input sequences
input_sequences =  []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [8]:
max_length = max([len(x) for x in input_sequences])
print(max_length)
input_sequences = np.array(pad_sequences(input_sequences,maxlen=max_length,padding='pre'))
input_sequences[:10]

14


array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    1,  687],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           1,  687,    4],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,
         687,    4,   45],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    1,  687,
           4,   45,   41],
       [   0,    0,    0,    0,    0,    0,    0,    0,    1,  687,    4,
          45,   41, 1886],
       [   0,    0,    0,    0,    0,    0,    0,    1,  687,    4,   45,
          41, 1886, 1887],
       [   0,    0,    0,    0,    0,    0,    1,  687,    4,   45,   41,
        1886, 1887, 1888],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0, 1180, 1889],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        1180, 1889, 1890],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1180,
        1889, 189

In [9]:
## Creat Prediction and Label
X = input_sequences[:,:-1]
y = input_sequences[:,-1]

In [10]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [12]:
## Train Our LSTM

from tensorflow.keras.models import Sequential


In [13]:
model =  Sequential()
model.add(Embedding(total_words,100,input_length=max_length-1))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words,activation='softmax'))
model.build(input_shape=(None,max_length-1))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()



In [17]:
history = model.fit(X_train,y_train,epochs=100,validation_data=(X_test,y_test),verbose=1)

Epoch 1/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.1246 - loss: 4.7408 - val_accuracy: 0.0628 - val_loss: 7.7299
Epoch 2/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.1268 - loss: 4.6328 - val_accuracy: 0.0631 - val_loss: 7.8621
Epoch 3/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.1338 - loss: 4.4988 - val_accuracy: 0.0620 - val_loss: 7.9628
Epoch 4/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.1419 - loss: 4.3810 - val_accuracy: 0.0635 - val_loss: 8.1012
Epoch 5/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.1541 - loss: 4.2748 - val_accuracy: 0.0612 - val_loss: 8.2839
Epoch 6/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.1605 - loss: 4.1733 - val_accuracy: 0.0581 - val_loss: 8.4035
Epoch 7/100
[1m6

In [23]:
def prediction_model(model,tokenizer,input,max_seq_len):
  tokenizer = tokenizer
  input = input.lower()
  token_list = tokenizer.texts_to_sequences([input])[0]
  token_list = pad_sequences([token_list],maxlen=max_seq_len-1,padding='pre')
  predicted = model.predict(token_list,verbose=0)
  predicted = np.argmax(predicted,axis=1)
  output_word = ''
  for word,index in tokenizer.word_index.items():
    if index == predicted:
      output_word = word
      break
  return output_word

In [28]:
input_text = 'if your age is 70 its mean you are'
max_seq =model.input_shape[1] + 1
next_word = prediction_model(model,tokenizer,input_text,max_length)
print(f'Next Word Prediction: {next_word}')

Next Word Prediction: made


In [29]:
model.save('lstm_model.h5')
import pickle
with open('tokenizer.pickle','wb') as handle:
  pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

