In [11]:
# Predict the next word using model LSTM with quote dataset

import pandas as pd
import numpy as np
import tensorflow 
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [12]:
# import the Quote dataset from csv file

df = pd.read_csv('Quotes Dataset.csv')

In [13]:
# now remove the insignificant columns required to predicting quotes input

df.drop(['Author','Number'], axis=1, inplace=True)

In [14]:
# store into array of quotes using np

doc = np.array(df['Quote'])

In [15]:
# tokenize all the words inside doc 

tokenizer = Tokenizer()
tokenizer.fit_on_texts(doc)
sequences = tokenizer.texts_to_sequences(doc)
print(sequences)

[[2, 23, 46, 22, 10, 3, 115, 8, 115, 116], [2, 144, 106, 145, 1, 146], [3, 9, 26, 104, 6, 105, 20, 8, 117, 118, 3, 119, 1, 107, 27, 8, 2, 120, 121], [25, 8, 14, 147, 148, 8, 14, 149, 4, 8, 2, 150, 3, 151, 20, 152], [2, 23, 108, 3, 19, 122, 28, 8, 3, 109, 21, 1, 19], [24, 8, 21, 123, 153, 34, 154, 47, 155, 156], [1, 48, 49, 5, 2, 50, 1, 29, 51], [1, 23, 157, 124, 12, 30, 1, 19, 4, 158, 124, 8, 159], [35, 1, 7, 11, 34, 52, 53], [31, 32, 160, 11, 1, 31, 32, 105], [104, 161, 162, 13, 7, 163, 110, 15, 36, 164, 125, 24, 4, 165, 37], [2, 111, 108, 166, 8, 16, 167], [29, 168, 126, 38, 169, 170, 126, 4, 171], [9, 2, 31, 20, 1, 172, 3, 173, 104, 2, 105], [174, 8, 175, 127, 8, 6, 176, 128, 8, 6, 177, 178, 8, 54, 22, 179, 4, 2, 129], [13, 10, 14, 55, 36, 56, 57, 39, 58, 59, 20, 60, 28], [112, 8, 14, 107, 180, 181, 4, 182, 183, 32, 184, 185], [2, 23, 186, 3, 113, 187, 5, 127, 106, 9, 113, 188, 5, 128], [14, 15, 20, 8, 130, 7, 9, 131, 12, 114, 7, 9, 131, 40, 4, 8, 130], [2, 61, 5, 6, 62, 63, 64, 41,

In [16]:
# make dataset like supervised learning input and putput
dataset = []
for i in sequences:
  for j in range(1, len(i)):
    dataset.append(i[:j+1])
print(dataset)

max_len = max([len(x) for x in dataset])
print(max_len)

[[2, 23], [2, 23, 46], [2, 23, 46, 22], [2, 23, 46, 22, 10], [2, 23, 46, 22, 10, 3], [2, 23, 46, 22, 10, 3, 115], [2, 23, 46, 22, 10, 3, 115, 8], [2, 23, 46, 22, 10, 3, 115, 8, 115], [2, 23, 46, 22, 10, 3, 115, 8, 115, 116], [2, 144], [2, 144, 106], [2, 144, 106, 145], [2, 144, 106, 145, 1], [2, 144, 106, 145, 1, 146], [3, 9], [3, 9, 26], [3, 9, 26, 104], [3, 9, 26, 104, 6], [3, 9, 26, 104, 6, 105], [3, 9, 26, 104, 6, 105, 20], [3, 9, 26, 104, 6, 105, 20, 8], [3, 9, 26, 104, 6, 105, 20, 8, 117], [3, 9, 26, 104, 6, 105, 20, 8, 117, 118], [3, 9, 26, 104, 6, 105, 20, 8, 117, 118, 3], [3, 9, 26, 104, 6, 105, 20, 8, 117, 118, 3, 119], [3, 9, 26, 104, 6, 105, 20, 8, 117, 118, 3, 119, 1], [3, 9, 26, 104, 6, 105, 20, 8, 117, 118, 3, 119, 1, 107], [3, 9, 26, 104, 6, 105, 20, 8, 117, 118, 3, 119, 1, 107, 27], [3, 9, 26, 104, 6, 105, 20, 8, 117, 118, 3, 119, 1, 107, 27, 8], [3, 9, 26, 104, 6, 105, 20, 8, 117, 118, 3, 119, 1, 107, 27, 8, 2], [3, 9, 26, 104, 6, 105, 20, 8, 117, 118, 3, 119, 1, 107,

In [17]:
# Zero padding to all set of different vecotr size inputs 
dataset = np.array(pad_sequences(dataset, maxlen= max_len, padding='pre'))
print(dataset)

[[ 0  0  0 ...  0  2 23]
 [ 0  0  0 ...  2 23 46]
 [ 0  0  0 ... 23 46 22]
 ...
 [ 0  0  0 ... 67 45 40]
 [ 0  0  0 ... 45 40 38]
 [ 0  0  0 ... 40 38 68]]


In [18]:
input = dataset[:, :-1]
output = dataset[:, -1]
output.shape

(7707,)

In [19]:
# one hot encoding of output data
from keras.utils import to_categorical

In [20]:
output = to_categorical(output, num_classes=len(tokenizer.word_index)+1)
print(output.shape)
print(output[0])

(7707, 250)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [21]:
# Make architecture of 3 layer Embedding , LSTM and Dense 

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_len-1))
model.add(LSTM(150))
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'))


2024-09-12 15:36:41.801978: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [22]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
model.summary()

In [24]:
print(input)
print(input.shape)

[[ 0  0  0 ...  0  0  2]
 [ 0  0  0 ...  0  2 23]
 [ 0  0  0 ...  2 23 46]
 ...
 [ 0  0  0 ... 16 67 45]
 [ 0  0  0 ... 67 45 40]
 [ 0  0  0 ... 45 40 38]]
(7707, 23)


In [25]:
model.fit(input, output, epochs=100)

Epoch 1/100
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.0905 - loss: 4.5718
Epoch 2/100
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.6781 - loss: 1.7992
Epoch 3/100
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9175 - loss: 0.6337
Epoch 4/100
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9334 - loss: 0.3759
Epoch 5/100
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9396 - loss: 0.3048
Epoch 6/100
[1

<keras.src.callbacks.history.History at 0x7f9f1fd9ad70>

In [36]:
text = "I like"

# tokenize and padding
token_text = tokenizer.texts_to_sequences([text])
print(token_text)
token_text = pad_sequences(token_text, maxlen=max_len-1, padding='pre')
print(token_text)

[[13]]
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 13]]


In [37]:
pred = model.predict(token_text)
pred.shape

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step


(1, 250)

In [38]:
word_pos  = np.argmax(pred)

for word, index in tokenizer.word_index.items():
  if index == word_pos:
    print(word)

can't


In [39]:
for i in range(20):
  token_text = tokenizer.texts_to_sequences([text])
  token_text = pad_sequences(token_text, maxlen=max_len-1, padding='pre')
  pred = model.predict(token_text)
  word_pos  = np.argmax(pred)

  for word, index in tokenizer.word_index.items():
    if index == word_pos:
      text = text + " " + word
      print(word)
      break
  print(text)

  

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
can't
I like can't
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
change
I like can't change
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
the
I like can't change the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
direction
I like can't change the direction
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
of
I like can't change the direction of
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
the
I like can't change the direction of the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
wind
I like can't change the direc