In [139]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [140]:
df = pd.read_csv("quotes_dataset.csv")

In [141]:
df.head()

Unnamed: 0,Number,Quote,Author
0,1,The only thing we have to fear is fear itself.,Franklin D. Roosevelt
1,2,The truth will set you free.,The Bible
2,3,To be yourself in a world that is constantly t...,Ralph Waldo Emerson
3,4,"Success is not final, failure is not fatal: It...",Winston S. Churchill
4,5,The only way to do great work is to love what ...,Steve Jobs


In [142]:
quotes = df['Quote']

In [143]:
quotes.head()

0       The only thing we have to fear is fear itself.
1                         The truth will set you free.
2    To be yourself in a world that is constantly t...
3    Success is not final, failure is not fatal: It...
4    The only way to do great work is to love what ...
Name: Quote, dtype: object

In [144]:
quotes.str.lower()

0         the only thing we have to fear is fear itself.
1                           the truth will set you free.
2      to be yourself in a world that is constantly t...
3      success is not final, failure is not fatal: it...
4      the only way to do great work is to love what ...
                             ...                        
720            believe you can and you're halfway there.
721    the mind is everything. what you think you bec...
722    i have not failed. i've just found 10,000 ways...
723    a journey of a thousand miles begins with a si...
724          it always seems impossible until it's done.
Name: Quote, Length: 725, dtype: object

In [145]:
import string
translator = str.maketrans('' , '' , string.punctuation)

In [None]:
quotes = quotes.apply(lambda x : x.translate(translator))

In [147]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [148]:
vocab_size = 248
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(quotes)

In [149]:
word_index = tokenizer.word_index
print(len(word_index))

248


In [150]:
sequence = tokenizer.texts_to_sequences(quotes)


In [None]:
X = []
y = []
for seq in sequence:
    for i in range(1, len(seq)):
        X.append(seq[:i])
        y.append(seq[1])



In [152]:
len(X)

7655

In [153]:
max_len = max(len(x) for x in X)
max_len

23

In [154]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_padded = pad_sequences(X , maxlen=max_len , padding='pre')

In [155]:
y = np.array(y)

In [156]:
from tensorflow.keras.utils import to_categorical
y_one_hot = to_categorical(y ,num_classes= vocab_size)

In [157]:
y_one_hot.shape

(7655, 248)

In [158]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding  , SimpleRNN , Dense , LSTM

In [159]:
embedding_dim = 50
rnn_units = 128


In [160]:
rnn_model = Sequential()
rnn_model.add(
    Embedding(input_dim=vocab_size,output_dim=embedding_dim, input_length=max_len)
)
rnn_model.add(
    SimpleRNN(units=rnn_units)
)
rnn_model.add(
    Dense(units=vocab_size , activation='softmax')
)



In [161]:
rnn_model.compile(optimizer='adam' , loss='categorical_crossentropy' , metrics=['accuracy'])

In [162]:
rnn_model.summary()

In [163]:
lstm_model = Sequential()
lstm_model.add(
    Embedding(input_dim=vocab_size,output_dim=embedding_dim, input_length=max_len)
)
lstm_model.add(
    LSTM(units=rnn_units)
)
lstm_model.add(
    Dense(units=vocab_size , activation='softmax')
)

In [164]:
lstm_model.compile(optimizer='adam' , loss='categorical_crossentropy' , metrics=['accuracy'])

In [165]:
lstm_model.summary()

In [166]:
epochs = 50
batch_size = 128


In [167]:
history_rnn = rnn_model.fit(X_padded, y_one_hot , epochs=epochs , batch_size=batch_size , validation_split=0.1)

Epoch 1/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.1671 - loss: 2.8976 - val_accuracy: 0.2298 - val_loss: 2.2103
Epoch 2/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.2635 - loss: 2.2749 - val_accuracy: 0.4700 - val_loss: 2.0175
Epoch 3/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.4706 - loss: 2.7026 - val_accuracy: 0.8407 - val_loss: 1.0040
Epoch 4/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8458 - loss: 0.8042 - val_accuracy: 0.9008 - val_loss: 0.5014
Epoch 5/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9001 - loss: 0.4827 - val_accuracy: 0.9648 - val_loss: 0.2090
Epoch 6/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9350 - loss: 0.3130 - val_accuracy: 0.9804 - val_loss: 0.1267
Epoch 7/50
[1m54/54[0m [32m━━━━

In [168]:
history_lstm = lstm_model.fit(X_padded, y_one_hot , epochs=epochs , batch_size=batch_size , validation_split=0.1)

Epoch 1/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 50ms/step - accuracy: 0.1752 - loss: 3.0601 - val_accuracy: 0.2755 - val_loss: 2.1993
Epoch 2/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.3729 - loss: 2.2021 - val_accuracy: 0.4517 - val_loss: 1.9474
Epoch 3/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 49ms/step - accuracy: 0.5033 - loss: 1.7749 - val_accuracy: 0.5914 - val_loss: 1.2869
Epoch 4/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - accuracy: 0.7574 - loss: 1.1285 - val_accuracy: 0.9008 - val_loss: 0.6861
Epoch 5/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 65ms/step - accuracy: 0.8861 - loss: 0.6758 - val_accuracy: 0.9739 - val_loss: 0.3678
Epoch 6/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 65ms/step - accuracy: 0.9337 - loss: 0.4524 - val_accuracy: 0.9804 - val_loss: 0.2384
Epoch 7/50
[1m54/54[0m [32m━━━━

In [169]:
lstm_model.save('lstm_model.h5')



In [170]:
index_to_word  = {}
for word , index in word_index.items():
    index_to_word[index] = word

In [171]:
def predictor(model , tokenizer , text , max_len):
    text.lower()
    seq = tokenizer.texts_to_sequences(text)[0]
    seq = pad_sequences([seq] , maxlen=max_len , padding='pre')
    pred = model.predict(seq , verbose = 0 )
    pred_index = np.argmax(pred)
    return index_to_word[pred_index]


In [187]:
seed_text = 'we '
next_word = predictor(lstm_model , tokenizer , seed_text , max_len)
next_word

'have'

In [186]:
seed_text = 'It always seems'
next_word = predictor(rnn_model , tokenizer , seed_text , max_len)
next_word

'cant'