# **Quets Genration using LSTM**

## **Import Dependencies and Loading Data**

##**Drive mount**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Import Libraries**

In [4]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from tensorflow.keras.preprocessing.sequence import pad_sequences

## **Scraped Data from web**

In [5]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive'
}

pages_ = 50
quots_list = []
author_list = []

for page in tqdm(range(1, pages_ + 1)):
    url = f"https://www.azquotes.com/top_quotes.html?p={page}"
    response = requests.get(url, headers=headers)
    bf_soup =  BeautifulSoup(response.content, 'html.parser')
    quots_list.extend(bf_soup.find_all('a',class_="title"))
    author_list.extend(bf_soup.find_all('div',class_="author"))

quots_list = [i.text for i in quots_list]
author_list = [i.text for i in author_list]


100%|██████████| 50/50 [00:23<00:00,  2.16it/s]


## **Create DF of scraped data**

In [7]:
pd.DataFrame({'quots':quots_list,'author':author_list}).to_csv("/content/drive/MyDrive/Colab Notebooks/df_quots.csv")


In [8]:
print("No of quots",len(quots_list),len(author_list))

No of quots 1000 1000


# **Loading Data From Remote Repository**

In [9]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/df_quots.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,quots,author
0,0,The essence of strategy is choosing what not t...,\nMichael Porter\n
1,1,One cannot and must not try to erase the past ...,\nGolda Meir\n
2,2,Patriotism means to stand by the country. It d...,\nTheodore Roosevelt\n
3,3,Death is something inevitable. When a man has ...,\nNelson Mandela\n
4,4,You have to love a nation that celebrates its ...,\nErma Bombeck\n


In [16]:
df.quots[8]


'Never be afraid to raise your voice for honesty and truth and compassion against injustice and lying and greed. If people all over the world...would do this, it would change the earth.'

# **Initialize Tokenizer**

In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.quots)

In [15]:
print("Length of Voc",len(tokenizer.word_index))

Length of Voc 2869


## **Create inputs and thire sequence**

In [18]:
input_sequences = []
for line in df.quots:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [20]:
max([len(x) for x in input_sequences])

98

## **Add Padding**

In [21]:
padding_inputs = pad_sequences(input_sequences, maxlen=98, padding='pre')

## **Splitting data into inputs and target**

In [22]:
x = padding_inputs[:, :-1]
y = padding_inputs[:, -1]

In [23]:
print("Shape of x data: ",x.shape)
print("Shape of y data: ",y.shape)

Shape of x data:  (17802, 97)
Shape of y data:  (17802,)


In [27]:
y[0:5]

array([1287,    5,  846,    4, 1288], dtype=int32)

In [24]:
to_categorical(y,num_classes=len(tokenizer.word_index)+1)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

## **Reshape target in multiclass**

In [30]:
y = to_categorical(y,num_classes=len(tokenizer.word_index)+1)

In [31]:
y[0:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [33]:
len(tokenizer.word_index)+1

2870

# **Initialize Model**

In [35]:
model = Sequential()

model.add(Embedding(len(tokenizer.word_index)+1, 100, input_length=98-1))
model.add(LSTM(500, return_sequences=True))
model.add(LSTM(500))
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'))

In [36]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 97, 100)           287000    
                                                                 
 lstm (LSTM)                 (None, 97, 500)           1202000   
                                                                 
 lstm_1 (LSTM)               (None, 500)               2002000   
                                                                 
 dense (Dense)               (None, 2870)              1437870   
                                                                 
Total params: 4928870 (18.80 MB)
Trainable params: 4928870 (18.80 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [37]:
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [39]:
model.fit(x, y, epochs=50, verbose=1, validation_split=0.2,)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7aa04d7c6200>

In [52]:
import tensorflow as tf
model.save("/content/drive/MyDrive/Colab Notebooks/QuotesGenration/model.keras")

In [53]:
import time

# text = "revenge"
def results(text,words):
  for i in range(int(words)):
    # tokenize
    token_text = tokenizer.texts_to_sequences([text])[0]
    # padding
    padded_token_text = pad_sequences([token_text], maxlen=97, padding='pre')
    # predict
    pos = np.argmax(model.predict(padded_token_text))

    for word,index in tokenizer.word_index.items():
      if index == pos:
        text = text + " " + word
        break

  return text

In [63]:
results("Patriotism",15)



'Patriotism means to stand by the country it does not mean to stand by the president'