# **Setting up the environment**

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
# Seed NumPy first, then TensorFlow, with the same value so that random
# operations start from a fixed state.
for set_seed in (np.random.seed, tf.random.set_seed):
    set_seed(42)

# **Preprocessing**

In [4]:
# Read the scraped book records from the Kaggle input directory.
data = pd.read_csv(filepath_or_buffer="/kaggle/input/book-scraped/myscrapeddata.csv")

In [5]:
# Summarise the raw frame: column names, dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   availability    1000 non-null   int64  
 1   category        1000 non-null   object 
 2   description     998 non-null    object 
 3   num_reviews     1000 non-null   int64  
 4   price           1000 non-null   float64
 5   price_excl_tax  1000 non-null   float64
 6   price_incl_tax  1000 non-null   float64
 7   product_type    1000 non-null   object 
 8   stars           1000 non-null   int64  
 9   tax             1000 non-null   float64
 10  title           1000 non-null   object 
 11  upc             1000 non-null   object 
 12  url             1000 non-null   object 
dtypes: float64(4), int64(3), object(6)
memory usage: 101.7+ KB


In [6]:
# Count rows per category to see how unevenly the labels are distributed.
data['category'].value_counts()

category
default               152
nonfiction            110
sequential art         75
add a comment          67
fiction                65
young adult            54
fantasy                48
romance                35
mystery                32
food and drink         30
childrens              29
historical fiction     26
classics               19
poetry                 19
history                18
horror                 17
womens fiction         17
science fiction        16
science                14
music                  13
business               12
philosophy             11
thriller               11
travel                 11
humor                  10
autobiography           9
art                     8
psychology              7
religion                7
christian fiction       6
spirituality            6
new adult               6
self help               5
sports and games        5
biography               5
health                  4
christian               3
contemporary            3
pol

In [7]:
# Drop categories that have too few books to train on.
#
# The cut-off is expressed as a threshold instead of a hand-copied table of
# counts, so the filter stays correct if the scraped CSV changes. 60 sits
# between the largest category that was dropped before ('young adult', 54 rows)
# and the smallest one that was kept ('fiction', 65 rows), so on the current
# data the same categories are removed.
# NOTE(review): the kept labels 'default' and 'add a comment' look like
# scraping placeholders rather than genres - confirm they should stay as classes.
MIN_BOOKS_PER_CATEGORY = 60

category_counts = data['category'].value_counts()

# {category: row count} for every category below the threshold.
genre_counts = category_counts[category_counts < MIN_BOOKS_PER_CATEGORY].to_dict()
categories_to_drop = list(genre_counts.keys())

# Keep only the rows whose category is not in the drop list.
data = data[~data['category'].isin(categories_to_drop)]

In [8]:
# Preview the first rows that remain after the category filter.
data.head()

Unnamed: 0,availability,category,description,num_reviews,price,price_excl_tax,price_incl_tax,product_type,stars,tax,title,upc,url
4,19,sequential art,Scott Pilgrim's life is totally sweet. He's 23...,0,52.29,52.29,52.29,books,5,0.0,Scott Pilgrim's Precious Little Life (Scott Pi...,3b1c02bac2a429e6,https://books.toscrape.com/catalogue/scott-pil...
6,19,default,For readers of Laura Hillenbrand's Seabiscuit ...,0,22.6,22.6,22.6,books,4,0.0,The Boys in the Boat: Nine Americans and Their...,e10e1e165dc8be4a,https://books.toscrape.com/catalogue/the-boys-...
7,19,default,"""If you have a heart, if you have a soul, Kare...",0,17.93,17.93,17.93,books,3,0.0,The Coming Woman: A Novel Based on the Life of...,e72a5dfc7e9267b2,https://books.toscrape.com/catalogue/the-comin...
14,18,default,This new edition of Maude (1883-1993) has been...,0,18.02,18.02,18.02,books,2,0.0,Maude (1883-1993):She Grew Up with the country,094b269567e1c300,https://books.toscrape.com/catalogue/maude-188...
15,18,default,"Sent to yet another foster family, Penny decid...",0,33.29,33.29,33.29,books,3,0.0,Penny Maybe,668fe56b17cfcd4f,https://books.toscrape.com/catalogue/penny-may...


In [9]:
# Only the label (category) and the text (description) are needed from here on.
data = data.loc[:, ['category', 'description']]

In [10]:
# Confirm that only the category and description columns remain.
data.head()

Unnamed: 0,category,description
4,sequential art,Scott Pilgrim's life is totally sweet. He's 23...
6,default,For readers of Laura Hillenbrand's Seabiscuit ...
7,default,"""If you have a heart, if you have a soul, Kare..."
14,default,This new edition of Maude (1883-1993) has been...
15,default,"Sent to yet another foster family, Penny decid..."


In [11]:
# Check the row count and look for missing values in the two kept columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 469 entries, 4 to 997
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     469 non-null    object
 1   description  468 non-null    object
dtypes: object(2)
memory usage: 11.0+ KB


In [12]:
# Remove every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [13]:
# Verify that no missing values remain after dropna().
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 468 entries, 4 to 997
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     468 non-null    object
 1   description  468 non-null    object
dtypes: object(2)
memory usage: 11.0+ KB


In [14]:
from sklearn.preprocessing import OrdinalEncoder

# Replace each category name with a float code. The fitted encoder is kept in
# `encoder` so the codes can be mapped back to names later.
encoder = OrdinalEncoder()
encoded_category = encoder.fit_transform(data[['category']])
data[['category']] = encoded_category

In [15]:
# X holds the description text, y the encoded category label.
X, y = data['description'], data['category']

In [16]:
# Class distribution of the encoded labels.
y.value_counts()

category
1.0    151
3.0    110
4.0     75
0.0     67
2.0     65
Name: count, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation and train on the remaining 90%.
# `test_size` is the fraction that goes to the validation split; the previous
# value of 0.9 left only about a tenth of the data for training.
# `stratify=y` keeps the class proportions the same in both splits, which
# matters because the categories are imbalanced.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

In [18]:
# Convert both label containers to plain NumPy arrays.
y_train, y_valid = (np.array(labels) for labels in (y_train, y_valid))

In [19]:
# Disabled code: the block below is a bare string literal, not a comment, so
# none of these conversions run. The cell only evaluates the string and echoes
# it as its output.
# NOTE(review): consider deleting this cell if the conversion is no longer needed.
'''X_train = np.array(X_train)
y_train = np.array(y_train)
X_valid = np.array(X_valid)
y_valid = np.array(y_valid)'''

'X_train = np.array(X_train)\ny_train = np.array(y_train)\nX_valid = np.array(X_valid)\ny_valid = np.array(y_valid)'

In [20]:
# Disabled code kept as a bare string literal: the string is evaluated and
# echoed as the cell output, but the statements inside never execute.
# NOTE(review): if enabled, the nested comprehension would iterate over each
# description string in X_train character by character - presumably why it was
# turned off; confirm, or delete the cell.
'''X_train = [item for sublist in X_train for item in sublist]
X_valid = [item for sublist in X_valid for item in sublist]'''

'X_train = [item for sublist in X_train for item in sublist]\nX_valid = [item for sublist in X_valid for item in sublist]'

In [21]:
# Wrap each (text, label) split in a tf.data.Dataset, one element per example.
train_set, valid_set = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((X_train, y_train), (X_valid, y_valid))
)

In [22]:
tf.random.set_seed(42)

# Vocabulary size is capped at `vocab_size` entries.
vocab_size = 2000
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)

# adapt() only needs the text, so strip the label from each (text, label) pair
# of the training set before building the vocabulary.
descriptions_only = train_set.map(lambda text, _label: text)
text_vec_layer.adapt(descriptions_only)

In [23]:
# Show the vocabulary size and a short preview instead of printing every
# token, which would flood the cell output (up to `vocab_size` entries).
vocabulary = text_vec_layer.get_vocabulary()
print("Vocabulary size:", len(vocabulary))
print("Vocabulary (first 20 tokens):", vocabulary[:20])



# **RNNs**

In [24]:
# Length of the vector learned for each vocabulary token.
embedding_dim = 32
# One output unit per encoded category. Taken from the fitted encoder so the
# value cannot drift from the labels if the category filter changes
# (5 on the current data).
num_classes = len(encoder.categories_[0])

In [25]:
tf.random.set_seed(42)
tf.keras.backend.clear_session()

# Baseline classifier: vectorized text -> embedding -> GRU(128) -> softmax
# over the categories.
model = tf.keras.Sequential()
model.add(text_vec_layer)
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(tf.keras.layers.GRU(128))
model.add(tf.keras.layers.Dense(units=num_classes, activation='softmax'))

# The sparse loss is used because the labels are class indices, not one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

history = model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
tf.random.set_seed(42)
tf.keras.backend.clear_session()

# GRU(256) classifier whose embedding uses mask_zero=True, so token id 0 is
# treated as padding and masked for the recurrent layer.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    mask_zero=True,
)
recurrent_layer = tf.keras.layers.GRU(256)
output_layer = tf.keras.layers.Dense(units=num_classes, activation='softmax')

model = tf.keras.Sequential(
    layers=[text_vec_layer, embedding_layer, recurrent_layer, output_layer]
)

# Labels are class indices, hence the sparse variant of the cross-entropy loss.
model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss='sparse_categorical_crossentropy',
)

history = model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# Second vectorizer with the same vocabulary cap, configured with ragged=True
# so it returns ragged output.
text_vec_layer_ragged = tf.keras.layers.TextVectorization(
    ragged=True,
    max_tokens=vocab_size,
)

# Build its vocabulary from the training text only (labels are dropped).
training_text = train_set.map(lambda text, _label: text)
text_vec_layer_ragged.adapt(training_text)

In [28]:
tf.random.set_seed(42)
tf.keras.backend.clear_session()

# GRU(512) classifier fed by the ragged vectorizer.
model = tf.keras.Sequential()
model.add(text_vec_layer_ragged)
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(tf.keras.layers.GRU(512))
model.add(tf.keras.layers.Dense(units=num_classes, activation='softmax'))

# Labels are class indices, hence the sparse variant of the cross-entropy loss.
model.compile(
    metrics=['accuracy'],
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
)

history = model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
tf.random.set_seed(42)
tf.keras.backend.clear_session()

# LSTM(1024) classifier fed by the ragged vectorizer.
model = tf.keras.Sequential([
    text_vec_layer_ragged,
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    tf.keras.layers.LSTM(1024),
    tf.keras.layers.Dense(units=num_classes, activation='softmax')
])

# Labels are class indices, hence the sparse variant of the cross-entropy loss.
model.compile(optimizer='nadam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs and roll back to the best
# weights seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# val_loss is only computed when fit() receives validation data. Without it the
# callback has nothing to monitor and training always runs all 30 epochs.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=30,
    callbacks=[early_stopping],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
