In [6]:
# SqlAlchemy and database connection
from sqlalchemy import create_engine

# Standard libraries
import os
import random

# Load environment variables
from dotenv import load_dotenv

# Data manipulation
import pandas as pd
import numpy as np

# Machine learning and clustering
from sklearn.cluster import KMeans

# NLP and embeddings
from sentence_transformers import SentenceTransformer
from faker import Faker

# Deep learning (TensorFlow / Keras)
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [7]:
load_dotenv()

True

In [8]:
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_name = os.getenv('DB_NAME')

In [9]:
# Create connection string
connection_string = f"mysql+pymysql://{db_user}:{db_password}@{db_host}:3306/{db_name}"

# Create engine
engine = create_engine(connection_string)

In [25]:
table_name = "books"
books = pd.read_sql(f"SELECT * FROM {table_name}" , con=engine)

table_name = "users"
users = pd.read_sql(f"SELECT * FROM {table_name}" , con=engine)

table_name = "interactions"
interactions = pd.read_sql(f"SELECT * FROM {table_name}", con=engine)

In [27]:
books.head()

Unnamed: 0,bookId,title,author,coverImg,language,genres,rating
0,1,The Hunger Games,Suzanne Collins,https://i.gr-assets.com/images/S/compressed.ph...,English,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",4.33
1,2,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPrÃ© (Illustrator)",https://i.gr-assets.com/images/S/compressed.ph...,English,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",4.5
2,3,To Kill a Mockingbird,Harper Lee,https://i.gr-assets.com/images/S/compressed.ph...,English,"['Classics', 'Fiction', 'Historical Fiction', ...",4.28
3,4,Pride and Prejudice,"Jane Austen, Anna Quindlen (Introduction)",https://i.gr-assets.com/images/S/compressed.ph...,English,"['Classics', 'Fiction', 'Romance', 'Historical...",4.26
4,5,Twilight,Stephenie Meyer,https://i.gr-assets.com/images/S/compressed.ph...,English,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",3.6


In [28]:
books.shape

(2250, 7)

In [29]:
books['genres'].nunique()

2155

In [30]:
books = books[['bookId', 'title', 'coverImg', 'genres', 'author']]

In [32]:
books.head()

Unnamed: 0,bookId,title,coverImg,genres,author
0,1,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",Suzanne Collins
1,2,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","J.K. Rowling, Mary GrandPrÃ© (Illustrator)"
2,3,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,"['Classics', 'Fiction', 'Historical Fiction', ...",Harper Lee
3,4,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,"['Classics', 'Fiction', 'Romance', 'Historical...","Jane Austen, Anna Quindlen (Introduction)"
4,5,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",Stephenie Meyer


In [33]:
duplicates = books[books.duplicated(keep=False)]

In [35]:
books = books.drop_duplicates()

In [39]:
# -------------------------------
# 4. Clustering using BERT embeddings on book genres
# -------------------------------
model_bert = SentenceTransformer('all-MiniLM-L6-v2')
genre_embeddings = model_bert.encode(books['genres'].astype(str).tolist(), convert_to_tensor=True)

num_clusters = len(books['genres'].unique())
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
books['cluster'] = kmeans.fit_predict(genre_embeddings.cpu().detach().numpy())

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [44]:
# -------------------------------
# 5. Neural Collaborative Filtering (NCF) Model
# -------------------------------

# Positive interactions
pos_interactions = interactions[['user_id', 'book_id']].copy()
pos_interactions['label'] = 1

# Generate negative samples
neg_samples = []
all_book_ids = books['bookId'].values
for _, row in pos_interactions.iterrows():
    user = row['user_id']
    book = np.random.choice(all_book_ids)
    while ((interactions['user_id'] == user) & (interactions['book_id'] == book)).any():
        book = np.random.choice(all_book_ids)
    neg_samples.append([user, book, 0])

neg_interactions = pd.DataFrame(neg_samples, columns=['user_id', 'book_id', 'label'])

In [45]:
# Combine positive and negative samples
train_data = pd.concat([pos_interactions, neg_interactions], ignore_index=True)

In [46]:
# Define maximum user and book ids for embedding layers
num_users = users['user_id'].nunique()
num_books = books['bookId'].max() # Change this line to get max book_id
embedding_size = 64

In [47]:
# Build the NCF model using Keras
# Define inputs
user_input = Input(shape=(1,), name='user_input')
book_input = Input(shape=(1,), name='book_input')

In [48]:
# Embedding layers for users and books
user_embedding = Embedding(input_dim=num_users + 1, output_dim=embedding_size, name='user_embedding')(user_input)
user_vec = Flatten()(user_embedding)

book_embedding = Embedding(input_dim=num_books + 1, output_dim=embedding_size, name='book_embedding')(book_input)
book_vec = Flatten()(book_embedding)

# Concatenate the user and book latent vectors
concat = Concatenate()([user_vec, book_vec])

In [49]:
# MLP layers
dense = Dense(128, activation='relu')(concat)
dense = Dropout(0.2)(dense)
dense = Dense(64, activation='relu')(dense)
dense = Dropout(0.2)(dense)
dense = Dense(32, activation='relu')(dense)

In [50]:
# Output layer predicts the interaction probability
output = Dense(1, activation='sigmoid')(dense)

In [51]:
# Compile the model
ncf_model = Model(inputs=[user_input, book_input], outputs=output)
ncf_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
ncf_model.summary()

In [52]:
# Prepare training arrays
X_user = train_data['user_id'].values
X_book = train_data['book_id'].values
y = train_data['label'].values

In [53]:
# Train the NCF model (adjust epochs and batch size as needed)
ncf_model.fit([X_user, X_book], y, epochs=10, batch_size=256, validation_split=0.2)

Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.5947 - loss: 0.6801 - val_accuracy: 0.0000e+00 - val_loss: 0.9681
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.6219 - loss: 0.6533 - val_accuracy: 0.0000e+00 - val_loss: 1.0827
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6305 - loss: 0.6061 - val_accuracy: 0.0569 - val_loss: 1.0837
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6938 - loss: 0.5614 - val_accuracy: 0.2397 - val_loss: 1.2703
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8062 - loss: 0.4461 - val_accuracy: 0.2019 - val_loss: 2.4818
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9434 - loss: 0.1818 - val_accuracy: 0.2397 - val_loss: 3.9694
Epoch 7/10
[1m50/50[0m 

<keras.src.callbacks.history.History at 0x15339611b50>

In [54]:
ncf_model.save('ncf_model.h5')



In [55]:
# -------------------------------
# 6. Book Recommendation Function
# -------------------------------
def recommend_books(user_id, top_n=10):
    if user_id not in users['user_id'].values:
        print(f"User {user_id} not found in dataset. Showing random books.")
        return books.sample(top_n)[['title', 'genres']]

    user_pref = users.loc[users['user_id'] == user_id, 'preferred_genre'].values[0]

    if user_id not in interactions['user_id'].values:
        print(f"New user detected. Recommending based on preferred genre.")
        recommended_books = books[books['genres'].str.contains(user_pref, case=False, na=False)][['title', 'genres']]
        if recommended_books.empty:
            recommended_books = books.sample(top_n)[['title', 'genres']]
        return recommended_books.head(top_n)

    print("Existing user detected. Recommending based on interactions.")

    # Change 'book_id' to 'bookId' to match the column name in the books DataFrame
    candidate_books = books['bookId'].values
    user_array = np.full(len(candidate_books), user_id)
    scores = ncf_model.predict([user_array, candidate_books]).flatten()

    top_indices = scores.argsort()[-top_n:][::-1]
    recommended_book_ids = candidate_books[top_indices]

    cluster_filtered = books[books['genres'].str.contains(user_pref.split()[0], case=False, na=False)]
    if not cluster_filtered.empty:
        cluster_ids = set(cluster_filtered['bookId']) # Change 'book_id' to 'bookId'
        recommended_book_ids = [bid for bid in recommended_book_ids if bid in cluster_ids] or list(candidate_books[top_indices])

    recommended_books = books[books['bookId'].isin(recommended_book_ids)][['title', 'genres']] # Change 'book_id' to 'bookId'
    return recommended_books.head(top_n)

In [65]:
# Example Usage:
user_id_sample = random.randint(1, users['user_id'].max())
print(f"Top {10} books recommended for User {user_id_sample}:")
recommend_books(user_id_sample)

Top 10 books recommended for User 3339:
Existing user detected. Recommending based on interactions.
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


Unnamed: 0,title,genres
46,A Game of Thrones,"['Fantasy', 'Fiction', 'Epic Fantasy', 'Adult'..."
59,The Kite Runner,"['Fiction', 'Historical Fiction', 'Contemporar..."
722,The Jungle Books,"['Classics', 'Fiction', 'Childrens', 'Fantasy'..."
730,Geek Love,"['Fiction', 'Horror', 'Fantasy', 'Contemporary..."
743,To All the Boys I've Loved Before,"['Young Adult', 'Romance', 'Contemporary', 'Fi..."
821,Holy Bible: New International Version,"['Religion', 'Christian', 'Nonfiction', 'Class..."
1853,The Game of Kings,"['Historical Fiction', 'Fiction', 'Historical'..."
1908,Wildwood Dancing,"['Fantasy', 'Young Adult', 'Romance', 'Fairy T..."
1911,Memories of Ice,"['Fantasy', 'Epic Fantasy', 'Fiction', 'High F..."
2246,Graphic Designing Grade 10,['Graphic Designing']


In [None]:
# df = users[users['user_id'] == 1352]