# Recommendation System

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from db.postgres import PostgresDB

In [2]:
# Shared database handle: every query and write in the cells below goes through `pg`.
# It is constructed with no arguments, so connection settings are not visible in this notebook.
pg = PostgresDB()

### Load data from tables

In [3]:
# Book catalogue: one row per book. The column labels below are applied positionally,
# so they must stay in the same order as the SELECT list.
books_columns = [
    "id", "title", "author", "year", "description",
    "reviews_count", "rating", "featured_rating", "sentiment_score",
]
books_rows = pg.fetchall("""
    SELECT id, title, author, year, description, reviews_count, rating, featured_rating, sentiment_score
    FROM library.books
""")
books_df = pd.DataFrame(books_rows, columns=books_columns)

# Force the score/count columns to float so the arithmetic in later cells
# (clip, log1p, weighted sums) operates on a plain numeric dtype.
numeric_cols = ["reviews_count", "rating", "featured_rating", "sentiment_score"]
books_df = books_df.astype({col: float for col in numeric_cols})

books_df.head()

Unnamed: 0,id,title,author,year,description,reviews_count,rating,featured_rating,sentiment_score
0,73f65a70-8825-5401-bcce-2313da0cc6f8,Crime and Punishment,Fyodor Dostoevsky,1866,Fyodor Dostoevsky's Crime and Punishment is on...,521.0,4.25,4.35,4.26
1,e940f1b3-d333-50b7-a2b6-b95603134438,Love in the Time of Cholera,Gabriel García Márquez,1985,"In their youth, Florentino Ariza and Fermina D...",423.0,3.95,3.67,3.63
2,74ec4ed6-0c42-5425-86bf-51af8793a280,Mansfield Park,Jane Austen,1814,Mansfield Park is the study of three families-...,373.0,3.84,3.95,3.73
3,8368087f-946e-5eb0-b630-a614124d550f,Siddhartha,Hermann Hesse,1922,Hermann Hesse wrote Siddhartha after he travel...,398.0,3.95,3.85,3.99
4,f5c32f05-2e3c-5eac-baec-7504adf01a18,The Thirteenth Tale,Diane Setterfield,2006,"When her health begins failing, the mysterious...",915.0,3.99,4.26,4.69


In [4]:
# Explicit user ratings: one row per (user, book) rating record.
# The rating column is kept exactly as the driver returns it (no cast here).
ratings_rows = pg.fetchall("""
    SELECT user_id, book_id, rating
    FROM library.rating
""")
ratings_df = pd.DataFrame(ratings_rows, columns=["user_id", "book_id", "rating"])

ratings_df.head()

Unnamed: 0,user_id,book_id,rating
0,544c5233-704a-44c3-94ce-865c84d8c09e,73f65a70-8825-5401-bcce-2313da0cc6f8,5
1,544c5233-704a-44c3-94ce-865c84d8c09e,e96b3e47-5a70-5a2f-b1cd-bc3ea9fcd26d,3
2,544c5233-704a-44c3-94ce-865c84d8c09e,d1b93763-39f1-53ab-bd43-7465188f77af,5
3,544c5233-704a-44c3-94ce-865c84d8c09e,9929ad23-b431-5c66-9aa4-79fed6b73eb0,3
4,544c5233-704a-44c3-94ce-865c84d8c09e,426177e2-8662-508b-948e-833ab428be75,5


In [5]:
# Book -> genre-name pairs, resolved through the book_genres link table.
# A book appears once per genre it is tagged with.
book_genre_rows = pg.fetchall("""
    SELECT bg.book_id, g.name
    FROM library.book_genres bg
    JOIN library.genres g ON g.id = bg.genre_id
""")
book_genres_df = pd.DataFrame(book_genre_rows, columns=["book_id", "genre"])

book_genres_df.head()

Unnamed: 0,book_id,genre
0,73f65a70-8825-5401-bcce-2313da0cc6f8,General Fiction
1,73f65a70-8825-5401-bcce-2313da0cc6f8,Fiction and Literature
2,73f65a70-8825-5401-bcce-2313da0cc6f8,Mystery
3,e940f1b3-d333-50b7-a2b6-b95603134438,General Fiction
4,e940f1b3-d333-50b7-a2b6-b95603134438,Fiction and Literature


### Content-based score

In [6]:
# Collapse each book's genres into a single space-separated string in ONE pass.
# (Filtering book_genres_df inside a per-book lambda rescans the whole genre table
# for every book; grouping once gives the same strings without the quadratic scan.)
# sort=False / agg keep the genres in their original row order within each book.
genres_by_book = book_genres_df.groupby("book_id", sort=False)["genre"].agg(" ".join)

# Books with no genre rows are absent from genres_by_book -> NaN after map -> "".
genre_text = books_df["id"].map(genres_by_book).fillna("")

# Text document per book: title + author + description + genres, missing parts as "".
books_df["text_features"] = (
    books_df["title"].fillna('') + " " +
    books_df["author"].fillna('') + " " +
    books_df["description"].fillna('') + " " +
    genre_text
)

# TF-IDF over the combined text (English stop words removed, vocabulary capped at 5000 terms),
# then pairwise cosine similarity between all books. Row/column i corresponds to books_df row i.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = tfidf.fit_transform(books_df["text_features"])
content_sim = cosine_similarity(tfidf_matrix)

# content_scores[base_id][other_id] -> content similarity between the two books.
book_ids = list(books_df["id"])
content_scores = {
    base_id: dict(zip(book_ids, sim_row))
    for base_id, sim_row in zip(book_ids, content_sim)
}

### User-rating-based score (item-item CF)

In [7]:
# Ratings matrix: rows are users, columns are books, cells are ratings.
# pivot_table uses its default aggregation (mean) if a (user, book) pair occurs more than once;
# pairs with no rating are NaN.
user_item = ratings_df.pivot_table(values="rating", index="user_id", columns="book_id")

# Transpose so each book is a vector over users, treat "not rated" as 0,
# and compare every pair of book vectors with cosine similarity.
book_vectors = np.nan_to_num(user_item.T.fillna(0))
item_item_sim = cosine_similarity(book_vectors)

# item_item_scores[base_id][other_id] -> rating-based similarity between the two books.
# Only books that have at least one rating row appear as keys.
item_item_scores = {}
for row_idx, book_id in enumerate(user_item.columns):
    item_item_scores[book_id] = dict(zip(user_item.columns, item_item_sim[row_idx]))

### Quality-based score

In [8]:
# Prior for the shrinkage estimate: mean featured rating over all books (NaNs skipped).
global_avg = books_df["featured_rating"].mean(skipna=True)
k = 20  # prior strength: added to the capped review count in both weight denominators

# Review counts enter the weights capped at 100, so the weight on a book's own
# rating never exceeds 100 / (100 + k).
capped_reviews = books_df["reviews_count"].clip(upper=100)
own_weight = capped_reviews / (capped_reviews + k)
prior_weight = k / (capped_reviews + k)

# Shrunk rating: weighted mix of the book's featured rating (global mean when missing)
# and the global mean itself.
own_rating = books_df["featured_rating"].fillna(global_avg)
books_df["featured_bayes"] = own_weight * own_rating + prior_weight * global_avg

# Quality = shrunk rating + half the sentiment score (missing -> 0)
#         + half of log(1 + review count) (missing count -> 0).
sentiment_term = 0.5 * books_df["sentiment_score"].fillna(0)
volume_term = 0.5 * np.log1p(books_df["reviews_count"].fillna(0))
books_df["quality_score"] = books_df["featured_bayes"].fillna(global_avg) + sentiment_term + volume_term

# quality_scores[book_id] -> quality score, for dictionary lookup during blending.
quality_scores = dict(zip(books_df["id"], books_df["quality_score"]))

### Blend scores & select top-5

In [9]:
# Blend weights for the three signals and the number of recommendations kept per book.
W_CONTENT, W_USER, W_QUALITY = 0.4, 0.4, 0.2
TOP_N = 5

# Put quality on the same [0, 1] scale as the similarity signals before weighting.
# The raw quality score is a sum of a rating, half a sentiment score and half a log review
# count, so it is several units large, while both similarities are cosine values
# (TF-IDF vectors are non-negative; ratings are presumably non-negative too - confirm).
# Left unscaled, the 0.2-weighted quality term outweighs both 0.4-weighted similarity
# terms and every base book gets nearly the same recommendations.
quality_series = pd.Series(quality_scores, dtype=float)
quality_min = quality_series.min()
quality_span = quality_series.max() - quality_min
if quality_span > 0:
    # Books whose quality is NaN fall back to 0 so they cannot poison the sort.
    quality_norm = ((quality_series - quality_min) / quality_span).fillna(0.0).to_dict()
else:
    # Empty catalogue, or all books equal in quality: the term carries no information.
    quality_norm = dict.fromkeys(quality_scores, 0.0)

# Rows appended here: (base_book_id, rec_book_id, rank, score, reason).
top5_recs = []
for base_id in books_df["id"]:
    # Look the base book's similarity rows up once instead of once per candidate.
    content_row = content_scores.get(base_id, {})
    user_row = item_item_scores.get(base_id, {})

    # Every book known to either similarity source, except the base book itself.
    candidates = (content_row.keys() | user_row.keys()) - {base_id}

    blended = [
        (
            cand_id,
            W_CONTENT * content_row.get(cand_id, 0)
            + W_USER * user_row.get(cand_id, 0)
            + W_QUALITY * quality_norm.get(cand_id, 0),
        )
        for cand_id in candidates
    ]

    # Highest score first; ties are broken by id so the result does not depend on
    # set iteration order (which varies between kernel sessions).
    top5 = sorted(blended, key=lambda pair: (-pair[1], str(pair[0])))[:TOP_N]
    for rank, (rec_id, score) in enumerate(top5, start=1):
        top5_recs.append((base_id, rec_id, rank, float(score), "content+user+quality"))

### Insert into book_recs

In [10]:
# Replace the contents of library.book_recs with the freshly computed recommendations.
# Guard first: with nothing to insert, the TRUNCATE below would simply empty the table.
if not top5_recs:
    raise ValueError("top5_recs is empty; refusing to truncate library.book_recs")

# NOTE(review): TRUNCATE and the bulk insert are two separate calls. Whether they share a
# transaction depends on PostgresDB, which is not visible here - confirm that a failed
# insert cannot leave the table empty.
pg.execute("TRUNCATE library.book_recs")

# One row per (base book, recommended book): rank is 1-based, reason is a fixed label.
pg.executemany("""
    INSERT INTO library.book_recs (base_book_id, rec_book_id, rank, score, reason)
    VALUES (%s, %s, %s, %s, %s)
""", top5_recs)

print(f"Inserted {len(top5_recs)} recommendations into book_recs.")

Inserted 1140 recommendations into book_recs.


---