SASRec and BERT Hybrid Recommender (Goodreads)

In [None]:

# Mount Google Drive so files under /content/drive (where the dataset path
# used below lives) are readable. This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')


# IPython shell escape (not plain Python): install the third-party packages
# that the imports below rely on.
!pip install sentence-transformers tqdm

import torch
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

file_path = '/content/drive/MyDrive/goodreads_interactions_comics_graphic.json'

# Stream the JSON-lines file one record at a time instead of calling
# readlines() first: that avoids keeping every raw line in memory next to the
# parsed records. Whitespace-only lines are skipped because json.loads would
# raise on them. The encoding is explicit so the result does not depend on the
# platform default.
with open(file_path, 'r', encoding='utf-8') as f:
    data = [
        json.loads(line)
        for line in tqdm(f, desc="Loading JSONL")
        if line.strip()
    ]
df = pd.DataFrame(data)

# Keep only interactions that carry review text. Missing values are removed
# explicitly: NaN != "" evaluates to True, so they would otherwise survive the
# filter and make the " ".join below fail.
review_text = df['review_text_incomplete']
df = df[review_text.notna() & (review_text.str.strip() != "")]

# Aggregate Reviews per Book: one row per book_id holding all of its review
# texts joined by single spaces.
# NOTE(review): sentence-transformers models truncate inputs longer than the
# model's max_seq_length, so for heavily reviewed books probably only the
# leading part of the joined text is embedded - confirm if that matters.
book_reviews = (
    df.groupby("book_id")["review_text_incomplete"]
    .apply(" ".join)
    .reset_index()
)

# Generate BERT Embeddings (on GPU when one is available).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer('all-mpnet-base-v2', device=device)

print("Generating book embeddings...")
book_embeddings = model.encode(
    book_reviews['review_text_incomplete'].tolist(),
    batch_size=64,
    convert_to_tensor=True,
    show_progress_bar=True,
    device=device
)

# Move the embeddings to the CPU; row i of book_embeddings belongs to row i of
# book_reviews, and book_index maps a book_id to that shared row position.
book_embeddings = book_embeddings.cpu()
book_reviews['embedding'] = list(book_embeddings)
book_index = {book_id: idx for idx, book_id in enumerate(book_reviews['book_id'])}

# Cosine Similarity Recommendation
def recommend_books_from_user_profile(user_profile, top_k=5, exclude_ids=None):
    """Return the ids of the books most cosine-similar to a user profile.

    Relies on the module-level ``book_embeddings`` (one row per book),
    ``book_reviews`` (its ``book_id`` column, in the same row order) and
    ``book_index`` (book_id -> row position).

    Args:
        user_profile: Tensor compared against every row of ``book_embeddings``
            (the evaluation loop passes a single-row 2-D tensor).
        top_k: Maximum number of book ids to return.
        exclude_ids: Optional iterable of book ids that must never be
            recommended, e.g. books the user already interacted with. Ids
            missing from ``book_index`` are ignored.

    Returns:
        A list of at most ``top_k`` book ids ordered by decreasing similarity.
        Fewer ids are returned when fewer non-excluded books are available.
    """
    sim_scores = torch.nn.functional.cosine_similarity(user_profile, book_embeddings)

    # Resolve the excluded ids to row positions through book_index rather
    # than scanning every book on each call.
    masked_rows = {
        book_index[book_id]
        for book_id in set(exclude_ids or ())
        if book_id in book_index
    }
    if masked_rows:
        row_tensor = torch.tensor(
            sorted(masked_rows), dtype=torch.long, device=sim_scores.device
        )
        # -inf rather than -1: a cosine similarity can legitimately be -1.
        sim_scores[row_tensor] = float("-inf")

    # torch.topk raises when k exceeds the number of elements, and excluded
    # rows must never be returned, so cap k at the number of candidates.
    n_candidates = sim_scores.numel() - len(masked_rows)
    k = min(top_k, n_candidates)
    if k <= 0:
        return []

    top_rows = torch.topk(sim_scores, k).indices.tolist()
    book_ids = book_reviews['book_id']
    return [book_ids.iloc[row] for row in top_rows]

# Evaluation Metrics
def precision_at_k(recommended, relevant, k):
    """Return the share of the k recommendation slots filled by relevant items.

    Only the first ``k`` entries of ``recommended`` are considered, duplicates
    are counted once, and the denominator is always ``k``.
    """
    top_items = set(recommended[:k])
    matched = top_items.intersection(relevant)
    return len(matched) / k

def recall_at_k(recommended, relevant, k):
    """Return the share of relevant items found in the first k recommendations.

    Yields 0 when ``relevant`` is empty.
    """
    if not relevant:
        return 0
    top_items = set(recommended[:k])
    matched = top_items.intersection(relevant)
    return len(matched) / len(relevant)

def ndcg_at_k(recommended, relevant, k):
    """Return the normalised discounted cumulative gain over the first k items.

    Each relevant item at zero-based position ``i`` contributes
    ``1 / log2(i + 2)``. The total is divided by the gain of an ideal ranking
    containing ``min(len(relevant), k)`` hits; the result is 0 when that ideal
    gain is not positive.
    """
    dcg = 0
    for position, item in enumerate(recommended[:k]):
        if item in relevant:
            dcg += 1 / np.log2(position + 2)

    ideal_hits = min(len(relevant), k)
    idcg = sum(1 / np.log2(position + 2) for position in range(ideal_hits))

    if idcg > 0:
        return dcg / idcg
    return 0

def map_at_k(recommended, relevant, k):
    """Return the average precision over the first k recommendations.

    Precision is accumulated at every rank that holds a relevant item and the
    total is divided by ``min(len(relevant), k)``. Yields 0 when ``relevant``
    is empty.
    """
    hit_count = 0
    precision_total = 0
    for rank, candidate in enumerate(recommended[:k], start=1):
        if candidate not in relevant:
            continue
        hit_count += 1
        precision_total += hit_count / rank

    if not relevant:
        return 0
    return precision_total / min(len(relevant), k)

# Evaluation Loop
# Leave-one-out evaluation: for every user with at least two reviewed books,
# the last book of the group is held out as the test item and the remaining
# books form the user profile.
# NOTE(review): "last" means last in file order within the user's group;
# nothing here sorts by timestamp - confirm the file is chronological.
# NOTE(review): book embeddings are built from every review in df, including
# the evaluated user's own review of the held-out book, which can inflate the
# scores.
k = 10
precision_list, recall_list, ndcg_list, map_list = [], [], [], []

print("Evaluating recommender...")
for user_id, group in tqdm(df.groupby("user_id"), desc="Users"):
    if len(group) < 2:
        continue

    books = group['book_id'].tolist()
    test_book = books[-1]
    if test_book not in book_index:
        continue

    # Drop the held-out book from the history: if it also occurs earlier it
    # would be put on the exclusion list and could never be recommended.
    train_books = [b for b in books[:-1] if b in book_index and b != test_book]
    if not train_books:
        continue

    # Index the embedding matrix directly instead of materialising one
    # DataFrame row per book.
    train_rows = [book_index[b] for b in train_books]
    train_embs = book_embeddings[train_rows]
    # The profile is the element-wise median of the history embeddings, kept
    # as a single-row 2-D tensor.
    user_profile = train_embs.median(dim=0).values.unsqueeze(0)

    recommended = recommend_books_from_user_profile(user_profile, top_k=k, exclude_ids=train_books)
    relevant = [test_book]

    precision_list.append(precision_at_k(recommended, relevant, k))
    recall_list.append(recall_at_k(recommended, relevant, k))
    ndcg_list.append(ndcg_at_k(recommended, relevant, k))
    map_list.append(map_at_k(recommended, relevant, k))

# Final Metrics Report
print(f"\nEvaluation Results (k={k}):")
if precision_list:
    print(f"Precision@{k}: {np.mean(precision_list):.4f}")
    print(f"Recall@{k}:    {np.mean(recall_list):.4f}")
    print(f"NDCG@{k}:      {np.mean(ndcg_list):.4f}")
    print(f"MAP@{k}:       {np.mean(map_list):.4f}")
else:
    # np.mean of an empty list yields nan plus a RuntimeWarning; report the
    # situation explicitly instead.
    print("No users with enough usable interactions to evaluate.")


Evaluation Results (k=10):

Precision@10: 0.0050

Recall@10:    0.0499

NDCG@10:      0.0299

MAP@10:       0.0238