# Category-Aware Popularity Recommender
**Bangla Book Recommendation Dataset**

This notebook automatically downloads the dataset from HuggingFace and runs the Category-Aware Popularity baseline recommender.

‚ñ∂Ô∏è **Just click `Runtime ‚Üí Run all` to get started!**

In [1]:
# ============================================================
# STEP 1: Install dependencies & download dataset from HuggingFace
# ============================================================
import os

print("üì¶ Installing huggingface_hub...")
os.system("pip install -q huggingface_hub")

from huggingface_hub import hf_hub_download

REPO_ID = "DevnilMaster1/Bangla-Book-Recommendation-Dataset"
DATA_FOLDER = "RokomariBG_Dataset"
os.makedirs(DATA_FOLDER, exist_ok=True)

# Files exist as plain .json on HuggingFace (not .gz)
FILES_NEEDED = [
    "user_to_review.json",
    "book_to_review.json",
    "book_to_category.json",
]

for filename in FILES_NEEDED:
    dest = os.path.join(DATA_FOLDER, filename)
    if os.path.exists(dest):
        print(f"‚úÖ Already downloaded: {filename}")
    else:
        print(f"‚¨áÔ∏è  Downloading {filename} ...")
        downloaded_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=filename,
            repo_type="dataset",
        )
        # Copy from HF cache to our data folder
        import shutil
        shutil.copy(downloaded_path, dest)
        print(f"‚úÖ Saved: {dest}")

print("\nüéâ All files ready!")

üì¶ Installing huggingface_hub...
‚¨áÔ∏è  Downloading user_to_review.json ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


user_to_review.json:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

‚úÖ Saved: RokomariBG_Dataset/user_to_review.json
‚¨áÔ∏è  Downloading book_to_review.json ...


book_to_review.json:   0%|          | 0.00/15.0M [00:00<?, ?B/s]

‚úÖ Saved: RokomariBG_Dataset/book_to_review.json
‚¨áÔ∏è  Downloading book_to_category.json ...


book_to_category.json:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

‚úÖ Saved: RokomariBG_Dataset/book_to_category.json

üéâ All files ready!


In [2]:
import json
from collections import defaultdict, Counter
from typing import Dict, List, Set
import random
import numpy as np
from tqdm import tqdm

In [3]:
# ---------------------------------------
# Utility Loader
# Note: HuggingFace files are plain .json (not .gz)
# ---------------------------------------

def load_json(path: str):
    """Load JSON file ‚Äî supports both plain .json and gzip .json.gz"""
    if path.endswith(".gz"):
        import gzip
        with gzip.open(path, "rt", encoding="utf-8") as f:
            return json.load(f)
    else:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

In [4]:
class CategoryAwarePopularityRecommender():
    """
    Recommends popular books within user's preferred categories.
    """

    def __init__(
        self,
        user_to_review_path: str,
        book_to_review_path: str,
        book_to_category_path: str,
    ):
        self.user_to_review_path = user_to_review_path
        self.book_to_review_path = book_to_review_path
        self.book_to_category_path = book_to_category_path

        self.user_category_pref = defaultdict(Counter)
        self.category_book_popularity = defaultdict(list)

    def fit(self):
        user_to_review = load_json(self.user_to_review_path)
        book_to_review = load_json(self.book_to_review_path)
        book_to_category = load_json(self.book_to_category_path)

        review_to_user = {
            str(x["review_id"]): str(x["user_id"])
            for x in user_to_review
        }

        review_to_book = {
            str(x["review_id"]): str(x["book_id"])
            for x in book_to_review
        }

        book_to_categories = defaultdict(set)
        for x in book_to_category:
            book_id = str(x["book_id"])
            category_id = str(x["category_id"])
            book_to_categories[book_id].add(category_id)

        for rid, user_id in review_to_user.items():
            if rid in review_to_book:
                book_id = review_to_book[rid]
                categories = book_to_categories.get(book_id, [])
                for c in categories:
                    self.user_category_pref[user_id][c] += 1

        category_counters = defaultdict(Counter)
        for x in book_to_review:
            rid = str(x["review_id"])
            book_id = str(x["book_id"])
            categories = book_to_categories.get(book_id, [])
            for c in categories:
                category_counters[c][book_id] += 1

        for cat, counter in category_counters.items():
            self.category_book_popularity[cat] = [
                book_id for book_id, _ in counter.most_common()
            ]

    def recommend(self, user_id: str, k: int = 10) -> List[str]:
        user_id = str(user_id)

        if user_id not in self.user_category_pref:
            if not self.category_book_popularity:
                return []
            random_cat = random.choice(list(self.category_book_popularity.keys()))
            return self.category_book_popularity[random_cat][:k]

        preferred_categories = [
            c for c, _ in self.user_category_pref[user_id].most_common()
        ]

        recommendations = []
        seen_books = set()

        for cat in preferred_categories:
            for book_id in self.category_book_popularity.get(cat, []):
                if book_id not in seen_books:
                    recommendations.append(book_id)
                    seen_books.add(book_id)
                if len(recommendations) >= k:
                    return recommendations

        return recommendations

In [5]:
class RankingEvaluator:
    """
    Evaluates ranking-based recommendation metrics.
    Computes: Hit@K, MRR@K, NDCG@K for various K values.
    """

    def hit_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        return 1.0 if any(item in ground_truth for item in predictions[:k]) else 0.0

    def mrr_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        for rank, item in enumerate(predictions[:k], start=1):
            if item in ground_truth:
                return 1.0 / rank
        return 0.0

    def dcg_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        return sum(
            1.0 / np.log2(rank + 1)
            for rank, item in enumerate(predictions[:k], start=1)
            if item in ground_truth
        )

    def idcg_at_k(self, ground_truth: Set[str], k: int) -> float:
        ideal_k = min(len(ground_truth), k)
        return sum(1.0 / np.log2(rank + 1) for rank in range(1, ideal_k + 1))

    def ndcg_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        idcg = self.idcg_at_k(ground_truth, k)
        return self.dcg_at_k(predictions, ground_truth, k) / idcg if idcg > 0 else 0.0

    def evaluate(self, predictions: Dict[str, List[str]], ground_truth: Dict[str, Set[str]]) -> Dict[str, float]:
        metrics = {'Hit@5': [], 'Hit@10': [], 'Hit@50': [], 'MRR@10': [], 'NDCG@10': [], 'NDCG@50': []}
        common_users = set(predictions.keys()) & set(ground_truth.keys())

        for user_id in common_users:
            preds = predictions[user_id]
            gt = ground_truth[user_id]
            if len(gt) == 0:
                continue
            metrics['Hit@5'].append(self.hit_at_k(preds, gt, 5))
            metrics['Hit@10'].append(self.hit_at_k(preds, gt, 10))
            metrics['Hit@50'].append(self.hit_at_k(preds, gt, 50))
            metrics['MRR@10'].append(self.mrr_at_k(preds, gt, 10))
            metrics['NDCG@10'].append(self.ndcg_at_k(preds, gt, 10))
            metrics['NDCG@50'].append(self.ndcg_at_k(preds, gt, 50))

        return {k: np.mean(v) if v else 0.0 for k, v in metrics.items()}

In [6]:
def build_user_book_interactions(user_to_review_path, book_to_review_path):
    user_to_review = load_json(user_to_review_path)
    book_to_review = load_json(book_to_review_path)

    review_to_user = {str(x["review_id"]): str(x["user_id"]) for x in user_to_review}
    review_to_book = {str(x["review_id"]): str(x["book_id"]) for x in book_to_review}

    user_book = defaultdict(list)
    for rid, user_id in review_to_user.items():
        if rid in review_to_book:
            user_book[user_id].append(review_to_book[rid])

    return user_book


def split_user_interactions(user_book, seed=42):
    random.seed(seed)
    train, val, test = {}, {}, {}

    for user, books in user_book.items():
        books = list(set(books))
        random.shuffle(books)
        n = len(books)

        if n < 3:
            train[user], val[user], test[user] = books, [], []
            continue

        n_train = int(0.7 * n)
        n_val = int(0.15 * n)
        train[user] = books[:n_train]
        val[user] = books[n_train: n_train + n_val]
        test[user] = books[n_train + n_val:]

    return train, val, test


def build_ground_truth(test_user_book):
    return {user: set(books) for user, books in test_user_book.items() if books}


def generate_predictions(model, users, k):
    predictions = {}
    for u in tqdm(users, desc="Generating Predictions"):
        predictions[u] = model.recommend(u, k)
    return predictions

In [7]:
def main():
    # -------------------------------
    # Paths  (plain .json from HuggingFace)
    # -------------------------------
    DATA_FOLDER = "RokomariBG_Dataset/"
    USER_TO_REVIEW   = DATA_FOLDER + "user_to_review.json"
    BOOK_TO_REVIEW   = DATA_FOLDER + "book_to_review.json"
    BOOK_TO_CATEGORY = DATA_FOLDER + "book_to_category.json"

    K = 10

    # Build user-book interactions
    user_book = build_user_book_interactions(USER_TO_REVIEW, BOOK_TO_REVIEW)
    print(f"Total users with interactions: {len(user_book)}")

    evaluator = RankingEvaluator()

    # Split 70/15/15
    train_user_book, val_user_book, test_user_book = split_user_interactions(user_book)
    ground_truth = build_ground_truth(test_user_book)
    test_users = list(ground_truth.keys())
    print(f"Users in test set: {len(test_users)}")

    # =====================================================
    # CATEGORY-AWARE POPULARITY BASELINE
    # =====================================================
    print("\nTraining Category-Aware Popularity Baseline...")

    cat_model = CategoryAwarePopularityRecommender(
        user_to_review_path=USER_TO_REVIEW,
        book_to_review_path=BOOK_TO_REVIEW,
        book_to_category_path=BOOK_TO_CATEGORY,
    )
    cat_model.fit()

    cat_predictions = generate_predictions(cat_model, test_users, K)
    cat_metrics = evaluator.evaluate(cat_predictions, ground_truth)

    print("\n===== Category-Aware Popularity Results =====")
    for m, v in cat_metrics.items():
        print(f"{m}: {v:.4f}")


main()

Total users with interactions: 63721
Users in test set: 15427

Training Category-Aware Popularity Baseline...


Generating Predictions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15427/15427 [00:00<00:00, 41644.88it/s]



===== Category-Aware Popularity Results =====
Hit@5: 0.2374
Hit@10: 0.3052
Hit@50: 0.3052
MRR@10: 0.1469
NDCG@10: 0.1392
NDCG@50: 0.1391
