# Global Popularity Recommender
**Bangla Book Recommendation Dataset**

‚ñ∂Ô∏è **Just click `Runtime ‚Üí Run all` to get started!**

In [1]:
# ============================================================
# Install dependencies
# ============================================================
import os
os.system("pip install -q huggingface_hub tqdm")
print("‚úÖ Packages installed!")

# ============================================================
# Download dataset from HuggingFace
# ============================================================
import os, shutil
from huggingface_hub import hf_hub_download

REPO_ID = "DevnilMaster1/Bangla-Book-Recommendation-Dataset"
DATA_FOLDER = "RokomariBG_Dataset"
os.makedirs(DATA_FOLDER, exist_ok=True)

FILES_NEEDED = ['user_to_review.json', 'book_to_review.json']

for filename in FILES_NEEDED:
    dest = os.path.join(DATA_FOLDER, filename)
    if os.path.exists(dest):
        print(f"‚úÖ Already downloaded: {filename}")
    else:
        print(f"‚¨áÔ∏è  Downloading {filename} ...")
        downloaded_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=filename,
            repo_type="dataset",
        )
        shutil.copy(downloaded_path, dest)
        print(f"‚úÖ Saved: {dest}")

print("\nüéâ All files ready!")

‚úÖ Packages installed!
‚¨áÔ∏è  Downloading user_to_review.json ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


user_to_review.json:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

‚úÖ Saved: RokomariBG_Dataset/user_to_review.json
‚¨áÔ∏è  Downloading book_to_review.json ...


book_to_review.json:   0%|          | 0.00/15.0M [00:00<?, ?B/s]

‚úÖ Saved: RokomariBG_Dataset/book_to_review.json

üéâ All files ready!


In [2]:
import json
import gzip
from collections import defaultdict, Counter
from typing import Dict, List, Set
import random
import numpy as np
from tqdm.auto import tqdm

In [3]:
# ---------------------------------------
# Utility Loader
# ---------------------------------------

def load_json(path: str):
    """Supports both plain .json and gzip .json"""
    if path.endswith(".gz"):
        import gzip
        with gzip.open(path, "rt", encoding="utf-8") as f:
            return json.load(f)
    else:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

In [4]:
class GlobalPopularityRecommender():
    """
    Recommends globally most popular books based on review counts.
    """

    def __init__(self, book_to_review_path: str):
        self.book_to_review_path = book_to_review_path
        self.popular_books = []

    def fit(self):
        """
        Computes global book popularity.
        """
        book_to_review = load_json(self.book_to_review_path)

        book_counter = Counter()

        for x in book_to_review:
            book_id = str(x["book_id"])
            book_counter[book_id] += 1

        # Sort by descending popularity
        self.popular_books = [
            book_id for book_id, _ in book_counter.most_common()
        ]

    def recommend(self, user_id: str = None, k: int = 10) -> List[str]:
        """
        Recommends top-K globally popular books.
        User ID is ignored.
        """
        return self.popular_books[:k]


In [5]:
class RankingEvaluator:
    """
    Evaluates ranking-based recommendation metrics.
    Computes: Hit@K, MRR@K, NDCG@K for various K values.
    """

    def __init__(self):
        pass

    def hit_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        """
        Hit@K: 1 if at least one relevant item is in top-K, else 0
        """
        top_k = predictions[:k]
        return 1.0 if any(item in ground_truth for item in top_k) else 0.0

    def mrr_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        """
        MRR@K: Reciprocal rank of first relevant item in top-K
        """
        top_k = predictions[:k]
        for rank, item in enumerate(top_k, start=1):
            if item in ground_truth:
                return 1.0 / rank
        return 0.0

    def dcg_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        """
        DCG@K: Discounted Cumulative Gain
        """
        dcg = 0.0
        top_k = predictions[:k]
        for rank, item in enumerate(top_k, start=1):
            if item in ground_truth:
                dcg += 1.0 / np.log2(rank + 1)
        return dcg

    def idcg_at_k(self, ground_truth: Set[str], k: int) -> float:
        """
        IDCG@K: Ideal DCG (best possible DCG)
        """
        ideal_k = min(len(ground_truth), k)
        idcg = sum(1.0 / np.log2(rank + 1) for rank in range(1, ideal_k + 1))
        return idcg

    def ndcg_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        """
        NDCG@K: Normalized Discounted Cumulative Gain
        """
        dcg = self.dcg_at_k(predictions, ground_truth, k)
        idcg = self.idcg_at_k(ground_truth, k)

        if idcg == 0.0:
            return 0.0

        return dcg / idcg

    def evaluate(self, predictions: Dict[str, List[str]], ground_truth: Dict[str, Set[str]]) -> Dict[str, float]:
        """
        Evaluate predictions against ground truth.

        Args:
            predictions: Dict mapping user_id -> list of recommended item_ids (ranked)
            ground_truth: Dict mapping user_id -> set of relevant item_ids

        Returns:
            Dict of metric_name -> average_score
        """
        metrics = {
            'Hit@5': [],
            'Hit@10': [],
            'Hit@50': [],
            'MRR@10': [],
            'NDCG@10': [],
            'NDCG@50': []
        }

        # Only evaluate users present in both predictions and ground_truth
        common_users = set(predictions.keys()) & set(ground_truth.keys())

        for user_id in common_users:
            preds = predictions[user_id]
            gt = ground_truth[user_id]

            # Skip users with no ground truth items
            if len(gt) == 0:
                continue

            # Compute metrics
            metrics['Hit@5'].append(self.hit_at_k(preds, gt, 5))
            metrics['Hit@10'].append(self.hit_at_k(preds, gt, 10))
            metrics['Hit@50'].append(self.hit_at_k(preds, gt, 50))
            metrics['MRR@10'].append(self.mrr_at_k(preds, gt, 10))
            metrics['NDCG@10'].append(self.ndcg_at_k(preds, gt, 10))
            metrics['NDCG@50'].append(self.ndcg_at_k(preds, gt, 50))

        # Average across all users
        results = {}
        for metric_name, values in metrics.items():
            if len(values) > 0:
                results[metric_name] = np.mean(values)
            else:
                results[metric_name] = 0.0

        return results

In [6]:
def build_user_book_interactions(
    user_to_review_path: str,
    book_to_review_path: str,
) -> Dict[str, List[str]]:
    """
    Builds:
        user_id -> [book_id, book_id, ...]
    """
    user_to_review = load_json(user_to_review_path)
    book_to_review = load_json(book_to_review_path)

    # Normalize IDs to string
    review_to_user = {
        str(x["review_id"]): str(x["user_id"])
        for x in user_to_review
    }

    review_to_book = {
        str(x["review_id"]): str(x["book_id"])
        for x in book_to_review
    }

    user_book = defaultdict(list)

    for rid, user_id in review_to_user.items():
        if rid in review_to_book:
            book_id = review_to_book[rid]
            user_book[user_id].append(book_id)

    return user_book


# -------------------------------------------------
# 2. Per-User 70/15/15 Split
# -------------------------------------------------

def split_user_interactions(
    user_book: Dict[str, List[str]],
    seed: int = 42,
):
    """
    Returns:
        train_user_book
        val_user_book
        test_user_book
    """
    random.seed(seed)

    train = {}
    val = {}
    test = {}

    for user, books in user_book.items():
        books = list(set(books))  # remove duplicates
        random.shuffle(books)

        n = len(books)
        if n < 3:
            train[user] = books
            val[user] = []
            test[user] = []
            continue

        n_train = int(0.7 * n)
        n_val = int(0.15 * n)

        train[user] = books[:n_train]
        val[user] = books[n_train : n_train + n_val]
        test[user] = books[n_train + n_val :]

    return train, val, test


# -------------------------------------------------
# 3. Convert Test Set to Ground Truth Format
# -------------------------------------------------

def build_ground_truth(test_user_book: Dict[str, List[str]]) -> Dict[str, Set[str]]:
    return {
        user: set(books)
        for user, books in test_user_book.items()
        if len(books) >0
    }


# -------------------------------------------------
# 4. Generate Predictions for All Users
# -------------------------------------------------

def generate_predictions(model, users: List[str], k: int):
    predictions = {}

    for u in tqdm(users, desc="Generating Predictions"):
        preds = model.recommend(u, k)
        predictions[u] = preds

    return predictions


def build_train_user_book_rating(
    train_user_book,
    user_to_review_path,
    book_to_review_path,
    review_path,
):
    user_to_review = load_json(user_to_review_path)
    book_to_review = load_json(book_to_review_path)
    reviews = load_json(review_path)

    review_to_user = {str(x["review_id"]): str(x["user_id"]) for x in user_to_review}
    review_to_book = {str(x["review_id"]): str(x["book_id"]) for x in book_to_review}
    review_to_rating = {str(x["review_id"]): float(x["userRating"]) for x in reviews}

    train_ratings = defaultdict(dict)

    for rid, user in review_to_user.items():
        if rid in review_to_book and rid in review_to_rating:
            book = review_to_book[rid]
            rating = review_to_rating[rid]

            if rating > 0 and user in train_user_book and book in train_user_book[user]:
                train_ratings[user][book] = rating

    return train_ratings

In [7]:
def main():
    # -------------------------------
    # Paths
    # -------------------------------
    DATA_FOLDER = "RokomariBG_Dataset/"
    USER_TO_REVIEW = DATA_FOLDER+"user_to_review.json"
    BOOK_TO_REVIEW = DATA_FOLDER+"book_to_review.json"
    BOOK_TO_CATEGORY = DATA_FOLDER+"book_to_category.json"

    K = 10

    # -------------------------------
    # Build user-book interactions
    # -------------------------------
    user_book = build_user_book_interactions(
        USER_TO_REVIEW,
        BOOK_TO_REVIEW,
    )

    print(f"Total users with interactions: {len(user_book)}")

    evaluator = RankingEvaluator()

    # -------------------------------
    # Split 70/15/15
    # -------------------------------
    train_user_book, val_user_book, test_user_book = split_user_interactions(
        user_book
    )

    ground_truth = build_ground_truth(test_user_book)
    test_users = list(ground_truth.keys())

    print(f"Users in test set: {len(test_users)}")

    # =====================================================
    # GLOBAL POPULARITY BASELINE
    # =====================================================

    print("\nTraining Global Popularity Baseline...")

    global_model = GlobalPopularityRecommender(
        book_to_review_path=BOOK_TO_REVIEW
    )
    global_model.fit()

    global_predictions = generate_predictions(
        global_model, test_users, K
    )

    global_metrics = evaluator.evaluate(
        global_predictions, ground_truth
    )

    print("\n===== Global Popularity Results =====")
    for m, v in global_metrics.items():
        print(f"{m}: {v:.4f}")


main()

Total users with interactions: 63721
Users in test set: 15427

Training Global Popularity Baseline...


Generating Predictions:   0%|          | 0/15427 [00:00<?, ?it/s]


===== Global Popularity Results =====
Hit@5: 0.0941
Hit@10: 0.1493
Hit@50: 0.1493
MRR@10: 0.0546
NDCG@10: 0.0551
NDCG@50: 0.0550
