In [1]:
import gzip
import json
import math
import random
from collections import Counter, defaultdict
from typing import Dict, List, Optional, Set

import numpy as np
from tqdm import tqdm

In [2]:
# ---------------------------------------
# Utility Loader
# ---------------------------------------

def load_json(path: str):
    """
    Read a gzip-compressed JSON document and return the parsed object.

    Parameters
    ----------
    path : str
        Location of the ``.gz`` file; it is opened in text mode as UTF-8.

    Returns
    -------
    Whatever ``json.load`` produces for the file's content.
    """
    with gzip.open(path, mode="rt", encoding="utf-8") as stream:
        parsed = json.load(stream)
    return parsed

In [3]:
class CategoryAwarePopularityRecommender():
    """
    Recommends popular books within user's preferred categories.

    After ``fit()``:
        - ``user_category_pref[user_id]`` is a Counter of category_id -> number
          of that user's interactions with books in the category.
        - ``category_book_popularity[category_id]`` is a list of book ids,
          most-interacted first.
    """

    def __init__(
        self,
        user_to_review_path: str,
        book_to_review_path: str,
        book_to_category_path: str,
    ):
        """
        Stores the input paths; no file is read until ``fit()`` is called.

        Each path is passed to ``load_json`` and is expected to yield an
        iterable of records with the keys used in ``fit()``
        (``review_id``/``user_id``, ``review_id``/``book_id``,
        ``book_id``/``category_id`` respectively).
        """
        self.user_to_review_path = user_to_review_path
        self.book_to_review_path = book_to_review_path
        self.book_to_category_path = book_to_category_path

        self.user_category_pref = defaultdict(Counter)
        self.category_book_popularity = defaultdict(list)

    def fit(self, train_user_book: Optional[Dict[str, List[str]]] = None):
        """
        Builds:
            1. User -> preferred categories profile
            2. Category -> popular books ranking

        Parameters
        ----------
        train_user_book : dict, optional
            Mapping user_id -> list of book_ids. When given, both the user
            profiles and the popularity ranking are computed from exactly
            these (user, book) pairs and the two review files are not read,
            which lets a caller fit on a training split only.
            When omitted (default), interactions are taken from the review
            files: profiles use one interaction per distinct review id that
            appears in both files, popularity counts every record of the
            book-to-review file.
        """
        # Start from a clean slate: the counters must not accumulate on top
        # of a previous fit() call.
        self.user_category_pref = defaultdict(Counter)
        self.category_book_popularity = defaultdict(list)

        # -----------------------------
        # Build book -> categories map
        # -----------------------------
        categories_by_book = defaultdict(set)
        for row in load_json(self.book_to_category_path):
            categories_by_book[str(row["book_id"])].add(str(row["category_id"]))

        # Sorted lists instead of sets: set iteration order of strings varies
        # with hash randomization, and that order decides tie-breaking in
        # Counter.most_common() and the order of the category keys.
        book_to_categories = {
            book_id: sorted(cats) for book_id, cats in categories_by_book.items()
        }

        # -----------------------------
        # Collect interactions
        # -----------------------------
        if train_user_book is None:
            user_to_review = load_json(self.user_to_review_path)
            book_to_review = load_json(self.book_to_review_path)

            review_to_user = {
                str(x["review_id"]): str(x["user_id"])
                for x in user_to_review
            }
            review_to_book = {
                str(x["review_id"]): str(x["book_id"])
                for x in book_to_review
            }

            user_book_pairs = [
                (user_id, review_to_book[rid])
                for rid, user_id in review_to_user.items()
                if rid in review_to_book
            ]
            popularity_events = [str(x["book_id"]) for x in book_to_review]
        else:
            user_book_pairs = [
                (str(user_id), str(book_id))
                for user_id, books in train_user_book.items()
                for book_id in books
            ]
            popularity_events = [book_id for _, book_id in user_book_pairs]

        # -----------------------------
        # 1. Build user category preferences
        # -----------------------------
        for user_id, book_id in user_book_pairs:
            for c in book_to_categories.get(book_id, ()):
                self.user_category_pref[user_id][c] += 1

        # -----------------------------
        # 2. Build category-specific book popularity
        # -----------------------------
        category_counters = defaultdict(Counter)
        for book_id in popularity_events:
            for c in book_to_categories.get(book_id, ()):
                category_counters[c][book_id] += 1

        # Sort books for each category (most interacted first)
        for cat, counter in category_counters.items():
            self.category_book_popularity[cat] = [
                book_id for book_id, _ in counter.most_common()
            ]

    def recommend(self, user_id: str, k: int = 10) -> List[str]:
        """
        Recommends top-K books from user's preferred categories.
        Falls back to random category if user is unseen.

        Categories are visited from most to least preferred; inside a
        category books are taken in popularity order, skipping books already
        picked from an earlier category. Fewer than ``k`` books are returned
        when the user's categories do not contain enough distinct books.
        Returns an empty list when ``k <= 0`` or when the model has no
        category ranking at all.
        """
        if k <= 0:
            # Nothing requested; without this guard the loop below would
            # append one book before it ever compares against k.
            return []

        user_id = str(user_id)

        if user_id not in self.user_category_pref:
            # Cold user → fallback to random category
            if not self.category_book_popularity:
                return []

            random_cat = random.choice(list(self.category_book_popularity.keys()))
            return self.category_book_popularity[random_cat][:k]

        recommendations = []
        seen_books = set()

        # most_common() yields categories by descending interaction count
        for cat, _ in self.user_category_pref[user_id].most_common():
            for book_id in self.category_book_popularity.get(cat, []):
                if book_id in seen_books:
                    continue

                recommendations.append(book_id)
                seen_books.add(book_id)

                if len(recommendations) == k:
                    return recommendations

        return recommendations

In [4]:
class RankingEvaluator:
    """
    Evaluation class for Top-N Recommendation.

    Supports:
        - Hit@K
        - MRR@K
        - NDCG@K

    Input format:
        predictions: Dict[user_id, List[item_id]]
        ground_truth: Dict[user_id, Set[item_id]]

    Users present in ``ground_truth`` but absent from ``predictions`` are
    skipped: they are not counted as zeros in any average.
    """

    def __init__(self, k: int = 10):
        """
        Parameters
        ----------
        k : int
            Cutoff rank for evaluation (e.g., 5, 10, 20)

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1. A negative cutoff would otherwise
            slice the prediction list from the end and silently produce
            meaningless scores.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    # -------------------------
    # Shared iteration
    # -------------------------
    def _evaluable_users(self, predictions: Dict, ground_truth: Dict, users=None):
        """
        Yields (top-k predictions, ground-truth items) for every user that has
        predictions. ``users`` may supply the iterable of user ids to walk
        (e.g. a progress-bar wrapper); it defaults to ``ground_truth``.
        """
        candidates = ground_truth if users is None else users
        for user in candidates:
            if user not in predictions:
                continue
            yield predictions[user][:self.k], ground_truth[user]

    # -------------------------
    # Hit@K
    # -------------------------
    def hit_at_k(self, predictions: Dict, ground_truth: Dict) -> float:
        """
        Computes Hit@K: 1.0 for a user when at least one of the top-K
        predictions is a ground-truth item, else 0.0.

        Returns
        -------
        float : Average Hit@K over users (0.0 when no user is evaluable)
        """
        hits = [
            1.0 if any(item in gt_items for item in top_k) else 0.0
            for top_k, gt_items in self._evaluable_users(predictions, ground_truth)
        ]

        return float(np.mean(hits)) if hits else 0.0

    # -------------------------
    # MRR@K
    # -------------------------
    def mrr_at_k(self, predictions: Dict, ground_truth: Dict) -> float:
        """
        Computes Mean Reciprocal Rank (MRR@K): per user, 1/rank of the first
        relevant item inside the top-K (rank starts at 1), or 0.0 if none.

        Returns
        -------
        float : Average MRR over users (0.0 when no user is evaluable)
        """
        rr_scores = []

        for top_k, gt_items in self._evaluable_users(predictions, ground_truth):
            rr = next(
                (
                    1.0 / rank
                    for rank, item in enumerate(top_k, start=1)
                    if item in gt_items
                ),
                0.0,
            )
            rr_scores.append(rr)

        return float(np.mean(rr_scores)) if rr_scores else 0.0

    # -------------------------
    # NDCG@K
    # -------------------------
    def ndcg_at_k(self, predictions: Dict, ground_truth: Dict) -> float:
        """
        Computes Normalized Discounted Cumulative Gain (NDCG@K) with binary
        relevance and a log2(position + 1) discount.

        A relevant item earns gain only at its first occurrence in the top-K,
        so a prediction list containing duplicates cannot score above 1.0.

        Returns
        -------
        float : Average NDCG@K over users (0.0 when no user is evaluable)
        """
        ndcg_scores = []

        progress = tqdm(ground_truth, desc="Evaluating")
        for top_k, gt_items in self._evaluable_users(
            predictions, ground_truth, users=progress
        ):
            dcg = 0.0
            credited = set()
            for i, item in enumerate(top_k):
                if item in gt_items and item not in credited:
                    credited.add(item)
                    dcg += 1.0 / math.log2(i + 2)

            # Best case: every one of the first min(|gt|, k) slots is relevant
            ideal_hits = min(len(gt_items), self.k)
            idcg = sum(1.0 / math.log2(i + 2) for i in range(ideal_hits))

            ndcg = dcg / idcg if idcg > 0 else 0.0
            ndcg_scores.append(ndcg)

        return float(np.mean(ndcg_scores)) if ndcg_scores else 0.0

    # -------------------------
    # Combined Evaluation
    # -------------------------
    def evaluate(self, predictions: Dict, ground_truth: Dict) -> Dict[str, float]:
        """
        Computes all metrics together.

        Returns
        -------
        Dict[str, float]
            Keys are "Hit@K", "MRR@K" and "NDCG@K" with K replaced by the
            configured cutoff.
        """
        return {
            f"Hit@{self.k}": self.hit_at_k(predictions, ground_truth),
            f"MRR@{self.k}": self.mrr_at_k(predictions, ground_truth),
            f"NDCG@{self.k}": self.ndcg_at_k(predictions, ground_truth),
        }

In [5]:
def build_user_book_interactions(
    user_to_review_path: str,
    book_to_review_path: str,
) -> Dict[str, List[str]]:
    """
    Joins the two review files on ``review_id`` and groups the result by user.

    Returns a mapping
        user_id -> [book_id, book_id, ...]
    with all ids converted to strings. Reviews that have no matching record
    in the book-to-review file are dropped; a book reviewed several times by
    the same user appears several times in that user's list.
    """
    user_rows = load_json(user_to_review_path)
    book_rows = load_json(book_to_review_path)

    # review_id -> user_id (ids normalized to str)
    reviewer_of = {}
    for row in user_rows:
        review_key = str(row["review_id"])
        reviewer_of[review_key] = str(row["user_id"])

    # review_id -> book_id (ids normalized to str)
    book_of = {}
    for row in book_rows:
        review_key = str(row["review_id"])
        book_of[review_key] = str(row["book_id"])

    interactions = defaultdict(list)
    for review_key, reviewer in reviewer_of.items():
        if review_key not in book_of:
            continue
        interactions[reviewer].append(book_of[review_key])

    return interactions


def split_user_interactions(
    user_book: Dict[str, List[str]],
    seed: int = 42,
):
    """
    Splits every user's distinct books into train / validation / test.

    Per user, duplicates are removed, the books are shuffled, and then the
    first ``int(0.7 * n)`` go to train, the next ``int(0.15 * n)`` to
    validation and the remainder to test. Users with fewer than 3 distinct
    books keep everything in train and get empty validation/test lists.

    The global ``random`` generator is seeded with ``seed``. Books are sorted
    before shuffling so the outcome depends only on ``seed`` and the input
    content: iterating a set of strings directly would follow an order that
    changes with Python's per-process hash randomization.

    Returns:
        train_user_book
        val_user_book
        test_user_book
    """
    random.seed(seed)

    train = {}
    val = {}
    test = {}

    for user, books in user_book.items():
        # remove duplicates, in a canonical order so the shuffle is reproducible
        books = sorted(set(books))
        random.shuffle(books)

        n = len(books)
        if n < 3:
            # too few books to hold anything out
            train[user] = books
            val[user] = []
            test[user] = []
            continue

        n_train = int(0.7 * n)
        n_val = int(0.15 * n)

        train[user] = books[:n_train]
        val[user] = books[n_train : n_train + n_val]
        test[user] = books[n_train + n_val :]

    return train, val, test


def build_ground_truth(test_user_book: Dict[str, List[str]]) -> Dict[str, Set[str]]:
    """
    Converts held-out book lists into sets, keeping only users that have at
    least one held-out book.
    """
    ground_truth = {}
    for user, books in test_user_book.items():
        if len(books) == 0:
            continue
        ground_truth[user] = set(books)
    return ground_truth


def generate_predictions(model, users: List[str], k: int):
    """
    Asks ``model.recommend(user, k)`` for every user in ``users`` (with a
    progress bar) and returns the answers as a dict keyed by user.
    """
    return {
        user: model.recommend(user, k)
        for user in tqdm(users, desc="Generating Predictions")
    }

In [6]:
def main(data_folder: str = r"E:/RokomariBG_Dataset", k: int = 10):
    """
    Runs the category-aware popularity baseline end to end and prints
    Hit@K / MRR@K / NDCG@K on the held-out test interactions.

    Parameters
    ----------
    data_folder : str
        Folder containing ``user_to_review.json.gz``,
        ``book_to_review.json.gz`` and ``book_to_category.json.gz``.
        Defaults to the location used when the notebook was authored.
    k : int
        Cutoff rank used both for recommendation length and evaluation.
    """
    # -------------------------------
    # Paths
    # -------------------------------
    user_to_review = data_folder + r"/user_to_review.json.gz"
    book_to_review = data_folder + r"/book_to_review.json.gz"
    book_to_category = data_folder + r"/book_to_category.json.gz"

    # -------------------------------
    # Build user-book interactions
    # -------------------------------
    user_book = build_user_book_interactions(
        user_to_review,
        book_to_review,
    )

    print(f"Total users with interactions: {len(user_book)}")

    evaluator = RankingEvaluator(k=k)

    # -------------------------------
    # Split 70/15/15
    # -------------------------------
    # The train and validation parts are produced by the split but are not
    # consumed anywhere below.
    train_user_book, val_user_book, test_user_book = split_user_interactions(
        user_book
    )

    ground_truth = build_ground_truth(test_user_book)
    test_users = list(ground_truth.keys())

    print(f"Users in test set: {len(test_users)}")

    # =====================================================
    # CATEGORY-AWARE POPULARITY BASELINE
    # =====================================================

    print("\nTraining Category-Aware Popularity Baseline...")

    # NOTE(review): the model is fitted from the raw review files, i.e. from
    # ALL interactions, including the ones held out in `test_user_book`.
    # The metrics printed below are therefore optimistic; fitting should be
    # restricted to `train_user_book` for a leak-free estimate.
    cat_model = CategoryAwarePopularityRecommender(
        user_to_review_path=user_to_review,
        book_to_review_path=book_to_review,
        book_to_category_path=book_to_category,
    )
    cat_model.fit()

    cat_predictions = generate_predictions(
        cat_model, test_users, k
    )

    cat_metrics = evaluator.evaluate(
        cat_predictions, ground_truth
    )

    print("\n===== Category-Aware Popularity Results =====")
    for m, v in cat_metrics.items():
        print(f"{m}: {v:.4f}")


if __name__ == "__main__":
    main()

Total users with interactions: 63721
Users in test set: 15427

Training Category-Aware Popularity Baseline...


Generating Predictions: 100%|████████████████████████████████████████████████| 15427/15427 [00:00<00:00, 108506.61it/s]
Evaluating: 100%|████████████████████████████████████████████████████████████| 15427/15427 [00:00<00:00, 249258.56it/s]


===== Category-Aware Popularity Results =====
Hit@10: 0.3037
MRR@10: 0.1440
NDCG@10: 0.1382



