In [1]:
import json
import gzip
from collections import defaultdict
from typing import Dict, List, Set
import random
import numpy as np
import math
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------------------------------------
# Utility Loader
# ---------------------------------------

def load_json(path: str):
    with gzip.open(path, "rt", encoding="utf-8") as f:
        return json.load(f)

In [3]:
class ItemBasedCFRecommender():
    """
    Item-based kNN Collaborative Filtering using SPARSE cosine similarity.
    Trained only on train_user_book interactions.
    """

    def __init__(self, train_user_book: Dict[str, List[str]], k_neighbors: int = 50):
        self.train_user_book = {u: list(set(books)) for u, books in train_user_book.items()}
        self.k_neighbors = k_neighbors

        self.user_ids = []
        self.book_ids = []
        self.user_id_map = {}
        self.book_id_map = {}
        self.interaction_matrix = None
        self.item_sim_matrix = None

    def fit(self):
        print("Building Item-Based CF (Sparse, train-only)...")

        user_book = self.train_user_book

        self.user_ids = sorted(user_book.keys())
        self.book_ids = sorted({b for books in user_book.values() for b in books})

        self.user_id_map = {u: i for i, u in enumerate(self.user_ids)}
        self.book_id_map = {b: i for i, b in enumerate(self.book_ids)}

        # Build sparse interaction matrix
        rows, cols = [], []
        print("Building sparse interaction matrix...")
        for u, books in tqdm(user_book.items()):
            ui = self.user_id_map[u]
            for b in books:
                bi = self.book_id_map[b]
                rows.append(ui)
                cols.append(bi)

        data = np.ones(len(rows), dtype=np.float32)
        self.interaction_matrix = csr_matrix(
            (data, (rows, cols)),
            shape=(len(self.user_ids), len(self.book_ids)),
        )

        # Item-item cosine similarity
        print("Computing item-item similarity (this may take 5–15 minutes)...")
        self.item_sim_matrix = cosine_similarity(
            self.interaction_matrix.T, dense_output=False
        )

    def recommend(self, user_id: str, k: int = 10):
        if user_id not in self.user_id_map:
            return []

        uidx = self.user_id_map[user_id]
        user_vector = self.interaction_matrix.getrow(uidx)

        scores = np.zeros(len(self.book_ids), dtype=np.float32)
        interacted_items = user_vector.indices

        for item_idx in interacted_items:
            sim_vec = self.item_sim_matrix.getrow(item_idx).toarray().ravel()
            neighbors = np.argsort(sim_vec)[::-1][1 : self.k_neighbors + 1]
            scores[neighbors] += sim_vec[neighbors]

        # Filter out items already seen in TRAIN
        scores[interacted_items] = -1

        top_items = np.argsort(scores)[::-1][:k]
        return [self.book_ids[i] for i in top_items]

In [4]:
class RankingEvaluator:
    """
    Evaluation class for Top-N Recommendation.

    Supports:
        - Hit@K
        - MRR@K
        - NDCG@K

    Input format:
        predictions: Dict[user_id, List[item_id]]
        ground_truth: Dict[user_id, Set[item_id]]
    """

    def __init__(self, k: int = 10):
        """
        Parameters
        ----------
        k : int
            Cutoff rank for evaluation (e.g., 5, 10, 20)
        """
        self.k = k

    # -------------------------
    # Hit@K
    # -------------------------
    def hit_at_k(self, predictions: Dict, ground_truth: Dict) -> float:
        """
        Computes Hit@K

        Returns
        -------
        float : Average Hit@K over users
        """
        hits = []

        for user in ground_truth:
            if user not in predictions:
                continue

            top_k = predictions[user][:self.k]
            gt_items = ground_truth[user]

            hit = 1.0 if any(item in gt_items for item in top_k) else 0.0
            hits.append(hit)

        return float(np.mean(hits)) if hits else 0.0

    # -------------------------
    # MRR@K
    # -------------------------
    def mrr_at_k(self, predictions: Dict, ground_truth: Dict) -> float:
        """
        Computes Mean Reciprocal Rank (MRR@K)

        Returns
        -------
        float : Average MRR over users
        """
        rr_scores = []

        for user in ground_truth:
            if user not in predictions:
                continue

            top_k = predictions[user][:self.k]
            gt_items = ground_truth[user]

            rr = 0.0
            for rank, item in enumerate(top_k, start=1):
                if item in gt_items:
                    rr = 1.0 / rank
                    break

            rr_scores.append(rr)

        return float(np.mean(rr_scores)) if rr_scores else 0.0

    # -------------------------
    # NDCG@K
    # -------------------------
    def ndcg_at_k(self, predictions: Dict, ground_truth: Dict) -> float:
        """
        Computes Normalized Discounted Cumulative Gain (NDCG@K)

        Returns
        -------
        float : Average NDCG@K over users
        """
        ndcg_scores = []

        for user in tqdm(ground_truth, desc="Evaluating"):
            if user not in predictions:
                continue

            top_k = predictions[user][:self.k]
            gt_items = ground_truth[user]

            dcg = 0.0
            for i, item in enumerate(top_k):
                if item in gt_items:
                    dcg += 1.0 / math.log2(i + 2)

            ideal_hits = min(len(gt_items), self.k)
            idcg = sum(1.0 / math.log2(i + 2) for i in range(ideal_hits))

            ndcg = dcg / idcg if idcg > 0 else 0.0
            ndcg_scores.append(ndcg)

        return float(np.mean(ndcg_scores)) if ndcg_scores else 0.0

    # -------------------------
    # Combined Evaluation
    # -------------------------
    def evaluate(self, predictions: Dict, ground_truth: Dict) -> Dict[str, float]:
        """
        Computes all metrics together.

        Returns
        -------
        Dict[str, float]
        """
        return {
            f"Hit@{self.k}": self.hit_at_k(predictions, ground_truth),
            f"MRR@{self.k}": self.mrr_at_k(predictions, ground_truth),
            f"NDCG@{self.k}": self.ndcg_at_k(predictions, ground_truth),
        }

In [5]:
def build_user_book_interactions(
    user_to_review_path: str,
    book_to_review_path: str,
) -> Dict[str, List[str]]:
    """
    Builds:
        user_id -> [book_id, book_id, ...]
    """
    user_to_review = load_json(user_to_review_path)
    book_to_review = load_json(book_to_review_path)

    # Normalize IDs to string
    review_to_user = {
        str(x["review_id"]): str(x["user_id"])
        for x in user_to_review
    }

    review_to_book = {
        str(x["review_id"]): str(x["book_id"])
        for x in book_to_review
    }

    user_book = defaultdict(list)

    for rid, user_id in review_to_user.items():
        if rid in review_to_book:
            book_id = review_to_book[rid]
            user_book[user_id].append(book_id)

    return user_book


def split_user_interactions(
    user_book: Dict[str, List[str]],
    seed: int = 42,
):
    """
    Returns:
        train_user_book
        val_user_book
        test_user_book
    """
    random.seed(seed)

    train = {}
    val = {}
    test = {}

    for user, books in user_book.items():
        books = list(set(books))  # remove duplicates
        random.shuffle(books)

        n = len(books)
        if n < 3:
            train[user] = books
            val[user] = []
            test[user] = []
            continue

        n_train = int(0.7 * n)
        n_val = int(0.15 * n)

        train[user] = books[:n_train]
        val[user] = books[n_train : n_train + n_val]
        test[user] = books[n_train + n_val :]

    return train, val, test


def build_ground_truth(test_user_book: Dict[str, List[str]]) -> Dict[str, Set[str]]:
    return {
        user: set(books)
        for user, books in test_user_book.items()
        if len(books) > 0
    }


def generate_predictions(model, users: List[str], k: int):
    predictions = {}

    for u in tqdm(users, desc="Generating Predictions"):
        preds = model.recommend(u, k)
        predictions[u] = preds

    return predictions

In [6]:
def main():
    # -------------------------------
    # Paths
    # -------------------------------
    DATA_FOLDER = r"E:/RokomariBG_Dataset"
    USER_TO_REVIEW = DATA_FOLDER + r"/user_to_review.json.gz"
    BOOK_TO_REVIEW = DATA_FOLDER + r"/book_to_review.json.gz"

    K = 10

    # -------------------------------
    # Build user-book interactions
    # -------------------------------
    user_book = build_user_book_interactions(
        USER_TO_REVIEW,
        BOOK_TO_REVIEW,
    )

    print(f"Total users with interactions: {len(user_book)}")

    evaluator = RankingEvaluator(k=K)

    # -------------------------------
    # Split 70/15/15
    # -------------------------------
    train_user_book, val_user_book, test_user_book = split_user_interactions(
        user_book
    )

    ground_truth = build_ground_truth(test_user_book)
    test_users = list(ground_truth.keys())

    print(f"Users in test set: {len(test_users)}")

    # =====================================================
    # ITEM-BASED CF
    # =====================================================

    print("\nTraining Item-Based CF...")

    item_cf = ItemBasedCFRecommender(
        train_user_book=train_user_book,
        k_neighbors=50,
    )
    item_cf.fit()

    item_cf_preds = generate_predictions(item_cf, test_users, K)
    item_cf_metrics = evaluator.evaluate(item_cf_preds, ground_truth)

    print("\n===== Item-Based CF Results =====")
    for m, v in item_cf_metrics.items():
        print(f"{m}: {v:.4f}")


if __name__ == "__main__":
    main()

Total users with interactions: 63721
Users in test set: 15427

Training Item-Based CF...
Building Item-Based CF (Sparse, train-only)...
Building sparse interaction matrix...


100%|████████████████████████████████████████████████████████████████████████| 63721/63721 [00:00<00:00, 778150.84it/s]


Computing item-item similarity (this may take 5–15 minutes)...


Generating Predictions: 100%|███████████████████████████████████████████████████| 15427/15427 [00:49<00:00, 314.14it/s]
Evaluating: 100%|████████████████████████████████████████████████████████████| 15427/15427 [00:00<00:00, 278550.15it/s]


===== Item-Based CF Results =====
Hit@10: 0.1710
MRR@10: 0.1223
NDCG@10: 0.1106



