In [None]:
# Colab-only setup: mount Google Drive at /content/drive. main() reads its
# dataset files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
from typing import Dict, List, Set

class RankingEvaluator:
    """
    Scores ranked recommendation lists against sets of relevant items.

    Relevance is binary: an item is relevant when it is a member of the
    ground-truth set. Provides Hit@K, MRR@K and NDCG@K.
    """

    def __init__(self):
        # Stateless: every metric depends only on its arguments.
        pass

    def hit_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        """
        Hit@K: 1.0 when any of the first k predictions is relevant, else 0.0.
        """
        for candidate in predictions[:k]:
            if candidate in ground_truth:
                return 1.0
        return 0.0

    def mrr_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        """
        MRR@K: 1 / rank of the earliest relevant item among the first k
        predictions (ranks start at 1); 0.0 when none is relevant.
        """
        for position, candidate in enumerate(predictions[:k]):
            if candidate in ground_truth:
                return 1.0 / (position + 1)
        return 0.0

    def dcg_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        """
        DCG@K: sum of 1 / log2(rank + 1) over the relevant items found in
        the first k predictions.
        """
        total = 0.0
        for position, candidate in enumerate(predictions[:k]):
            if candidate not in ground_truth:
                continue
            # 0-based position p is rank p + 1, so the discount is log2(p + 2).
            total += 1.0 / np.log2(position + 2)
        return total

    def idcg_at_k(self, ground_truth: Set[str], k: int) -> float:
        """
        IDCG@K: the DCG of a perfect ranking, i.e. relevant items filling
        the first min(len(ground_truth), k) ranks.
        """
        n_ideal = min(len(ground_truth), k)
        discounts = [1.0 / np.log2(rank + 1) for rank in range(1, n_ideal + 1)]
        return sum(discounts)

    def ndcg_at_k(self, predictions: List[str], ground_truth: Set[str], k: int) -> float:
        """
        NDCG@K: DCG@K divided by IDCG@K; 0.0 when the ideal DCG is zero.
        """
        gain = self.dcg_at_k(predictions, ground_truth, k)
        best = self.idcg_at_k(ground_truth, k)
        return gain / best if best != 0.0 else 0.0

    def evaluate(self, predictions: Dict[str, List[str]], ground_truth: Dict[str, Set[str]]) -> Dict[str, float]:
        """
        Average every metric over the users that can be scored.

        A user is scored only when present in both mappings and when the
        user's ground-truth set is non-empty.

        Args:
            predictions: Dict mapping user_id -> list of recommended item_ids (ranked)
            ground_truth: Dict mapping user_id -> set of relevant item_ids

        Returns:
            Dict of metric_name -> average_score (0.0 for every metric when
            no user could be scored)
        """
        # (label, metric function, cutoff) in the order results are reported.
        cutoffs = (
            ('Hit@5', self.hit_at_k, 5),
            ('Hit@10', self.hit_at_k, 10),
            ('Hit@50', self.hit_at_k, 50),
            ('MRR@10', self.mrr_at_k, 10),
            ('NDCG@10', self.ndcg_at_k, 10),
            ('NDCG@50', self.ndcg_at_k, 50),
        )
        per_user_scores = {label: [] for label, _, _ in cutoffs}

        shared_users = set(predictions.keys()) & set(ground_truth.keys())
        for user_id in shared_users:
            relevant = ground_truth[user_id]
            if not relevant:
                # Nothing to hit for this user; leave them out of the averages.
                continue
            ranked = predictions[user_id]
            for label, metric_fn, k in cutoffs:
                per_user_scores[label].append(metric_fn(ranked, relevant, k))

        return {
            label: (np.mean(scores) if scores else 0.0)
            for label, scores in per_user_scores.items()
        }

In [None]:
import json
import random
from collections import defaultdict
from typing import Dict, List, Set
from tqdm.auto import tqdm
import torch

import numpy as np
import torch.nn as nn
import torch.optim as optim

# Removed: from evaluation import RankingEvaluator
# The RankingEvaluator class is already defined in a previous cell.

class BaseRecommender:
    """Interface for recommenders: subclasses override ``fit`` and ``recommend``."""

    def fit(self):
        """Train the model. The base version always raises NotImplementedError."""
        raise NotImplementedError

    def recommend(self, user_id: str, k: int = 10) -> List[str]:
        """
        Recommend items for ``user_id``; ``k`` is the requested list length
        (default 10). The base version always raises NotImplementedError.
        """
        raise NotImplementedError

# ============================================================
# 7. LightGCN — SPARSE, MEMORY-SAFE
# ============================================================

class LightGCNRecommender(BaseRecommender):
    """
    LightGCN baseline for implicit feedback:
    - Builds sparse normalized adjacency A_hat (symmetric normalized)
    - Propagates embeddings for n_layers and averages all layer outputs
    - Trains with BPR loss on uniformly sampled negatives
    - Uses train_user_book only (implicit)

    ``seed`` only drives the NumPy generator used for user shuffling and
    positive/negative sampling; the embedding initialisation in ``fit`` draws
    from torch's global RNG.
    """

    def __init__(
        self,
        train_user_book: Dict[str, List[str]],
        n_factors: int = 64,
        n_layers: int = 3,
        n_epochs: int = 10,
        lr: float = 0.01,
        reg: float = 1e-4,
        n_neg: int = 1,
        seed: int = 42,
        device: str = None,
        batch_size: int = 4096,
        neg_per_user: int = 1,
    ):
        """
        Args:
            train_user_book: user_id -> book_ids the user interacted with
                (duplicates are dropped).
            n_factors: embedding dimension.
            n_layers: number of propagation steps through A_hat.
            n_epochs: number of passes over the user list.
            lr: Adam learning rate.
            reg: weight of the L2 penalty on the raw embeddings of a batch.
            n_neg: stored but not read by any method of this class;
                ``neg_per_user`` is what controls negative sampling.
            seed: seed of the NumPy generator used for sampling.
            device: torch device string; defaults to "cuda" when available,
                otherwise "cpu".
            batch_size: number of users per optimisation step.
            neg_per_user: negatives sampled per sampled positive.
        """
        self.train_user_book = {u: list(set(b)) for u, b in train_user_book.items()}
        self.n_factors = n_factors
        self.n_layers = n_layers
        self.n_epochs = n_epochs
        self.lr = lr
        self.reg = reg
        self.n_neg = n_neg
        self.rng = np.random.default_rng(seed)
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Filled in fit()
        self.user_ids = []
        self.book_ids = []
        self.user_id_map = {}
        self.book_id_map = {}
        self.n_users = 0
        self.n_items = 0
        self.A_hat = None
        self.user_emb = None
        self.item_emb = None
        self.final_user_emb = None
        self.final_item_emb = None
        self.batch_size = batch_size
        self.neg_per_user = neg_per_user

    # -------------------------------------------------
    # Build sparse normalized adjacency A_hat
    # -------------------------------------------------
    def _build_sparse_graph(self):
        """
        Index users/books and build A_hat = D^{-1/2} A D^{-1/2} as a sparse
        COO tensor, where A is the bipartite user-item adjacency over
        U + I nodes (users first, then items).
        """
        print("Building sparse LightGCN graph...")

        # Sorted ids give a stable id -> index mapping.
        self.user_ids = sorted(self.train_user_book.keys())
        self.book_ids = sorted({b for books in self.train_user_book.values() for b in books})

        self.user_id_map = {u: i for i, u in enumerate(self.user_ids)}
        self.book_id_map = {b: i for i, b in enumerate(self.book_ids)}

        U = len(self.user_ids)
        I = len(self.book_ids)
        N = U + I

        # Build COO edges for bipartite graph (u<->i); item nodes are offset by U.
        rows, cols = [], []
        for u, books in self.train_user_book.items():
            uidx = self.user_id_map[u]
            for b in books:
                iidx = self.book_id_map[b]
                rows.append(uidx)
                cols.append(U + iidx)
                rows.append(U + iidx)
                cols.append(uidx)

        idx = torch.tensor([rows, cols], dtype=torch.long, device=self.device)
        val = torch.ones(idx.shape[1], dtype=torch.float32, device=self.device)

        A = torch.sparse_coo_tensor(idx, val, size=(N, N), device=self.device).coalesce()

        # Degree vector: deg[v] = sum_j A[v,j]
        deg = torch.sparse.sum(A, dim=1).to_dense()
        deg_inv_sqrt = torch.pow(deg, -0.5)
        # Isolated nodes have degree 0 -> inf; zero them so they contribute nothing.
        deg_inv_sqrt[torch.isinf(deg_inv_sqrt)] = 0.0

        # Build normalized values: A_hat[i,j] = A[i,j] * d_i^{-1/2} * d_j^{-1/2}
        r, c = A.indices()
        v = A.values()
        v_norm = v * deg_inv_sqrt[r] * deg_inv_sqrt[c]

        self.A_hat = torch.sparse_coo_tensor(
            torch.stack([r, c], dim=0),
            v_norm,
            size=(N, N),
            device=self.device,
        ).coalesce()

        self.n_users = U
        self.n_items = I

    # -------------------------------------------------
    # One forward propagation to get final embeddings
    # -------------------------------------------------
    def _propagate(self):
        """
        Run ``n_layers`` sparse multiplications with A_hat and return
        ``(user_final, item_final)``: the mean over the initial embeddings
        and every layer output, split back into user and item rows.
        """
        # Initial embeddings for all nodes
        all_emb0 = torch.cat([self.user_emb.weight, self.item_emb.weight], dim=0)  # (U+I, d)

        embs = [all_emb0]
        x = all_emb0
        for _ in range(self.n_layers):
            x = torch.sparse.mm(self.A_hat, x)
            embs.append(x)

        # LightGCN uses average of layer embeddings
        all_emb = torch.mean(torch.stack(embs, dim=0), dim=0)  # (U+I, d)

        user_final = all_emb[: self.n_users]
        item_final = all_emb[self.n_users :]
        return user_final, item_final

    # -------------------------------------------------
    # Fit (BPR training)
    # -------------------------------------------------
    def fit(self):
        """
        Build the graph, train the embeddings with BPR and cache the final
        propagated embeddings for ``recommend``.

        Each step samples one positive and ``neg_per_user`` negatives for
        every user of the batch. Users without any positive, or whose
        positives cover the whole catalogue, are left out of training
        because no (positive, negative) pair can be formed for them.
        """
        print(f"Training LightGCN on {self.device}...")
        self._build_sparse_graph()

        self.user_emb = nn.Embedding(self.n_users, self.n_factors).to(self.device)
        self.item_emb = nn.Embedding(self.n_items, self.n_factors).to(self.device)
        nn.init.xavier_uniform_(self.user_emb.weight)
        nn.init.xavier_uniform_(self.item_emb.weight)

        optimizer = optim.Adam(
            list(self.user_emb.parameters()) + list(self.item_emb.parameters()),
            lr=self.lr,
        )

        # Positive sets for O(1) rejection tests during negative sampling.
        user_pos_items = {
            self.user_id_map[u]: set(self.book_id_map[b] for b in books if b in self.book_id_map)
            for u, books in self.train_user_book.items()
            if u in self.user_id_map
        }
        # Materialise each positive set once instead of once per draw.
        user_pos_arrays = {
            u: np.fromiter(pos, dtype=np.int64, count=len(pos))
            for u, pos in user_pos_items.items()
        }

        # Rejection sampling below never terminates for a user who has seen
        # every item, and a positive cannot be drawn from an empty set.
        users_list = np.array(
            [u for u, pos in user_pos_items.items() if 0 < len(pos) < self.n_items],
            dtype=np.int64,
        )
        n_skipped = len(user_pos_items) - len(users_list)
        if n_skipped:
            print(f"Skipping {n_skipped} users without a valid positive/negative pair.")
        n_users_train = len(users_list)

        for epoch in range(1, self.n_epochs + 1):
            self.user_emb.train()
            self.item_emb.train()

            # Shuffle users each epoch
            self.rng.shuffle(users_list)

            total_loss = 0.0
            n_steps = 0

            pbar = tqdm(range(0, n_users_train, self.batch_size), desc=f"LightGCN Epoch {epoch}")
            for start in pbar:
                batch_users = users_list[start : start + self.batch_size]

                # ---- sample positives: one per user ----
                pos_items = np.array(
                    [int(self.rng.choice(user_pos_arrays[int(u)])) for u in batch_users],
                    dtype=np.int64,
                )

                # ---- sample negatives (uniform, with rejection) ----
                neg_items = np.empty((len(batch_users), self.neg_per_user), dtype=np.int64)
                for i, u in enumerate(batch_users):
                    pos_set = user_pos_items[int(u)]
                    for j in range(self.neg_per_user):
                        neg = int(self.rng.integers(0, self.n_items))
                        while neg in pos_set:
                            neg = int(self.rng.integers(0, self.n_items))
                        neg_items[i, j] = neg

                # move to the training device
                batch_users_t = torch.tensor(batch_users, device=self.device, dtype=torch.long)
                pos_items_t = torch.tensor(pos_items, device=self.device, dtype=torch.long)
                neg_items_t = torch.tensor(neg_items, device=self.device, dtype=torch.long)  # (B, neg)

                # ---- propagate ONCE per batch ----
                user_final, item_final = self._propagate()

                uvec = user_final[batch_users_t]                 # (B, d)
                pvec = item_final[pos_items_t]                   # (B, d)
                nvec = item_final[neg_items_t]                   # (B, neg, d)

                # scores
                pos_scores = torch.sum(uvec * pvec, dim=1, keepdim=True)     # (B, 1)
                neg_scores = torch.sum(uvec.unsqueeze(1) * nvec, dim=2)      # (B, neg)

                # BPR loss: maximize log sigmoid(pos - neg). logsigmoid stays
                # finite and keeps a gradient where sigmoid underflows to 0.
                bpr = -torch.mean(nn.functional.logsigmoid(pos_scores - neg_scores))

                # reg loss on *raw* embeddings (standard)
                reg_val = self.reg * (
                    self.user_emb(batch_users_t).norm(2).pow(2)
                    + self.item_emb(pos_items_t).norm(2).pow(2)
                    + self.item_emb(neg_items_t.view(-1)).norm(2).pow(2)
                ) / len(batch_users)

                loss = bpr + reg_val

                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                optimizer.step()

                total_loss += float(loss.item())
                n_steps += 1
                pbar.set_postfix(loss=f"{total_loss / n_steps:.5f}")

            print(f"Epoch {epoch}: avg_loss={total_loss / max(n_steps,1):.6f}")

        # Cache final embeddings for fast recommend
        self.user_emb.eval()
        self.item_emb.eval()
        with torch.no_grad():
            self.final_user_emb, self.final_item_emb = self._propagate()

    # -------------------------------------------------
    # Recommend
    # -------------------------------------------------
    def recommend(self, user_id: str, k: int = 10) -> List[str]:
        """
        Return up to ``k`` book ids for ``user_id``, best score first.

        Books the user interacted with in training are never returned, so
        fewer than ``k`` ids come back when fewer unseen books exist. Users
        unknown to the model and non-positive ``k`` yield an empty list.
        """
        if user_id not in self.user_id_map or k <= 0:
            return []

        uidx = self.user_id_map[user_id]

        with torch.no_grad():
            uvec = self.final_user_emb[uidx]  # (d,)
            scores = torch.matmul(self.final_item_emb, uvec)  # (I,)

        scores = scores.detach().cpu().numpy()

        # filter items seen in TRAIN
        seen = set(self.book_id_map[b] for b in self.train_user_book[user_id] if b in self.book_id_map)
        if seen:
            scores[list(seen)] = -np.inf

        # Clamp k to the number of unseen items: argpartition rejects a kth
        # outside the array, and masked items must not leak into the result.
        k = min(k, scores.shape[0] - len(seen))
        if k <= 0:
            return []

        topk = np.argpartition(scores, -k)[-k:]
        topk = topk[np.argsort(scores[topk])[::-1]]
        return [self.book_ids[i] for i in topk]


def load_json(path: str):
    """Read the UTF-8 text file at ``path`` and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return parsed


# -------------------------------------------------
# 1. Build User → Book Interaction Table
# -------------------------------------------------

def build_user_book_interactions(
    user_to_review_path: str,
    book_to_review_path: str,
) -> Dict[str, List[str]]:
    """
    Join the two review files on review_id.

    Both files are read as lists of records; the first carries
    "review_id"/"user_id", the second "review_id"/"book_id". All ids are
    converted to strings. Reviews without a matching book record are dropped.

    Returns:
        user_id -> [book_id, book_id, ...]
    """
    user_records = load_json(user_to_review_path)
    book_records = load_json(book_to_review_path)

    # review_id -> user_id (ids normalized to strings)
    reviewer_of = {}
    for record in user_records:
        review_key = str(record["review_id"])
        reviewer_of[review_key] = str(record["user_id"])

    # review_id -> book_id (ids normalized to strings)
    book_of = {}
    for record in book_records:
        review_key = str(record["review_id"])
        book_of[review_key] = str(record["book_id"])

    interactions = defaultdict(list)
    for review_key, reviewer in reviewer_of.items():
        reviewed_book = book_of.get(review_key)
        if reviewed_book is None:
            # No book record for this review.
            continue
        interactions[reviewer].append(reviewed_book)

    return interactions


# -------------------------------------------------
# 2. Per-User 70/15/15 Split
# -------------------------------------------------

def split_user_interactions(
    user_book: Dict[str, List[str]],
    seed: int = 42,
):
    """
    Split every user's distinct books into train/val/test (70/15/15).

    The split depends only on ``seed`` and on the set of books of each user
    (in the iteration order of ``user_book``): books are de-duplicated and
    sorted before shuffling, and a private random generator is used, so the
    global ``random`` state is left untouched.

    Users with fewer than 3 distinct books keep everything in train and get
    empty val/test lists. Split sizes are floor(0.7 * n) and floor(0.15 * n),
    with the remainder going to test.

    Returns:
        train_user_book
        val_user_book
        test_user_book
    """
    rng = random.Random(seed)

    train = {}
    val = {}
    test = {}

    for user, books in user_book.items():
        # Set iteration order for strings varies between interpreter runs;
        # sorting first makes the seeded shuffle reproducible.
        books = sorted(set(books))
        rng.shuffle(books)

        n = len(books)
        if n < 3:
            train[user] = books
            val[user] = []
            test[user] = []
            continue

        # Integer arithmetic avoids float truncation (int(0.7 * 90) == 62).
        n_train = (7 * n) // 10
        n_val = (15 * n) // 100

        train[user] = books[:n_train]
        val[user] = books[n_train : n_train + n_val]
        test[user] = books[n_train + n_val :]

    return train, val, test


# -------------------------------------------------
# 3. Convert Test Set to Ground Truth Format
# -------------------------------------------------

def build_ground_truth(test_user_book: Dict[str, List[str]]) -> Dict[str, Set[str]]:
    """
    Turn held-out book lists into relevance sets, leaving out every user
    whose list is empty.
    """
    truth: Dict[str, Set[str]] = {}
    for user, held_out in test_user_book.items():
        if len(held_out) > 0:
            truth[user] = set(held_out)
    return truth


# -------------------------------------------------
# 4. Generate Predictions for All Users
# -------------------------------------------------

def generate_predictions(model, users: List[str], k: int):
    """
    Call ``model.recommend(user, k)`` for each user, showing a progress bar,
    and return the results as user -> recommendations.
    """
    progress = tqdm(users, desc="Generating Predictions")
    return {user: model.recommend(user, k) for user in progress}


# -------------------------------------------------
# 5. Main Pipeline
# ------------------------------------------------- #

def main():
    """
    End-to-end LightGCN run: load the review files, split each user's
    interactions 70/15/15, train on the train split, recommend for every
    user with test items and print the ranking metrics.
    """
    # ------------------------------- #
    # Paths - Updated for Google Colab #
    # ------------------------------- #
    USER_TO_REVIEW = "/content/drive/MyDrive/RokomariBG_Dataset/user_to_review.json"
    BOOK_TO_REVIEW = "/content/drive/MyDrive/RokomariBG_Dataset/book_to_review.json"

    # Length of each recommendation list. It must cover the largest cutoff
    # the evaluator reports (Hit@50 / NDCG@50); with shorter lists those
    # metrics are computed on truncated rankings and Hit@50 collapses to
    # Hit@10.
    K = 50

    # ------------------------------- #
    # Build user-book interactions     #
    # ------------------------------- #
    print("Loading data...")
    user_book = build_user_book_interactions(
        USER_TO_REVIEW,
        BOOK_TO_REVIEW,
    )

    print(f"Total users with interactions: {len(user_book)}")

    # Use the RankingEvaluator class defined in previous cell
    evaluator = RankingEvaluator()

    # ------------------------------- #
    # Split 70/15/15                   #
    # ------------------------------- #
    print("Splitting data into train/val/test...")
    # The validation split is produced but not used by this pipeline.
    train_user_book, _val_user_book, test_user_book = split_user_interactions(
        user_book
    )

    ground_truth = build_ground_truth(test_user_book)
    test_users = list(ground_truth.keys())

    print(f"Users in test set: {len(test_users)}")

    # =====================================================
    # LIGHTGCN - Updated parameters
    # =====================================================

    print("\nTraining LightGCN...")

    lightgcn = LightGCNRecommender(
        train_user_book=train_user_book,
        n_factors=64,      # Updated to 64
        n_layers=2,        # Updated to 2
        n_epochs=10,
        lr=0.01,
        reg=1e-4,
        batch_size=4096,
        neg_per_user=1,
        seed=42
    )

    lightgcn.fit()

    print("\nGenerating recommendations for test users...")
    lightgcn_preds = generate_predictions(lightgcn, test_users, K)

    print("Evaluating results...")
    lightgcn_metrics = evaluator.evaluate(lightgcn_preds, ground_truth)

    print("\n===== LightGCN Results =====")
    for m, v in lightgcn_metrics.items():
        print(f"{m}: {v:.4f}")


# -------------------------------------------------
# Entry Point
# -------------------------------------------------

# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()