# Explicit Matrix Factorization (SVD)
**Bangla Book Recommendation Dataset**

‚ñ∂Ô∏è **Just click `Runtime ‚Üí Run all` to get started!**

In [3]:
# ============================================================
# STEP 1: Install dependencies & download dataset from HuggingFace
# ============================================================
import os, shutil

print("üì¶ Installing required packages...")
# scikit-surprise requires NumPy < 2.0 ‚Äî must downgrade first
os.system("pip install -q 'numpy<2' scikit-surprise huggingface_hub tqdm pandas")

print("\n‚ö†Ô∏è  NumPy was downgraded. Restarting kernel to apply changes...")
import IPython
IPython.Application.instance().kernel.do_shutdown(restart=True)

üì¶ Installing required packages...

‚ö†Ô∏è  NumPy was downgraded. Restarting kernel to apply changes...


{'status': 'ok', 'restart': True}

In [1]:
# ============================================================
# STEP 2: Download dataset from HuggingFace
# (Run this cell after the kernel restarts from Step 1)
# ============================================================
import os, shutil
from huggingface_hub import hf_hub_download

REPO_ID = "DevnilMaster1/Bangla-Book-Recommendation-Dataset"
DATA_FOLDER = "RokomariBG_Dataset"
os.makedirs(DATA_FOLDER, exist_ok=True)

FILES_NEEDED = ["user_to_review.json", "book_to_review.json", "review.json"]

for filename in FILES_NEEDED:
    dest = os.path.join(DATA_FOLDER, filename)
    if os.path.exists(dest):
        print(f"‚úÖ Already downloaded: {filename}")
    else:
        print(f"‚¨áÔ∏è  Downloading {filename} ...")
        downloaded_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=filename,
            repo_type="dataset",
        )
        shutil.copy(downloaded_path, dest)
        print(f"‚úÖ Saved: {dest}")

print("\nüéâ All files ready!")

‚úÖ Already downloaded: user_to_review.json
‚úÖ Already downloaded: book_to_review.json
‚úÖ Already downloaded: review.json

üéâ All files ready!


In [2]:
import json
from collections import defaultdict
from typing import Dict, List, Set
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from surprise import Dataset, Reader, SVD

In [3]:
def load_json(path: str):
    """Load JSON file ‚Äî supports both plain .json and gzip .json.gz"""
    import json
    if path.endswith(".gz"):
        import gzip
        with gzip.open(path, "rt", encoding="utf-8") as f:
            return json.load(f)
    else:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

In [4]:
class ExplicitMFRecommender():
    """Explicit Matrix Factorization using SVD (via Surprise). Train-only ratings."""

    def __init__(self, train_user_book_rating: Dict[str, Dict[str, float]]):
        self.train_data = train_user_book_rating

    def fit(self):
        print("Training Explicit MF (SVD)...")
        records = []
        for user, book_dict in self.train_data.items():
            for book, rating in book_dict.items():
                records.append([user, book, float(rating)])

        reader = Reader(rating_scale=(1, 5))
        self.data = Dataset.load_from_df(
            pd.DataFrame(records, columns=["user", "item", "rating"]), reader)
        trainset = self.data.build_full_trainset()
        self.model = SVD(random_state=42)
        self.model.fit(trainset)
        self.all_items = set(b for u in self.train_data for b in self.train_data[u])

    def recommend(self, user_id: str, k: int = 10, n_candidates: int = 1000):
        if user_id not in self.train_data:
            return []
        seen = set(self.train_data[user_id].keys())
        unseen_items = list(self.all_items - seen)
        candidates = random.sample(unseen_items, n_candidates) if len(unseen_items) > n_candidates else unseen_items
        scores = [(book, self.model.predict(user_id, book).est) for book in candidates]
        scores.sort(key=lambda x: x[1], reverse=True)
        return [b for b, _ in scores[:k]]

In [5]:
class RankingEvaluator:
    def hit_at_k(self, predictions, ground_truth, k):
        return 1.0 if any(item in ground_truth for item in predictions[:k]) else 0.0

    def mrr_at_k(self, predictions, ground_truth, k):
        for rank, item in enumerate(predictions[:k], start=1):
            if item in ground_truth:
                return 1.0 / rank
        return 0.0

    def dcg_at_k(self, predictions, ground_truth, k):
        return sum(1.0 / np.log2(r+1) for r, item in enumerate(predictions[:k], 1) if item in ground_truth)

    def idcg_at_k(self, ground_truth, k):
        return sum(1.0 / np.log2(r+1) for r in range(1, min(len(ground_truth), k)+1))

    def ndcg_at_k(self, predictions, ground_truth, k):
        idcg = self.idcg_at_k(ground_truth, k)
        return self.dcg_at_k(predictions, ground_truth, k) / idcg if idcg > 0 else 0.0

    def evaluate(self, predictions, ground_truth):
        metrics = {k: [] for k in ['Hit@5','Hit@10','Hit@50','MRR@10','NDCG@10','NDCG@50']}
        for uid in set(predictions) & set(ground_truth):
            p, g = predictions[uid], ground_truth[uid]
            if not g: continue
            metrics['Hit@5'].append(self.hit_at_k(p, g, 5))
            metrics['Hit@10'].append(self.hit_at_k(p, g, 10))
            metrics['Hit@50'].append(self.hit_at_k(p, g, 50))
            metrics['MRR@10'].append(self.mrr_at_k(p, g, 10))
            metrics['NDCG@10'].append(self.ndcg_at_k(p, g, 10))
            metrics['NDCG@50'].append(self.ndcg_at_k(p, g, 50))
        return {k: np.mean(v) if v else 0.0 for k, v in metrics.items()}

In [6]:
def build_user_book_interactions(user_to_review_path, book_to_review_path):
    user_to_review = load_json(user_to_review_path)
    book_to_review = load_json(book_to_review_path)
    review_to_user = {str(x["review_id"]): str(x["user_id"]) for x in user_to_review}
    review_to_book = {str(x["review_id"]): str(x["book_id"]) for x in book_to_review}
    user_book = defaultdict(list)
    for rid, uid in review_to_user.items():
        if rid in review_to_book:
            user_book[uid].append(review_to_book[rid])
    return user_book

def split_user_interactions(user_book, seed=42):
    random.seed(seed)
    train, val, test = {}, {}, {}
    for user, books in user_book.items():
        books = list(set(books)); random.shuffle(books); n = len(books)
        if n < 3:
            train[user], val[user], test[user] = books, [], []
        else:
            nt, nv = int(0.7*n), int(0.15*n)
            train[user], val[user], test[user] = books[:nt], books[nt:nt+nv], books[nt+nv:]
    return train, val, test

def build_ground_truth(test_user_book):
    return {u: set(b) for u, b in test_user_book.items() if b}

def generate_predictions(model, users, k):
    predictions = {}
    for u in tqdm(users, desc="Generating Predictions"):
        predictions[u] = model.recommend(u, k)
    return predictions

def build_train_user_book_rating(train_user_book, user_to_review_path, book_to_review_path, review_path):
    user_to_review = load_json(user_to_review_path)
    book_to_review = load_json(book_to_review_path)
    reviews = load_json(review_path)
    review_to_user = {str(x["review_id"]): str(x["user_id"]) for x in user_to_review}
    review_to_book = {str(x["review_id"]): str(x["book_id"]) for x in book_to_review}
    # Support both "user_rating" and "userRating" key names
    def get_rating(x):
        return float(x.get("user_rating", x.get("userRating", 0)))
    review_to_rating = {str(x["review_id"]): get_rating(x) for x in reviews}
    train_ratings = defaultdict(dict)
    for rid, user in review_to_user.items():
        if rid in review_to_book and rid in review_to_rating:
            book = review_to_book[rid]
            rating = review_to_rating[rid]
            if rating > 0 and user in train_user_book and book in train_user_book[user]:
                train_ratings[user][book] = rating
    return train_ratings

In [7]:
def main():
    DATA_FOLDER = "RokomariBG_Dataset/"
    USER_TO_REVIEW = DATA_FOLDER + "user_to_review.json"
    BOOK_TO_REVIEW = DATA_FOLDER + "book_to_review.json"
    REVIEW         = DATA_FOLDER + "review.json"
    K = 10

    user_book = build_user_book_interactions(USER_TO_REVIEW, BOOK_TO_REVIEW)
    print(f"Total users with interactions: {len(user_book)}")

    evaluator = RankingEvaluator()
    train_user_book, val_user_book, test_user_book = split_user_interactions(user_book)
    ground_truth = build_ground_truth(test_user_book)
    test_users = list(ground_truth.keys())
    print(f"Users in test set: {len(test_users)}")

    print("\nBuilding rating matrix from training data...")
    train_user_book_rating = build_train_user_book_rating(
        train_user_book=train_user_book,
        user_to_review_path=USER_TO_REVIEW,
        book_to_review_path=BOOK_TO_REVIEW,
        review_path=REVIEW,
    )

    print("\nTraining Explicit MF (SVD)...")
    explicit_mf = ExplicitMFRecommender(train_user_book_rating)
    explicit_mf.fit()

    explicit_preds = generate_predictions(explicit_mf, test_users, K)
    explicit_metrics = evaluator.evaluate(explicit_preds, ground_truth)

    print("\n===== Explicit MF Results =====")
    for m, v in explicit_metrics.items():
        print(f"{m}: {v:.4f}")

main()

Total users with interactions: 63721
Users in test set: 15427

Building rating matrix from training data...

Training Explicit MF (SVD)...
Training Explicit MF (SVD)...


Generating Predictions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15427/15427 [01:58<00:00, 130.64it/s]



===== Explicit MF Results =====
Hit@5: 0.0029
Hit@10: 0.0052
Hit@50: 0.0052
MRR@10: 0.0017
NDCG@10: 0.0016
NDCG@50: 0.0016
