In [None]:
#pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

#pip install dgl -f https://data.dgl.ai/wheels/cu121/repo.html

In [None]:
# CELL 1: Mock DGL Graphbolt
# ==============================================================================
# Registers a MagicMock under the name 'dgl.graphbolt' in sys.modules, so a
# later import of that submodule resolves to the placeholder instead of loading
# the real one. This cell has to run before the `import dgl` in CELL 2.
# NOTE(review): presumably a workaround for graphbolt failing to load in this
# environment -- confirm. Any code that really uses graphbolt would silently
# receive mock objects.
import sys
from unittest.mock import MagicMock
sys.modules['dgl.graphbolt'] = MagicMock()

In [None]:
# ========== CELL 2: Imports & GPU Setup ==========
import os
import gc
import json
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F

# GPU Memory Management
def clear_gpu_memory():
    """Run the garbage collector and, when CUDA is available, flush the GPU cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    # Hand cached blocks back to the driver, then wait for outstanding GPU work.
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

def print_gpu_memory():
    """Print allocated, reserved and total GPU memory in GB; silent without CUDA.

    Allocated/reserved figures come from the current CUDA device, the total
    from device 0.
    """
    if not torch.cuda.is_available():
        return
    bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    total = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    print(f"GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved, {total:.2f}GB total")

# Set seeds for reproducibility
# Seeds Python's `random`, NumPy's legacy global RNG and torch (CPU, plus every
# CUDA device when available) with the same fixed value.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Device setup
# `device` is the notebook-wide target: "cuda" when available, otherwise "cpu".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print_gpu_memory()

# Import DGL after torch
# The 'dgl.graphbolt' entry was replaced by a mock in CELL 1, which must have
# run before this import.
import dgl
from dgl.nn import HeteroGraphConv, GraphConv
print(f"DGL version: {dgl.__version__}")

# Start from a clean GPU cache before any data or model is created.
clear_gpu_memory()

In [None]:
# ========== CELL 3: Configuration ==========
class Config:
    """Run configuration: data location, model size, training and split settings.

    The values were chosen for the full dataset on an RTX 4060 Laptop GPU
    (8GB VRAM). All attributes are class-level constants; `Config()` instances
    simply expose them.
    """

    # Root folder holding the dataset JSON files. Set the ROKOMARI_DATA_DIR
    # environment variable to point elsewhere without editing the notebook;
    # when it is unset or empty the original location is used.
    BASE_INPUT_PATH = os.environ.get('ROKOMARI_DATA_DIR') or 'E:/RokomariBG_Dataset'

    # === FULL DATA SETTINGS ===
    MAX_BOOKS = None  # Use ALL books
    MAX_AUTHORS = None  # Use ALL authors
    MAX_USERS = None  # Use ALL users
    MAX_CATEGORIES = None  # Use ALL categories
    MAX_PUBLISHERS = None  # Use ALL publishers

    # === MODEL SETTINGS ===
    EMBED_DIM = 64    # width of projected features and learnable embeddings
    HIDDEN_DIM = 128  # width between GNN layers
    OUT_DIM = 64      # width of the final node embeddings
    NUM_LAYERS = 2
    DROPOUT = 0.2

    # === TRAINING SETTINGS ===
    LEARNING_RATE = 0.001
    WEIGHT_DECAY = 1e-5
    EPOCHS = 50
    BATCH_SIZE = 512
    NEG_SAMPLES = 4

    # === EARLY STOPPING ===
    PATIENCE = 15
    MIN_DELTA = 0.0005

    # === SPLIT RATIOS ===
    TRAIN_RATIO = 0.70
    VAL_RATIO = 0.15
    TEST_RATIO = 0.15

    # === MEMORY MANAGEMENT ===
    EVAL_EVERY = 2
    CLEAR_CACHE_EVERY = 3

    # === OUTPUT ===
    OUTPUT_DIR = 'outputs'

config = Config()
os.makedirs(config.OUTPUT_DIR, exist_ok=True)

# Every dataset file lives at `<BASE_INPUT_PATH>/<key>.json`; the key doubles
# as the file stem. Entity tables are listed first, relation tables after.
paths = {
    stem: f'{config.BASE_INPUT_PATH}/{stem}.json'
    for stem in (
        'book',
        'author',
        'category',
        'publisher',
        'review',
        'book_to_author',
        'book_to_publisher',
        'book_to_category',
        'author_to_category',
        'author_to_publisher',
        'publisher_to_category',
        'user_to_review',
        'book_to_review',
    )
}

print("[OK] Configuration loaded")
print(f"Data path: {config.BASE_INPUT_PATH}")
print(f"Output directory: {config.OUTPUT_DIR}")

In [None]:
# ========== CELL 4: Data Loading Functions ==========
def load_json_file(filepath):
    """Read ``filepath`` as UTF-8 JSON and return the parsed value.

    Any exception while opening or parsing is reported with a printed message
    and turned into a ``None`` return value.
    """
    parsed = None
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except Exception as err:
        print(f"Error loading {filepath}: {err}")
    return parsed

print("[OK] Data loading functions defined")

In [None]:
# ========== CELL 5: Data Loader Class (NO LEAKAGE - WITH REVIEWS) ==========
class HGNNDataLoader:
    """Data loader - graph built ONLY from training edges (NO LEAKAGE)"""
    
    def __init__(self, paths, config):
        self.paths = paths
        self.config = config
        self.entity_maps = {}
        self.reverse_maps = {}
        self.entity_data = {}
        self.edge_data = {}
    
    def load_entity_metadata(self):
        """Load entity metadata"""
        print("\n" + "=" * 60)
        print("LOADING ENTITY METADATA")
        print("=" * 60)
        
        # Load books
        print("\n1. Loading books...")
        self.entity_data['book'] = load_json_file(self.paths['book']) or []
        print(f" [OK] {len(self.entity_data['book']):,} books loaded (may contain duplicates)")
        
        # Load authors
        print("\n2. Loading authors...")
        self.entity_data['author'] = load_json_file(self.paths['author']) or []
        print(f" [OK] {len(self.entity_data['author']):,} authors loaded")
        
        # Load categories
        print("\n3. Loading categories...")
        self.entity_data['category'] = load_json_file(self.paths['category']) or []
        print(f" [OK] {len(self.entity_data['category']):,} categories loaded")
        
        # Load publishers
        print("\n4. Loading publishers...")
        self.entity_data['publisher'] = load_json_file(self.paths['publisher']) or []
        print(f" [OK] {len(self.entity_data['publisher']):,} publishers loaded")
        
        # Load reviews
        print("\n5. Loading reviews...")
        self.entity_data['review'] = load_json_file(self.paths['review']) or []
        print(f" [OK] {len(self.entity_data['review']):,} reviews loaded")
        
        # Create mappings
        self._create_entity_mappings()
    
    def _create_entity_mappings(self):
        """Create ID mappings - DEDUPLICATES properly"""
        
        def create_mapping(data_list, id_key):
            """Generic mapping with deduplication"""
            seen_ids = set()
            unique_items = []
            for item in data_list:
                entity_id = str(item.get(id_key, ''))
                if entity_id and entity_id not in seen_ids:
                    seen_ids.add(entity_id)
                    unique_items.append(item)
            
            entity_map = {}
            reverse_map = {}
            for idx, item in enumerate(unique_items):
                entity_id = str(item.get(id_key, ''))
                entity_map[entity_id] = idx
                reverse_map[idx] = entity_id
            
            return entity_map, reverse_map, unique_items
        
        print("\nDeduplicating and creating entity mappings...")
        
        # Update entity_data with deduplicated items
        self.entity_maps['book'], self.reverse_maps['book'], self.entity_data['book'] = create_mapping(
            self.entity_data['book'], 'book_id')
        self.entity_maps['author'], self.reverse_maps['author'], self.entity_data['author'] = create_mapping(
            self.entity_data['author'], 'author_id')
        self.entity_maps['category'], self.reverse_maps['category'], self.entity_data['category'] = create_mapping(
            self.entity_data['category'], 'category_id')
        self.entity_maps['publisher'], self.reverse_maps['publisher'], self.entity_data['publisher'] = create_mapping(
            self.entity_data['publisher'], 'publisher_id')
        self.entity_maps['review'], self.reverse_maps['review'], self.entity_data['review'] = create_mapping(
            self.entity_data['review'], 'review_id')
        
        # For users: extract from user_to_review
        print("Extracting user IDs...")
        user_to_review = load_json_file(self.paths['user_to_review']) or []
        unique_users = sorted(set(str(item.get('user_id', '')) for item in user_to_review if item.get('user_id')))
        self.entity_maps['user'] = {uid: idx for idx, uid in enumerate(unique_users)}
        self.reverse_maps['user'] = {idx: uid for uid, idx in self.entity_maps['user'].items()}
        
        print(f"\n‚úÖ Deduplicated Entity Counts:")
        print(f"   Users:      {len(self.entity_maps['user']):,}")
        print(f"   Books:      {len(self.entity_maps['book']):,} (unique)")
        print(f"   Authors:    {len(self.entity_maps['author']):,}")
        print(f"   Categories: {len(self.entity_maps['category']):,}")
        print(f"   Publishers: {len(self.entity_maps['publisher']):,}")
        print(f"   Reviews:    {len(self.entity_maps['review']):,}")
    
    def load_edge_data(self):
        """Load ALL edge data (user-book interactions extracted)"""
        print("\n" + "=" * 60)
        print("LOADING EDGE DATA (FULL DATASET)")
        print("=" * 60)
        
        # User-Review-Book interactions
        print("\n1. Building User-Book interactions (via reviews)...")
        user_to_review = load_json_file(self.paths['user_to_review']) or []
        book_to_review = load_json_file(self.paths['book_to_review']) or []
        
        # Create review-to-book mapping
        print("   Creating review-to-book mapping...")
        review_to_book = {}
        for item in book_to_review:
            review_id = str(item.get('review_id', ''))
            book_id = str(item.get('book_id', ''))
            if review_id and book_id and book_id in self.entity_maps['book']:
                review_to_book[review_id] = book_id
        print(f"   [OK] {len(review_to_book):,} review-to-book mappings")
        
        # Create review-to-rating mapping
        print("   Creating review-to-rating mapping...")
        review_to_rating = {}
        for review in self.entity_data['review']:
            review_id = str(review.get('review_id', ''))
            rating = review.get('user_rating', None)
            if rating is not None:
                try:
                    rating = float(rating)
                except (ValueError, TypeError):
                    rating = 5.0
            else:
                rating = 5.0
            review_to_rating[review_id] = rating
        print(f"   [OK] {len(review_to_rating):,} review-to-rating mappings")
        
        # Build DEDUPLICATED user-book pairs
        print("   Building user-book pairs (deduplicating)...")
        user_book_set = set()
        user_book_list = []
        
        for item in tqdm(user_to_review, desc="   Processing", leave=False):
            user_id = str(item.get('user_id', ''))
            review_id = str(item.get('review_id', ''))
            
            if user_id in self.entity_maps['user'] and review_id in review_to_book:
                book_id = review_to_book[review_id]
                user_idx = self.entity_maps['user'][user_id]
                book_idx = self.entity_maps['book'][book_id]
                
                # Only add if (user, book) pair is unique
                pair = (user_idx, book_idx)
                if pair not in user_book_set:
                    user_book_set.add(pair)
                    rating = review_to_rating.get(review_id, 5.0)
                    user_book_list.append((user_idx, book_idx, rating))
        
        print(f" ‚úÖ {len(user_book_list):,} UNIQUE user-book interactions (NO FILTERING)")
        
        # Convert to edge data format
        self.edge_data['user_book'] = {
            'user_indices': np.array([p[0] for p in user_book_list]),
            'book_indices': np.array([p[1] for p in user_book_list]),
            'ratings': np.array([p[2] for p in user_book_list])
        }
        
        # Helper for loading bidirectional metadata edges
        def load_bidir_edges(filepath, src_key, dst_key, src_type, dst_type, edge_name):
            data = load_json_file(filepath) or []
            src_list, dst_list = [], []
            for item in data:
                src_id = str(item.get(src_key, ''))
                dst_id = str(item.get(dst_key, ''))
                if src_id in self.entity_maps[src_type] and dst_id in self.entity_maps[dst_type]:
                    src_list.append(self.entity_maps[src_type][src_id])
                    dst_list.append(self.entity_maps[dst_type][dst_id])
            print(f" [OK] {len(src_list):,} {edge_name} edges")
            return np.array(src_list), np.array(dst_list)
        
        # Book-Author
        print("\n2. Loading book-author edges...")
        b_src, a_dst = load_bidir_edges(
            self.paths['book_to_author'], 'book_id', 'author_id',
            'book', 'author', 'book-author'
        )
        self.edge_data['book_author'] = (b_src, a_dst)
        self.edge_data['author_book'] = (a_dst, b_src)
        
        # Book-Publisher
        print("\n3. Loading book-publisher edges...")
        b_src, p_dst = load_bidir_edges(
            self.paths['book_to_publisher'], 'book_id', 'publisher_id',
            'book', 'publisher', 'book-publisher'
        )
        self.edge_data['book_publisher'] = (b_src, p_dst)
        self.edge_data['publisher_book'] = (p_dst, b_src)
        
        # Book-Category
        print("\n4. Loading book-category edges...")
        b_src, c_dst = load_bidir_edges(
            self.paths['book_to_category'], 'book_id', 'category_id',
            'book', 'category', 'book-category'
        )
        self.edge_data['book_category'] = (b_src, c_dst)
        self.edge_data['category_book'] = (c_dst, b_src)
        
        # Author-Category
        print("\n5. Loading author-category edges...")
        a_src, c_dst = load_bidir_edges(
            self.paths['author_to_category'], 'author_id', 'category_id',
            'author', 'category', 'author-category'
        )
        self.edge_data['author_category'] = (a_src, c_dst)
        self.edge_data['category_author'] = (c_dst, a_src)
        
        # Author-Publisher
        print("\n6. Loading author-publisher edges...")
        a_src, p_dst = load_bidir_edges(
            self.paths['author_to_publisher'], 'author_id', 'publisher_id',
            'author', 'publisher', 'author-publisher'
        )
        self.edge_data['author_publisher'] = (a_src, p_dst)
        self.edge_data['publisher_author'] = (p_dst, a_src)
        
        # Publisher-Category
        print("\n7. Loading publisher-category edges...")
        p_src, c_dst = load_bidir_edges(
            self.paths['publisher_to_category'], 'publisher_id', 'category_id',
            'publisher', 'category', 'publisher-category'
        )
        self.edge_data['publisher_category'] = (p_src, c_dst)
        self.edge_data['category_publisher'] = (c_dst, p_src)
        
        # User-Review and Review-Book edges for graph
        print("\n8. Loading user-review edges...")
        user_src, review_dst = [], []
        for item in user_to_review:
            user_id = str(item.get('user_id', ''))
            review_id = str(item.get('review_id', ''))
            if user_id in self.entity_maps['user'] and review_id in self.entity_maps['review']:
                user_src.append(self.entity_maps['user'][user_id])
                review_dst.append(self.entity_maps['review'][review_id])
        self.edge_data['user_review'] = (np.array(user_src), np.array(review_dst))
        self.edge_data['review_user'] = (np.array(review_dst), np.array(user_src))
        print(f" [OK] {len(user_src):,} user-review edges")
        
        print("\n9. Loading review-book edges...")
        review_src, book_dst = [], []
        for item in book_to_review:
            review_id = str(item.get('review_id', ''))
            book_id = str(item.get('book_id', ''))
            if review_id in self.entity_maps['review'] and book_id in self.entity_maps['book']:
                review_src.append(self.entity_maps['review'][review_id])
                book_dst.append(self.entity_maps['book'][book_id])
        self.edge_data['review_book'] = (np.array(review_src), np.array(book_dst))
        self.edge_data['book_review'] = (np.array(book_dst), np.array(review_src))
        print(f" [OK] {len(review_src):,} review-book edges")
        
        return self.edge_data
    
    def build_heterograph(self, train_user_reviews, train_review_books):
        """Build graph using ONLY training edges (NO LEAKAGE)"""
        print("\n" + "=" * 60)
        print("BUILDING HETEROGENEOUS GRAPH (TRAINING EDGES ONLY)")
        print("=" * 60)
        
        graph_data = {}
        
        # USER-REVIEW edges: TRAINING ONLY (NO LEAKAGE)
        print("\nüìä User-Review edges: TRAINING ONLY (no val/test)")
        graph_data[('user', 'writes', 'review')] = (
            train_user_reviews['user'],
            train_user_reviews['review']
        )
        graph_data[('review', 'written_by', 'user')] = (
            train_user_reviews['review'],
            train_user_reviews['user']
        )
        print(f"   ‚úì {len(train_user_reviews['user']):,} training user-review edges")
        
        # REVIEW-BOOK edges: TRAINING ONLY (NO LEAKAGE)
        print("\nüìä Review-Book edges: TRAINING ONLY (no val/test)")
        graph_data[('review', 'reviews', 'book')] = (
            train_review_books['review'],
            train_review_books['book']
        )
        graph_data[('book', 'reviewed_by', 'review')] = (
            train_review_books['book'],
            train_review_books['review']
        )
        print(f"   ‚úì {len(train_review_books['review']):,} training review-book edges")
        
        # All other metadata edges (no leakage concern)
        print("\nüìö Metadata edges (all):")
        
        # Book-Author
        graph_data[('book', 'written_by', 'author')] = self.edge_data['book_author']
        graph_data[('author', 'writes', 'book')] = self.edge_data['author_book']
        print(f"   ‚úì Book-Author: {len(self.edge_data['book_author'][0]):,} edges")
        
        # Book-Publisher
        graph_data[('book', 'published_by', 'publisher')] = self.edge_data['book_publisher']
        graph_data[('publisher', 'publishes', 'book')] = self.edge_data['publisher_book']
        print(f"   ‚úì Book-Publisher: {len(self.edge_data['book_publisher'][0]):,} edges")
        
        # Book-Category
        graph_data[('book', 'belongs_to', 'category')] = self.edge_data['book_category']
        graph_data[('category', 'contains', 'book')] = self.edge_data['category_book']
        print(f"   ‚úì Book-Category: {len(self.edge_data['book_category'][0]):,} edges")
        
        # Author-Category
        graph_data[('author', 'writes_in', 'category')] = self.edge_data['author_category']
        graph_data[('category', 'has_author', 'author')] = self.edge_data['category_author']
        print(f"   ‚úì Author-Category: {len(self.edge_data['author_category'][0]):,} edges")
        
        # Author-Publisher
        graph_data[('author', 'published_with', 'publisher')] = self.edge_data['author_publisher']
        graph_data[('publisher', 'publishes_author', 'author')] = self.edge_data['publisher_author']
        print(f"   ‚úì Author-Publisher: {len(self.edge_data['author_publisher'][0]):,} edges")
        
        # Publisher-Category
        graph_data[('publisher', 'publishes_in', 'category')] = self.edge_data['publisher_category']
        graph_data[('category', 'has_publisher', 'publisher')] = self.edge_data['category_publisher']
        print(f"   ‚úì Publisher-Category: {len(self.edge_data['publisher_category'][0]):,} edges")
        
        # Create graph
        g = dgl.heterograph(graph_data)
        
        print("\n" + "=" * 60)
        print("GRAPH STRUCTURE")
        print("=" * 60)
        print(f"Node types: {g.ntypes}")
        print(f"Edge types: {len(g.canonical_etypes)}")
        print("\nNode counts:")
        for ntype in g.ntypes:
            print(f"  {ntype}: {g.num_nodes(ntype):,}")
        
        return g
    
    def extract_node_features(self, g):
        """Extract node features with safe handling"""
        print("\n" + "=" * 60)
        print("EXTRACTING NODE FEATURES")
        print("=" * 60)
        
        features = {}
        
        def safe_float(value, default=0.0):
            """Safely convert to float"""
            try:
                return max(0.0, float(value)) if value is not None else default
            except (ValueError, TypeError):
                return default
        
        # Book features (4 dims)
        print("\n1. Book features (4 dims: price, rating, review_count, pages)...")
        book_feats = []
        for i in range(g.num_nodes('book')):
            book_id = self.reverse_maps['book'].get(i, '')
            book = next((b for b in self.entity_data['book']
                        if str(b.get('book_id', '')) == book_id), None)
            
            if book:
                price = safe_float(book.get('book_price', 0))
                rating = safe_float(book.get('average_rating', 0))
                review_count = safe_float(book.get('review_count', 0))
                pages = safe_float(book.get('book_pages', 0))
                
                feats = [
                    np.log1p(price) / 10.0,
                    rating / 5.0,
                    np.log1p(review_count) / 10.0,
                    np.log1p(pages) / 10.0
                ]
            else:
                feats = [0.0, 0.0, 0.0, 0.0]
            
            book_feats.append(feats)
        
        features['book'] = torch.tensor(book_feats, dtype=torch.float32)
        print(f"   ‚úÖ {features['book'].shape}")
        
        # Author features (1 dim)
        print("\n2. Author features (1 dim: follower_count)...")
        author_feats = []
        for i in range(g.num_nodes('author')):
            author_id = self.reverse_maps['author'].get(i, '')
            author = next((a for a in self.entity_data['author']
                          if str(a.get('author_id', '')) == author_id), None)
            
            if author:
                follower_count = safe_float(author.get('follower_count', 0))
                feats = [np.log1p(follower_count) / 10.0]
            else:
                feats = [0.0]
            
            author_feats.append(feats)
        
        features['author'] = torch.tensor(author_feats, dtype=torch.float32)
        print(f"   ‚úÖ {features['author'].shape}")
        
        # Category features (1 dim)
        print("\n3. Category features (1 dim: books_count)...")
        cat_feats = []
        for i in range(g.num_nodes('category')):
            cat_id = self.reverse_maps['category'].get(i, '')
            cat = next((c for c in self.entity_data['category']
                       if str(c.get('category_id', '')) == cat_id), None)
            
            if cat:
                books_count = safe_float(cat.get('books_count', 0))
                feats = [np.log1p(books_count) / 10.0]
            else:
                feats = [0.0]
            
            cat_feats.append(feats)
        
        features['category'] = torch.tensor(cat_feats, dtype=torch.float32)
        print(f"   ‚úÖ {features['category'].shape}")
        
        # Publisher features (2 dims)
        print("\n4. Publisher features (2 dims: authors, categories)...")
        pub_feats = []
        for i in range(g.num_nodes('publisher')):
            pub_id = self.reverse_maps['publisher'].get(i, '')
            pub = next((p for p in self.entity_data['publisher']
                       if str(p.get('publisher_id', '')) == pub_id), None)
            
            if pub:
                authors_count = safe_float(pub.get('total_authors', 0))
                cats_count = safe_float(pub.get('total_categories', 0))
                feats = [np.log1p(authors_count) / 10.0, np.log1p(cats_count) / 10.0]
            else:
                feats = [0.0, 0.0]
            
            pub_feats.append(feats)
        
        features['publisher'] = torch.tensor(pub_feats, dtype=torch.float32)
        print(f"   ‚úÖ {features['publisher'].shape}")
        
        # Review features (3 dims)
        print("\n5. Review features (3 dims: rating, pos_vote, neg_vote)...")
        review_feats = []
        for i in range(g.num_nodes('review')):
            review_id = self.reverse_maps['review'].get(i, '')
            review = next((r for r in self.entity_data['review']
                          if str(r.get('review_id', '')) == review_id), None)
            
            if review:
                rating = safe_float(review.get('user_rating', 0))
                pos_vote = safe_float(review.get('positive_vote', 0))
                neg_vote = safe_float(review.get('negative_vote', 0))
                
                feats = [
                    rating / 5.0,
                    np.log1p(pos_vote) / 10.0,
                    np.log1p(neg_vote) / 10.0
                ]
            else:
                feats = [0.0, 0.0, 0.0]
            
            review_feats.append(feats)
        
        features['review'] = torch.tensor(review_feats, dtype=torch.float32)
        print(f"   ‚úÖ {features['review'].shape}")
        
        # User features (1 dim placeholder)
        print("\n6. User features (1 dim: placeholder)...")
        features['user'] = torch.zeros(g.num_nodes('user'), 1, dtype=torch.float32)
        print(f"   ‚úÖ {features['user'].shape}")
        
        print("\n" + "=" * 60)
        
        return features

print("[OK] HGNNDataLoader defined (NO LEAKAGE version with reviews)")

In [None]:
# ========== CELL 6: Model Classes ==========
class NodeFeatureProjection(nn.Module):
    """Per-node-type input block: Linear -> ReLU -> Dropout(0.1) into ``embed_dim``."""

    def __init__(self, feature_dims, embed_dim):
        """``feature_dims`` maps node type to raw feature width."""
        super().__init__()
        self.projections = nn.ModuleDict()
        for ntype in feature_dims:
            block = nn.Sequential(
                nn.Linear(feature_dims[ntype], embed_dim),
                nn.ReLU(),
                nn.Dropout(0.1),
            )
            self.projections[ntype] = block

    def forward(self, features):
        """Apply each node type's block to its feature tensor; returns a dict."""
        projected = {}
        for ntype, raw in features.items():
            projected[ntype] = self.projections[ntype](raw)
        return projected

class HeteroRGCNLayer(nn.Module):
    """One heterogeneous graph-conv layer followed by ReLU and dropout.

    A separate ``GraphConv`` is created per entry of ``rel_names`` and the
    per-relation results are combined by ``HeteroGraphConv`` with mean
    aggregation.
    """

    def __init__(self, in_dim, out_dim, rel_names, dropout=0.0):
        super().__init__()
        per_relation = {}
        for rel in rel_names:
            per_relation[rel] = GraphConv(
                in_dim, out_dim, norm='both', allow_zero_in_degree=True
            )
        self.conv = HeteroGraphConv(per_relation, aggregate='mean')
        self.dropout = nn.Dropout(dropout)

    def forward(self, g, h):
        """Run the convolution on ``g`` with input dict ``h``; None outputs pass through."""
        activated = {}
        for ntype, value in self.conv(g, h).items():
            if value is None:
                activated[ntype] = value
            else:
                activated[ntype] = self.dropout(F.relu(value))
        return activated

class LightweightHGNN(nn.Module):
    """Encoder: projected features + learnable node embeddings, then stacked graph layers.

    Per node type, projected input features and a learnable per-node embedding
    are concatenated, mapped to ``hidden_dim`` and passed through
    ``num_layers`` HeteroRGCNLayer blocks; the last block outputs ``out_dim``.
    """

    def __init__(self, g, feature_dims, embed_dim=64, hidden_dim=128,
                 out_dim=64, num_layers=2, dropout=0.2):
        super().__init__()
        node_types = g.ntypes
        self.ntypes = node_types

        # Raw features -> embed_dim
        self.feature_proj = NodeFeatureProjection(feature_dims, embed_dim)

        # One trainable vector per node of every type
        self.embeddings = nn.ModuleDict()
        for ntype in node_types:
            self.embeddings[ntype] = nn.Embedding(g.num_nodes(ntype), embed_dim)

        # [projected features | embedding] -> hidden_dim
        self.combine = nn.ModuleDict()
        for ntype in node_types:
            self.combine[ntype] = nn.Linear(embed_dim * 2, hidden_dim)

        # Graph layers; only the final one changes the width to out_dim
        self.layers = nn.ModuleList()
        for depth in range(num_layers):
            is_last = depth == num_layers - 1
            width = out_dim if is_last else hidden_dim
            self.layers.append(
                HeteroRGCNLayer(hidden_dim, width, g.canonical_etypes, dropout)
            )

    def forward(self, g, features):
        """Return a dict of node embeddings produced by the final graph layer."""
        projected = self.feature_proj(features)

        h = {}
        for ntype in self.ntypes:
            # Look up the embedding of every node of this type, in index order.
            node_ids = torch.arange(g.num_nodes(ntype), device=features[ntype].device)
            fused = torch.cat([projected[ntype], self.embeddings[ntype](node_ids)], dim=-1)
            h[ntype] = F.relu(self.combine[ntype](fused))

        for layer in self.layers:
            h = layer(g, h)

        return h

class NegativeSampler:
    """Uniform negative sampler that avoids each user's training positives."""

    def __init__(self, num_books, train_data):
        """
        Args:
            num_books: negatives are drawn from ``[0, num_books)``.
            train_data: mapping with parallel ``'user'`` and ``'book'``
                index sequences; these pairs are the positives to avoid.
        """
        self.num_books = num_books
        self.user_pos = defaultdict(set)
        for u, b in zip(train_data['user'], train_data['book']):
            self.user_pos[int(u)].add(int(b))

    def sample(self, user_batch, num_neg):
        """Draw ``num_neg`` book indices per user by rejection sampling.

        Args:
            user_batch: 1-D tensor of user indices.
            num_neg: number of negatives per user.

        Returns:
            Long tensor of shape ``(len(user_batch), num_neg)``, also when the
            batch is empty. Negatives may repeat within a row.

        If a user's positive set is at least as large as the catalogue no
        valid negative exists; such a user gets plain uniform samples instead
        of looping forever.
        """
        no_positives = frozenset()
        neg_samples = []
        for u in user_batch.cpu().numpy():
            # .get avoids inserting an empty set for every unseen user.
            pos_items = self.user_pos.get(int(u), no_positives)
            can_exclude = len(pos_items) < self.num_books
            negs = []
            while len(negs) < num_neg:
                neg = np.random.randint(0, self.num_books)
                if not can_exclude or neg not in pos_items:
                    negs.append(neg)
            neg_samples.append(negs)
        # reshape keeps the 2-D shape for an empty batch.
        return torch.tensor(neg_samples, dtype=torch.long).reshape(len(neg_samples), num_neg)

class BookRecommendationModel(nn.Module):
    """Recommendation model: HGNN encoder plus dot-product user-book scoring."""

    def __init__(self, g, feature_dims, config):
        """Build the encoder from the dimensions and dropout stored on ``config``."""
        super().__init__()
        encoder_kwargs = dict(
            embed_dim=config.EMBED_DIM,
            hidden_dim=config.HIDDEN_DIM,
            out_dim=config.OUT_DIM,
            num_layers=config.NUM_LAYERS,
            dropout=config.DROPOUT,
        )
        self.encoder = LightweightHGNN(g, feature_dims, **encoder_kwargs)

    def forward(self, g, features, user_ids, pos_book_ids, neg_book_ids):
        """Score each user against one positive and several negative books.

        ``pos_score`` has one column per user; ``neg_score`` has one column
        per index in the corresponding row of ``neg_book_ids``.
        """
        node_emb = self.encoder(g, features)
        books = node_emb['book']

        # Add a middle axis so the user vector broadcasts over candidates.
        users = node_emb['user'][user_ids].unsqueeze(1)
        positives = books[pos_book_ids].unsqueeze(1)
        negatives = books[neg_book_ids]

        pos_score = (users * positives).sum(dim=-1)
        neg_score = (users * negatives).sum(dim=-1)
        return pos_score, neg_score

    def get_all_embeddings(self, g, features):
        """Return the encoder output for every node type."""
        return self.encoder(g, features)

print("[OK] Model classes defined")

In [None]:
# ========== CELL 7: Training Utilities (NO FILTERING - ALL USERS) ==========
def split_edges_user_based(edge_data, train_ratio=0.70, val_ratio=0.15, test_ratio=0.15,
                           min_interactions=3):
    """Split interactions into train/val/test by user.

    Users with fewer than ``min_interactions`` interactions are dropped. The
    remaining users are shuffled with Python's global ``random`` state and
    partitioned, so every user's interactions land in exactly one split.

    Args:
        edge_data: mapping with parallel ``'user_indices'``,
            ``'book_indices'`` and ``'ratings'`` sequences.
        train_ratio: fraction of eligible users assigned to train.
        val_ratio: fraction of eligible users assigned to validation.
        test_ratio: accepted for symmetry but not used; test receives every
            user left after the train and validation slices.
        min_interactions: minimum interactions a user needs to be kept.

    Returns:
        dict with ``'train'``, ``'val'`` and ``'test'`` entries, each a dict of
        ``'user'``, ``'book'`` (int64) and ``'rating'`` arrays.

    Raises:
        ValueError: if no user meets ``min_interactions``.
    """
    print("\n" + "=" * 60)
    print(f"SPLITTING DATA BY USER (MIN {min_interactions} INTERACTIONS PER USER)")
    print("=" * 60)

    # Group by users
    user_items = defaultdict(list)
    for user_idx, book_idx, rating in zip(edge_data['user_indices'],
                                          edge_data['book_indices'],
                                          edge_data['ratings']):
        user_items[user_idx].append((book_idx, rating))

    # Keep only users with enough interactions
    valid_users = [u for u, items in user_items.items() if len(items) >= min_interactions]
    print(f"\nüìä Eligible users: {len(valid_users):,} of {len(user_items):,} "
          f"(>={min_interactions} interactions)")

    if len(valid_users) == 0:
        raise ValueError(f"No users with at least {min_interactions} interactions found!")

    # Shuffle and split users
    random.shuffle(valid_users)
    n_train = int(len(valid_users) * train_ratio)
    n_val = int(len(valid_users) * val_ratio)

    train_users = valid_users[:n_train]
    val_users = valid_users[n_train:n_train + n_val]
    test_users = valid_users[n_train + n_val:]

    print(f"\nüéØ User Split:")
    print(f"   Train users: {len(train_users):,} ({len(train_users)/len(valid_users)*100:.1f}%)")
    print(f"   Val users:   {len(val_users):,} ({len(val_users)/len(valid_users)*100:.1f}%)")
    print(f"   Test users:  {len(test_users):,} ({len(test_users)/len(valid_users)*100:.1f}%)")

    # Create splits
    def create_split(users):
        user_list, book_list, rating_list = [], [], []
        for u in users:
            for b, r in user_items[u]:
                user_list.append(u)
                book_list.append(b)
                rating_list.append(r)
        return {
            # Explicit dtypes so an empty split is still a valid index array.
            'user': np.array(user_list, dtype=np.int64),
            'book': np.array(book_list, dtype=np.int64),
            'rating': np.array(rating_list, dtype=np.float64)
        }

    splits = {
        'train': create_split(train_users),
        'val': create_split(val_users),
        'test': create_split(test_users)
    }

    print("\nüìà Interaction Counts:")
    print(f"   Train: {len(splits['train']['user']):,}")
    print(f"   Val:   {len(splits['val']['user']):,}")
    print(f"   Test:  {len(splits['test']['user']):,}")

    return splits

def get_per_user_items(splits):
    """Group the interactions of each split by user.

    Args:
        splits: Mapping with 'train', 'val' and 'test' entries, each holding
            parallel 'user', 'book' and 'rating' sequences.

    Returns:
        Tuple ``(train, val, test)`` of plain dicts. Each maps an ``int`` user
        index to a list of ``(int book index, float rating)`` pairs, kept in
        the order in which they appear in the split.
    """
    def group_by_user(part):
        grouped = {}
        triples = zip(part['user'], part['book'], part['rating'])
        for user, book, rating in triples:
            # Cast to builtin types so keys/values are not NumPy scalars.
            grouped.setdefault(int(user), []).append((int(book), float(rating)))
        return grouped

    return tuple(group_by_user(splits[name]) for name in ('train', 'val', 'test'))

def create_train_graph_edges(splits, loader):
    """Build user->review and review->book edge lists limited to training interactions.

    A user->review edge is kept only when the review resolves to a book and the
    (user index, book index) pair occurs in ``splits['train']``. A review->book
    edge is kept only when its review survived that first filter.

    Args:
        splits: Split dict; ``splits['train']['user']`` and ``['book']`` are
            parallel sequences of user and book indices.
        loader: Object exposing ``paths`` (with 'user_to_review' and
            'book_to_review' entries) and ``entity_maps`` (with 'user',
            'review' and 'book' id->index mappings).

    Returns:
        ``{'user_review': {'user', 'review'}, 'review_book': {'review', 'book'}}``
        where every leaf is a NumPy array of node indices.
    """
    # (user_idx, book_idx) pairs that belong to the training split
    train_pairs = set(zip(splits['train']['user'], splits['train']['book']))

    # A missing/empty file is treated as "no edges"
    user_review_records = load_json_file(loader.paths['user_to_review']) or []
    review_book_records = load_json_file(loader.paths['book_to_review']) or []

    # One pass over the review->book records: remember the book index per raw
    # review id and keep every resolvable (review_idx, book_idx) pair, in file
    # order, for the filter applied after the user pass.
    book_of_review = {}
    review_book_pairs = []
    for record in review_book_records:
        raw_review = str(record.get('review_id', ''))
        raw_book = str(record.get('book_id', ''))
        if raw_review not in loader.entity_maps['review']:
            continue
        if raw_book not in loader.entity_maps['book']:
            continue
        resolved_book = loader.entity_maps['book'][raw_book]
        book_of_review[raw_review] = resolved_book
        review_book_pairs.append((loader.entity_maps['review'][raw_review], resolved_book))

    # User->review edges whose (user, book) pair is a training interaction
    edge_users, edge_reviews = [], []
    kept_reviews = set()
    for record in user_review_records:
        raw_user = str(record.get('user_id', ''))
        raw_review = str(record.get('review_id', ''))
        if raw_user not in loader.entity_maps['user']:
            continue
        if raw_review not in loader.entity_maps['review']:
            continue

        user_idx = loader.entity_maps['user'][raw_user]
        review_idx = loader.entity_maps['review'][raw_review]
        book_idx = book_of_review.get(raw_review)
        if book_idx is None or (user_idx, book_idx) not in train_pairs:
            continue

        edge_users.append(user_idx)
        edge_reviews.append(review_idx)
        kept_reviews.add(review_idx)

    # Review->book edges only for reviews that made it into the training graph
    kept_pairs = [pair for pair in review_book_pairs if pair[0] in kept_reviews]

    user_review = {
        'user': np.array(edge_users),
        'review': np.array(edge_reviews),
    }
    review_book = {
        'review': np.array([review for review, _ in kept_pairs]),
        'book': np.array([book for _, book in kept_pairs]),
    }
    return {'user_review': user_review, 'review_book': review_book}

print("[OK] Training utilities defined (NO filtering, ALL users)")

In [None]:
# ========== CELL 8: Evaluation Function (with NDCG@50) ==========
@torch.no_grad()
def evaluate_metrics(model, g, features, user_items, user_known, device, ks=(5, 10, 50), batch_size=256):
    """Compute Hit@K, MRR and NDCG@K for a set of users by scoring every book.

    Scores are the dot product between each user embedding and all book
    embeddings returned by ``model.get_all_embeddings``. Books listed in
    ``user_known`` are masked out and can never appear in the ranked list.

    Args:
        model: Object providing ``eval()`` and ``get_all_embeddings(g, features)``;
            the latter must return a dict with 'user' and 'book' tensors.
        g: Graph, forwarded untouched to ``get_all_embeddings``.
        features: Node features, forwarded untouched to ``get_all_embeddings``.
        user_items: Dict ``user index -> iterable of (book index, rating)``;
            the book indices are the ground truth for that user.
        user_known: Dict ``user index -> collection of book indices`` to
            exclude from ranking. Users missing from it have nothing masked.
        device: Device on which the batch index tensor is created.
        ks: Cut-offs for Hit@K and NDCG@K. MRR is computed over the top
            ``max(ks)`` ranked books.
        batch_size: Number of users scored per matrix multiplication.

    Returns:
        ``{'hit_rate': {k: mean}, 'mrr': mean, 'ndcg': {k: mean}}``. Every mean
        is 0.0 when ``user_items`` is empty.
    """
    model.eval()
    h = model.get_all_embeddings(g, features)

    ks = list(ks)
    users = list(user_items.keys())
    book_embs = h['book']
    # torch.topk raises if k exceeds the number of candidates, so clamp it.
    top_n = min(max(ks), book_embs.shape[0])
    neg_inf = float('-inf')

    hit_rates = {k: [] for k in ks}
    mrrs = []
    ndcgs = {k: [] for k in ks}

    for start in range(0, len(users), batch_size):
        batch_users = users[start:start + batch_size]
        batch_indices = torch.tensor(batch_users, device=device)
        scores = h['user'][batch_indices] @ book_embs.T

        # Mask known items with one indexed write per user instead of one per item.
        for j, u in enumerate(batch_users):
            known = user_known.get(u, set())
            if known:
                known_idx = torch.tensor(list(known), dtype=torch.long, device=scores.device)
                scores[j, known_idx] = neg_inf

        top_scores, top_indices = torch.topk(scores, k=top_n, dim=-1)
        # Single host transfer per batch rather than one per user.
        top_indices = top_indices.cpu().tolist()
        is_masked = (top_scores == neg_inf).cpu().tolist()

        for j, u in enumerate(batch_users):
            gt_items = set(b for b, _ in user_items[u])
            # When fewer than top_n unmasked books exist, topk pads the tail with
            # masked (-inf) entries; drop them so known items never count as hits.
            ranked = [item for item, masked in zip(top_indices[j], is_masked[j]) if not masked]
            # 1-based ranks at which a ground-truth book appears, ascending.
            hit_ranks = [rank for rank, item in enumerate(ranked, 1) if item in gt_items]

            # MRR: reciprocal rank of the first hit, 0 when there is none.
            mrrs.append(1.0 / hit_ranks[0] if hit_ranks else 0.0)

            for k in ks:
                # Hit@K
                hit_rates[k].append(1.0 if any(rank <= k for rank in hit_ranks) else 0.0)

                # NDCG@K with binary relevance
                dcg = sum(1.0 / np.log2(rank + 1) for rank in hit_ranks if rank <= k)
                idcg = sum(1.0 / np.log2(pos + 1) for pos in range(1, min(len(gt_items), k) + 1))
                ndcgs[k].append(dcg / max(idcg, 1e-8))

    def _mean(values):
        # np.mean([]) is NaN and emits a RuntimeWarning; report 0.0 instead.
        return np.mean(values) if values else 0.0

    metrics = {
        'hit_rate': {k: _mean(v) for k, v in hit_rates.items()},
        'mrr': _mean(mrrs),
        'ndcg': {k: _mean(v) for k, v in ndcgs.items()}
    }

    return metrics

print("[OK] Evaluation function defined (with NDCG@50)")

In [None]:
# ========== CELL 9: Training Loop (with NDCG@50) ==========
def train(model, g, features, splits, user_train, user_val, user_test, config, device):
    """Train ``model`` with a pairwise ranking loss and restore the best validation weights.

    Each epoch shuffles the training (user, book) pairs, draws negatives with
    ``NegativeSampler`` and minimises ``-log(sigmoid(pos - neg))``. Every
    ``config.EVAL_EVERY`` epochs the model is scored with ``evaluate_metrics``
    on ``user_val``; the weights with the highest validation MRR are
    snapshotted and reloaded before returning.

    Args:
        model: Module called as ``model(g, features, users, pos_books, neg_books)``
            and returning ``(pos_score, neg_score)``.
        g: Graph; moved to ``device`` via ``g.to(device)``.
        features: Dict of tensors; each one is moved to ``device``.
        splits: Split dict; ``splits['train']['user']`` and ``['book']`` must be
            NumPy arrays (they are wrapped with ``torch.from_numpy``).
        user_train: Dict ``user -> [(book, rating), ...]`` for training; used to
            mask already-known books during validation.
        user_val: Dict ``user -> [(book, rating), ...]`` used as validation
            ground truth.
        user_test: Not used by this function; kept for interface compatibility.
        config: Object providing LEARNING_RATE, WEIGHT_DECAY, EPOCHS, PATIENCE,
            BATCH_SIZE, NEG_SAMPLES, EVAL_EVERY, MIN_DELTA and CLEAR_CACHE_EVERY.
        device: Torch device used for the graph, features and batches.

    Returns:
        Tuple ``(history, best_epoch, g, features, model)`` where ``g`` and
        ``features`` are the device-resident copies and ``history`` holds the
        per-epoch loss plus the per-evaluation validation metrics.
    """

    # Move to device
    g = g.to(device)
    features = {k: v.to(device) for k, v in features.items()}

    print("\n" + "=" * 80)
    print("TRAINING")
    print("=" * 80)
    print_gpu_memory()

    neg_sampler = NegativeSampler(g.num_nodes('book'), splits['train'])
    optimizer = torch.optim.Adam(model.parameters(), lr=config.LEARNING_RATE,
                                  weight_decay=config.WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=5
    )

    best_mrr = 0
    best_epoch = 0
    patience = 0  # counts evaluations (not epochs) without improvement
    best_state = None
    history = {
        'loss': [],
        'val_hit_rate_5': [],
        'val_hit_rate_10': [],
        'val_hit_rate_50': [],
        'val_mrr': [],
        'val_ndcg_10': [],
        'val_ndcg_50': [],  # Added NDCG@50
        'epochs': []
    }

    # For val: known = train
    user_known_val = {u: set(b for b, _ in user_train.get(u, [])) for u in user_val}

    # The training pairs never change, so upload them once instead of every epoch.
    all_user_ids = torch.from_numpy(splits['train']['user']).to(device)
    all_book_ids = torch.from_numpy(splits['train']['book']).to(device)

    print(f"\nüéØ Training {config.EPOCHS} epochs with early stopping (patience={config.PATIENCE})")
    print(f"üìä Best model selection: MRR (Mean Reciprocal Rank)")
    print("=" * 80 + "\n")

    for epoch in range(config.EPOCHS):
        # Train
        model.train()
        perm = torch.randperm(len(all_user_ids))
        user_ids, book_ids = all_user_ids[perm], all_book_ids[perm]

        total_loss = 0
        n_batches = 0
        for i in range(0, len(user_ids), config.BATCH_SIZE):
            batch_user = user_ids[i:i+config.BATCH_SIZE]
            batch_book = book_ids[i:i+config.BATCH_SIZE]
            neg_books = neg_sampler.sample(batch_user.cpu(), config.NEG_SAMPLES).to(device)

            pos_score, neg_score = model(g, features, batch_user, batch_book, neg_books)
            loss = -torch.log(torch.sigmoid(pos_score - neg_score) + 1e-8).mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        # Guard against an empty training split (no batches were run).
        avg_loss = total_loss / max(n_batches, 1)
        history['loss'].append(avg_loss)

        # Evaluate
        if (epoch + 1) % config.EVAL_EVERY == 0:
            val_metrics = evaluate_metrics(model, g, features, user_val, user_known_val, device)

            history['val_hit_rate_5'].append(val_metrics['hit_rate'][5])
            history['val_hit_rate_10'].append(val_metrics['hit_rate'][10])
            history['val_hit_rate_50'].append(val_metrics['hit_rate'][50])
            history['val_mrr'].append(val_metrics['mrr'])
            history['val_ndcg_10'].append(val_metrics['ndcg'][10])
            history['val_ndcg_50'].append(val_metrics['ndcg'][50])  # Track NDCG@50
            history['epochs'].append(epoch + 1)

            print(f"Epoch {epoch+1:3d} | Loss: {avg_loss:.4f} | "
                  f"Hit@5: {val_metrics['hit_rate'][5]:.4f} | "
                  f"Hit@10: {val_metrics['hit_rate'][10]:.4f} | "
                  f"MRR: {val_metrics['mrr']:.4f} | "
                  f"NDCG@10: {val_metrics['ndcg'][10]:.4f} | "
                  f"NDCG@50: {val_metrics['ndcg'][50]:.4f}")

            # Model selection
            if val_metrics['mrr'] > best_mrr + config.MIN_DELTA:
                best_mrr = val_metrics['mrr']
                best_epoch = epoch + 1
                patience = 0
                # state_dict() values alias the live parameters and dict.copy() is
                # shallow, so each tensor must be cloned or the "best" snapshot
                # would silently follow later optimizer steps. Keeping the copy
                # on the CPU also avoids holding a second set of weights in VRAM.
                best_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                }
                print(f"  ‚≠ê New best MRR: {best_mrr:.4f}")
            else:
                patience += 1
                if patience >= config.PATIENCE:
                    print(f"\n‚èπÔ∏è  Early stopping at epoch {epoch+1} (best: {best_epoch})")
                    break

            scheduler.step(val_metrics['mrr'])
        else:
            print(f"Epoch {epoch+1:3d} | Loss: {avg_loss:.4f}")

        # Memory
        if (epoch + 1) % config.CLEAR_CACHE_EVERY == 0:
            clear_gpu_memory()

    # Load best (load_state_dict copies the CPU snapshot back onto the model's device)
    if best_state is not None:
        model.load_state_dict(best_state)
        print(f"\n‚úÖ Loaded best model from epoch {best_epoch}")

    return history, best_epoch, g, features, model

print("[OK] Training loop defined (with NDCG@50)")

In [None]:
# ========== CELL 10: Visualization (with NDCG@50) ==========
def plot_training_history(history, config):
    """Draw a 2x3 grid of training curves, save it as a PNG and show it.

    The first panel is the per-epoch training loss; the other five are
    validation metrics plotted against ``history['epochs']``.

    Args:
        history: Dict with 'loss', 'epochs', 'val_hit_rate_5', 'val_hit_rate_10',
            'val_mrr', 'val_ndcg_10' and 'val_ndcg_50' sequences.
        config: Object whose ``OUTPUT_DIR`` is the directory the figure is
            written to (as ``training_history.png``).
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle('HGNN Training History', fontsize=16, fontweight='bold')

    def decorate(ax, ylabel, title):
        # Shared axis styling for every panel
        ax.set_xlabel('Epoch', fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3)

    # Loss is recorded every epoch, so it is plotted against its own index.
    loss_ax = axes[0, 0]
    loss_ax.plot(history['loss'], linewidth=2, color='#e74c3c')
    decorate(loss_ax, 'Training Loss', 'Training Loss')

    # (grid position, history key, y-label, title, colour) for each validation panel
    validation_panels = [
        ((0, 1), 'val_hit_rate_5', 'Hit@5', 'Validation Hit@5', '#3498db'),
        ((0, 2), 'val_hit_rate_10', 'Hit@10', 'Validation Hit@10', '#2ecc71'),
        ((1, 0), 'val_mrr', 'MRR', 'Validation MRR ‚≠ê', '#9b59b6'),
        ((1, 1), 'val_ndcg_10', 'NDCG@10', 'Validation NDCG@10', '#1abc9c'),
        ((1, 2), 'val_ndcg_50', 'NDCG@50', 'Validation NDCG@50', '#f39c12'),
    ]
    for (row, col), key, ylabel, title, colour in validation_panels:
        panel = axes[row, col]
        panel.plot(history['epochs'], history[key], linewidth=2, marker='o', color=colour)
        decorate(panel, ylabel, title)

    plt.tight_layout()
    plt.savefig(f'{config.OUTPUT_DIR}/training_history.png', dpi=300, bbox_inches='tight')
    plt.show()
    print(f"\n‚úÖ Plot saved: {config.OUTPUT_DIR}/training_history.png")

print("[OK] Visualization function defined (with NDCG@50)")

In [None]:
# ========== CELL 11: Test Evaluation ==========
@torch.no_grad()
def test_evaluation(model, g, features, user_test, user_train, user_val, device, loader):
    """Score the model on the test users and print the resulting metrics.

    For every test user, the books found under that user in ``user_train`` and
    ``user_val`` are treated as already known and excluded from ranking.

    Args:
        model, g, features, device: Forwarded to ``evaluate_metrics``.
        user_test: Dict ``user -> [(book, rating), ...]`` used as ground truth.
        user_train: Dict ``user -> [(book, rating), ...]`` of training items.
        user_val: Dict ``user -> [(book, rating), ...]`` of validation items.
        loader: Not used by this function.

    Returns:
        The metrics dict produced by ``evaluate_metrics``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TEST SET EVALUATION")
    print(rule)

    def seen_books(user):
        # Union of the user's training and validation books
        books = {book for book, _ in user_train.get(user, [])}
        books |= {book for book, _ in user_val.get(user, [])}
        return books

    user_known_test = {user: seen_books(user) for user in user_test}

    test_metrics = evaluate_metrics(model, g, features, user_test, user_known_test, device)

    print("\nüìä Test Set Results:")
    print(f"  Hit@5:    {test_metrics['hit_rate'][5]:.4f}")
    print(f"  Hit@10:   {test_metrics['hit_rate'][10]:.4f}")
    print(f"  Hit@50:   {test_metrics['hit_rate'][50]:.4f}")
    print(f"  MRR:      {test_metrics['mrr']:.4f}")
    print(f"  NDCG@10:  {test_metrics['ndcg'][10]:.4f}")
    print(f"  NDCG@50:  {test_metrics['ndcg'][50]:.4f}")

    return test_metrics

print("[OK] Test evaluation function defined")

In [None]:
# ========== CELL 12: Main Execution (NO LEAKAGE) ==========
def main():
    """Run the whole pipeline: load, split, build the graph, train, plot, test.

    Reads the module-level names ``paths``, ``config`` and ``device``. The
    interactions are split before the graph is built, and the graph is
    constructed from the edges returned by ``create_train_graph_edges``.

    Returns:
        Tuple ``(model, g, features, loader, splits, history, test_metrics)``.
    """
    rule = "=" * 80

    def banner(*lines):
        # Blank line, rule, the given lines, rule
        print("\n" + rule)
        for line in lines:
            print(line)
        print(rule)

    banner(
        "HETEROGENEOUS GNN BOOK RECOMMENDATION SYSTEM",
        "NO DATA LEAKAGE - Graph built from TRAINING edges only",
        "ALL USERS INCLUDED - No filtering",
    )

    # Load entity metadata and raw edges
    loader = HGNNDataLoader(paths, config)
    loader.load_entity_metadata()
    edge_data = loader.load_edge_data()

    # Step 1: split the interactions before any graph exists
    banner("STEP 1: SPLIT DATA (before graph construction)")
    splits = split_edges_user_based(
        edge_data['user_book'],
        config.TRAIN_RATIO,
        config.VAL_RATIO,
        config.TEST_RATIO,
    )
    user_train, user_val, user_test = get_per_user_items(splits)

    # Step 2: keep only the edges that belong to training interactions
    banner("STEP 2: CREATE TRAINING-ONLY EDGES (NO LEAKAGE)")
    train_edges = create_train_graph_edges(splits, loader)
    print(f"\n‚úì Training user-review edges: {len(train_edges['user_review']['user']):,}")
    print(f"‚úì Training review-book edges: {len(train_edges['review_book']['review']):,}")

    # Step 3: build the heterograph from those edges
    banner("STEP 3: BUILD GRAPH (training edges only - NO LEAKAGE)")
    g = loader.build_heterograph(train_edges['user_review'], train_edges['review_book'])

    # Node features and their per-type width
    features = loader.extract_node_features(g)
    feature_dims = {ntype: feats.shape[1] for ntype, feats in features.items()}

    # Model
    model = BookRecommendationModel(g, feature_dims, config).to(device)
    total_params = sum(param.numel() for param in model.parameters())
    print(f"\nüìä Model: {total_params:,} parameters")

    # Train; the graph and features come back as the device-resident copies
    history, best_epoch, g, features, model = train(
        model, g, features, splits, user_train, user_val, user_test, config, device
    )

    # Plot
    plot_training_history(history, config)

    # Test
    test_metrics = test_evaluation(model, g, features, user_test, user_train, user_val, device, loader)

    # Completion
    banner("‚úÖ TRAINING COMPLETE!")

    return model, g, features, loader, splits, history, test_metrics

# Run main
# Executes the full pipeline when this code runs as the top-level module and
# binds main()'s return values as module-level names for later inspection.
if __name__ == "__main__":
    model, g, features, loader, splits, history, test_metrics = main()

# This status line is printed unconditionally; when the guard above is true it
# appears only after main() has already finished running.
print("\n‚úÖ All cells defined! Run this cell to execute the complete pipeline.")