### Testing Hard Negative sampling

In [None]:
import torch
from torch_geometric.data import HeteroData
from torch_geometric.utils import to_networkx
import networkx as nx
import random
from torch.utils.data import DataLoader, Dataset 
from collections import Counter

from utils import (
    PinSAGEHetero,
    train_pinsage_hetero,
    train_val_test_split_homogeneous,
    create_edge_loader,
    train_val_test_split,
    create_hetero_graph
)

  import torch_geometric.typing
  import torch_geometric.typing


In [None]:


# Build the full heterogeneous graph with the project helper from utils.
# The summary printed in the next cell shows user / problem / hold node types.
hetero_data = create_hetero_graph()


In [3]:
# Show the graph summary: node feature shapes and, per edge type, the
# edge_index / edge_attr (and edge_time where present) tensor shapes.
print(hetero_data)

HeteroData(
  user={ x=[25826, 5] },
  problem={ x=[34572, 9] },
  hold={ x=[198, 198] },
  (user, rates, problem)={
    edge_index=[2, 1627580],
    edge_attr=[1627580, 3],
    edge_time=[1627580],
  },
  (problem, rev_rates, user)={
    edge_index=[2, 1627580],
    edge_attr=[1627580, 3],
    edge_time=[1627580],
  },
  (problem, contains, hold)={
    edge_index=[2, 304852],
    edge_attr=[304852, 3],
  },
  (hold, rev_contains, problem)={
    edge_index=[2, 304852],
    edge_attr=[304852, 3],
  }
)


In [4]:

# Partition the ('user', 'rates', 'problem') edges into a message-passing graph
# plus train / val / test supervision sets.
# NOTE(review): the three fractions sum to 0.9; the test split presumably
# receives the remainder — confirm in utils.train_val_test_split.
split_fractions = {
    'message_p': 0.7,  # 70% of edges for message passing
    'train_p': 0.1,    # 10% for training supervision
    'val_p': 0.1,      # 10% for validation supervision
}
message_data, train_data, val_data, test_data = train_val_test_split(
    hetero_data,
    ('user', 'rates', 'problem'),
    by_user=True,  # Split per user chronologically
    **split_fractions,
)


In [5]:

# Edge loader over the training supervision edges; the message-passing graph
# is handed over separately. Batch size is kept tiny so a batch can be
# inspected by eye.
edge_type = ('user', 'rates', 'problem')
loader = create_edge_loader(
    message_data=message_data, supervision_data=train_data,
    edge_type=edge_type, batch_size=8,
)

In [6]:
# Pull a single batch and show its positive and negative (user, problem) pairs,
# one pair per row.
batch = next(iter(loader))
for label, key in (("Positive", 'pos_edge_index'), ("Negative", 'neg_edge_index')):
    print(f"{label} edges:\n", batch[key].t())


Positive edges:
 tensor([[11745,   961],
        [16931,   870],
        [17848,   789],
        [24598,   828],
        [15622,   711],
        [20850,  1357],
        [22109,   970],
        [ 6252,   837]])
Negative edges:
 tensor([[11745,  1355],
        [16931,   689],
        [17848,   777],
        [24598,   800],
        [15622,   450],
        [20850,   963],
        [22109,  1135],
        [ 6252,   972]])


### Check for duplicates:

In [7]:
# A sampled negative is invalid if the same (user, problem) pair already
# exists as an edge in the message-passing graph.
edge_index = message_data['user', 'rates', 'problem'].edge_index
src_nodes, dst_nodes = edge_index.tolist()
existing = set(zip(src_nodes, dst_nodes))

# One (user, problem) tuple per sampled negative in the batch.
neg_edges = list(map(tuple, batch['neg_edge_index'].t().tolist()))
dupes = list(filter(existing.__contains__, neg_edges))

print("Duplicate (invalid) negatives:", dupes)


Duplicate (invalid) negatives: []


### Check that hard negatives are actually close to their user (personalized PageRank)

In [None]:
# Sanity check: a sampled hard negative should score higher under the user's
# personalized PageRank than an arbitrary problem does.

# Build undirected NetworkX graph manually
from utils import hetero_to_undirected_nx
G = hetero_to_undirected_nx(message_data)

# Read BOTH endpoints from the same negative edge, so the user and the hard
# negative problem are guaranteed to belong to one sampled pair instead of
# relying on pos_edge_index and neg_edge_index being index-aligned.
neg_edge = batch['neg_edge_index'][:, 0]
user_id = neg_edge[0].item()
neg_problem = neg_edge[1].item()

# Personalized PageRank for this user (all restart mass on the user node).
user_node = ('user', user_id)
pr = nx.pagerank(G, personalization={user_node: 1.0}, alpha=0.85)

# Compare PageRank score of hard negative vs random problem.
# A dedicated seeded generator keeps the baseline reproducible across
# Restart & Run All without disturbing the global `random` state.
neg_score = pr.get(('problem', neg_problem), 0)
baseline_rng = random.Random(0)
num_problems = message_data['problem'].x.shape[0]
rand_problem = baseline_rng.randint(0, num_problems - 1)
rand_score = pr.get(('problem', rand_problem), 0)

print(f"User {user_id}: hard negative PR={neg_score:.6f}, random problem PR={rand_score:.6f}")



User 5250: hard negative PR=0.000249, random problem PR=0.000000


## PinSAGE

In [2]:
# Recreate the graph and the split for the PinSAGE experiment, using the same
# split settings as the hard-negative section above.
edge_type = ('user', 'rates', 'problem')

# 1. Create heterogeneous graph
hetero_data = create_hetero_graph()

# 2. Split it using your helper
message_data, train_data, val_data, test_data = train_val_test_split(
    hetero_data, edge_type,
    message_p=0.7, train_p=0.1, val_p=0.1,
    by_user=True,
)

# Report how many edges of `edge_type` ended up in each split.
for split_name, split_graph in (
    ("message", message_data),
    ("train", train_data),
    ("val", val_data),
    ("test", test_data),
):
    print(f"{split_name} edges:", split_graph[edge_type].edge_index.shape[1])




message edges: 1151276
train edges: 160999
val edges: 163204
test edges: 152101


In [3]:
# Node features and edge indices of the message-passing graph, restricted to
# the user <-> problem relations (the 'hold' node type is left out).
x_dict = {
    node_type: message_data[node_type].x
    for node_type in ('user', 'problem')
}

edge_index_dict = {
    relation: message_data[relation].edge_index
    for relation in (
        ('user', 'rates', 'problem'),
        ('problem', 'rev_rates', 'user'),
    )
}


In [4]:
# Input widths are taken from the feature matrices of the full graph.
user_feat_dim = hetero_data['user'].x.size(1)
problem_feat_dim = hetero_data['problem'].x.size(1)

model = PinSAGEHetero(
    user_in=user_feat_dim, problem_in=problem_feat_dim,
    hidden_channels=128, out_channels=64, num_layers=2,
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Short CPU run: five epochs over the training supervision edges, with
# message_data passed as the message-passing graph.
train_pinsage_hetero(
    model,
    message_data=message_data, train_data=train_data,
    edge_type=edge_type, optimizer=optimizer,
    num_epochs=5, device='cpu',
)


[PinSAGE-Hetero] Epoch 1/5, Loss: 3.7418
[PinSAGE-Hetero] Epoch 2/5, Loss: 2.0725
[PinSAGE-Hetero] Epoch 3/5, Loss: 2.7586
[PinSAGE-Hetero] Epoch 4/5, Loss: 2.4530
[PinSAGE-Hetero] Epoch 5/5, Loss: 2.2085
