### Testing Hard Negative sampling

In [1]:
import torch
from torch_geometric.data import HeteroData
from torch_geometric.utils import to_networkx
import networkx as nx
import random
from torch.utils.data import DataLoader, Dataset


  import torch_geometric.typing
  import torch_geometric.typing


In [2]:
# Build the full heterogeneous graph (user / problem / hold nodes) with the
# project helper; every later cell derives its data from `hetero_data`.
from graph_creation import create_hetero_graph

hetero_data = create_hetero_graph()


In [3]:
# Summarise the node and edge stores to sanity-check the sizes before splitting.
print(hetero_data)

HeteroData(
  user={ x=[25826, 5] },
  problem={ x=[34572, 9] },
  hold={ x=[198, 198] },
  (user, rates, problem)={
    edge_index=[2, 1627580],
    edge_attr=[1627580, 3],
    edge_time=[1627580],
  },
  (problem, rev_rates, user)={
    edge_index=[2, 1627580],
    edge_attr=[1627580, 3],
    edge_time=[1627580],
  },
  (problem, contains, hold)={
    edge_index=[2, 304852],
    edge_attr=[304852, 3],
  },
  (hold, rev_contains, problem)={
    edge_index=[2, 304852],
    edge_attr=[304852, 3],
  }
)


In [4]:
from training_utils import train_val_test_split

# Split the (user, rates, problem) edges into a message-passing set plus
# train / validation / test sets. With by_user=True each user's edges are
# split chronologically.
#   message_p=0.7 -> 70% of edges used for message passing
#   train_p=0.1   -> 10% used as training supervision
#   val_p=0.1     -> 10% used as validation supervision
# The remaining 10% presumably ends up in test_data -- TODO confirm in
# training_utils.
message_data, train_data, val_data, test_data = train_val_test_split(
    hetero_data, ('user', 'rates', 'problem'),
    message_p=0.7, train_p=0.1, val_p=0.1, by_user=True,
)


In [5]:
from training_utils import create_edge_loader

# Relation whose edges the loader yields positives and negatives for.
edge_type = ('user', 'rates', 'problem')

# Training supervision edges on top of the message-passing graph. The batch
# size is kept tiny so a whole batch can be inspected by eye in the next cell.
loader = create_edge_loader(
    message_data=message_data, supervision_data=train_data,
    edge_type=edge_type, batch_size=8,
)

In [6]:
# Pull a single batch and show its edges one (user, problem) pair per row.
batch = next(iter(loader))
print("Positive edges:", batch['pos_edge_index'].t(), sep="\n ")
print("Negative edges:", batch['neg_edge_index'].t(), sep="\n ")


Positive edges:
 tensor([[25168,   710],
        [14819,   809],
        [ 2692,   962],
        [24639,  1100],
        [23827,   951],
        [ 6029,   808],
        [ 6252,   891],
        [ 1422,   954]])
Negative edges:
 tensor([[25168,   820],
        [14819,   367],
        [ 2692,  1069],
        [24639,  1173],
        [23827,   668],
        [ 6029,     6],
        [ 6252,  1677],
        [ 1422,  1615]])


### Check that no sampled negative is an existing edge

In [7]:
# A sampled negative is invalid if the same (user, problem) pair already
# exists as a real edge in the message-passing graph.
# NOTE(review): only message_data is checked here; whether negatives can
# collide with supervision edges (e.g. in train_data) is not verified.
edge_index = message_data[edge_type].edge_index
src_nodes, dst_nodes = edge_index.tolist()
existing = set(zip(src_nodes, dst_nodes))

neg_edges = [tuple(e) for e in batch['neg_edge_index'].t().tolist()]
dupes = [e for e in neg_edges if e in existing]

print("Duplicate (invalid) negatives:", dupes)

# Fail on Restart & Run All instead of relying on someone reading the printout.
assert not dupes, f"{len(dupes)} sampled negative(s) already exist as edges: {dupes}"


Duplicate (invalid) negatives: []


### Check that hard negatives are close to their user (personalized PageRank)

In [9]:
# Convert the message-passing graph to an undirected NetworkX graph with the
# project helper. Nodes are addressed below as (node_type, index) tuples.
from training_utils import hetero_to_undirected_nx
G = hetero_to_undirected_nx(message_data)

# Read the user AND the problem from the same negative edge, so the score is
# measured for the user this negative was sampled for. This does not rely on
# pos_edge_index and neg_edge_index being aligned column by column.
user_id, neg_problem = batch['neg_edge_index'][:, 0].tolist()

# Personalized PageRank for this user: all restart mass sits on the user node,
# so a higher score means "closer to this user" in the graph.
user_node = ('user', user_id)
# pagerank cannot personalize on a node that is absent from the graph; check
# up front so the failure message names the actual problem.
assert user_node in G, f"{user_node} is not a node of the message-passing graph"
pr = nx.pagerank(G, personalization={user_node: 1.0}, alpha=0.85)

# Compare the hard negative's score with that of a uniformly drawn problem.
# A local, seeded generator keeps the baseline reproducible across runs
# without touching the global `random` state used elsewhere.
neg_score = pr.get(('problem', neg_problem), 0)
baseline_rng = random.Random(0)
rand_problem = baseline_rng.randrange(message_data['problem'].x.shape[0])
rand_score = pr.get(('problem', rand_problem), 0)

print(f"User {user_id}: hard negative PR={neg_score:.6f}, random problem PR={rand_score:.6f}")



User 25168: hard negative PR=0.000328, random problem PR=0.000001
