### Testing Hard Negative sampling

In [1]:

import sys, os
sys.path.append(os.path.abspath(".."))

In [2]:
import torch
torch.__version__


'2.1.0+cpu'

In [3]:


import torch
import networkx as nx
import random

from utils import (
    PinSAGEHetero,
    train_pinsage_hetero,
    create_edge_loader,
    train_val_test_split,
    create_hetero_graph
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
hetero_data = create_hetero_graph()

In [5]:
print(hetero_data)

HeteroData(
  user={ x=[25826, 4] },
  problem={ x=[34572, 9] },
  hold={ x=[198, 198] },
  (user, rates, problem)={
    edge_index=[2, 1627580],
    edge_attr=[1627580, 3],
    edge_time=[1627580],
  },
  (problem, rev_rates, user)={
    edge_index=[2, 1627580],
    edge_attr=[1627580, 3],
    edge_time=[1627580],
  },
  (problem, contains, hold)={
    edge_index=[2, 304852],
    edge_attr=[304852, 3],
  },
  (hold, rev_contains, problem)={
    edge_index=[2, 304852],
    edge_attr=[304852, 3],
  }
)


In [6]:

message_data, train_data, val_data, test_data = train_val_test_split(
    hetero_data,
    ('user', 'rates', 'problem'),
    message_p=0.7,  # 70% of edges for message passing
    train_p=0.1,    # 10% for training supervision
    val_p=0.1,      # 10% for validation supervision
    by_user=True    # Split per user chronologically
)


In [7]:

edge_type = ('user', 'rates', 'problem')

loader = create_edge_loader(
    message_data=message_data,
    supervision_data=train_data,
    edge_type=edge_type,
    batch_size=8  # small batch for testing
)

In [8]:
batch = next(iter(loader))
print("Positive edges:\n", batch['pos_edge_index'].t())
print("Negative edges:\n", batch['neg_edge_index'].t())


Positive edges:
 tensor([[ 3021, 12549],
        [14377,   850],
        [ 1053,   927],
        [21600,   902],
        [ 1032,   901],
        [20076,   931],
        [18234,   760],
        [ 5130,   866]])
Negative edges:
 tensor([[ 3021, 32316],
        [14377, 27707],
        [ 1053, 31550],
        [21600, 26781],
        [ 1032,  7702],
        [20076, 18314],
        [18234, 32260],
        [ 5130, 15766],
        [ 3021,  5870],
        [14377,   904],
        [ 1053,   904],
        [21600,   853],
        [ 1032,   959],
        [20076,   870],
        [18234,   970],
        [ 5130,   807]])


### Check for duplicates:

In [9]:
edge_index = message_data['user', 'rates', 'problem'].edge_index
existing = set(zip(edge_index[0].tolist(), edge_index[1].tolist()))

neg_edges = [tuple(e.tolist()) for e in batch['neg_edge_index'].t()]
dupes = [e for e in neg_edges if e in existing]

print("Duplicate (invalid) negatives:", dupes)


Duplicate (invalid) negatives: []


### Check that negatives are actually close

In [10]:
# Build undirected NetworkX graph manually
from utils import hetero_to_undirected_nx
G = hetero_to_undirected_nx(message_data)

# Pick one user and its corresponding hard negative problem
user_id = batch['pos_edge_index'][0, 0].item()
neg_problem = batch['neg_edge_index'][1, 0].item()

# Personalized PageRank for this user
user_node = ('user', user_id)
pr = nx.pagerank(G, personalization={user_node: 1.0}, alpha=0.85)

# Compare PageRank score of hard negative vs random problem
neg_score = pr.get(('problem', neg_problem), 0)
rand_problem = random.randint(0, message_data['problem'].x.shape[0] - 1)
rand_score = pr.get(('problem', rand_problem), 0)

print(f"User {user_id}: hard negative PR={neg_score:.6f}, random problem PR={rand_score:.6f}")



User 3021: hard negative PR=0.000002, random problem PR=0.000001


## PinSage

In [4]:
edge_type = ('user', 'rates', 'problem')

# 1. Create heterogeneous graph
hetero_data = create_hetero_graph()

# 2. Split it using your helper
message_data, train_data, val_data, test_data = train_val_test_split(
    hetero_data,
    edge_type,
    message_p=0.7,
    train_p=0.1,
    val_p=0.1,
    by_user=True
)

print("message edges:", message_data[edge_type].edge_index.shape[1])
print("train edges:", train_data[edge_type].edge_index.shape[1])
print("val edges:", val_data[edge_type].edge_index.shape[1])
print("test edges:", test_data[edge_type].edge_index.shape[1])




message edges: 1151276
train edges: 160999
val edges: 163204
test edges: 152101


In [6]:
model = PinSAGEHetero(
    user_in=hetero_data['user'].x.size(1),
    problem_in=hetero_data['problem'].x.size(1),
    hidden_channels=128,
    out_channels=64,
    num_layers=2
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train_pinsage_hetero(
    model=model,
    message_data=message_data,
    train_data=train_data,
    val_data=val_data,
    edge_type=('user', 'rates', 'problem'),
    optimizer=optimizer,
    num_epochs=10,
    device='cpu',
    batch_size=1024,
    hn_increase_rate=2,  # Increase hard negatives every 2 epochs
    max_hn=3,            # Max 3 hard negatives per positive
    ppr_start=10,
    ppr_end=100
)


Computing hard negative candidates...
Found hard negatives for 25826 users
Starting training...
[PinSAGE-Hetero] Epoch 1/10, Loss: 0.1091, Hard negatives: 0
Recall@20: 0.2115
[PinSAGE-Hetero] Epoch 2/10, Loss: 0.0777, Hard negatives: 0
Recall@20: 0.2152
[PinSAGE-Hetero] Epoch 3/10, Loss: 0.2430, Hard negatives: 1
Recall@20: 0.2207
[PinSAGE-Hetero] Epoch 4/10, Loss: 0.2249, Hard negatives: 1
Recall@20: 0.2329
[PinSAGE-Hetero] Epoch 5/10, Loss: 0.2550, Hard negatives: 2
Recall@20: 0.2295
[PinSAGE-Hetero] Epoch 6/10, Loss: 0.2491, Hard negatives: 2
Recall@20: 0.2260
[PinSAGE-Hetero] Epoch 7/10, Loss: 0.2601, Hard negatives: 3
Recall@20: 0.2178
[PinSAGE-Hetero] Epoch 8/10, Loss: 0.2555, Hard negatives: 3
Recall@20: 0.2105
[PinSAGE-Hetero] Epoch 9/10, Loss: 0.2519, Hard negatives: 3
Recall@20: 0.2128
[PinSAGE-Hetero] Epoch 10/10, Loss: 0.2480, Hard negatives: 3
Recall@20: 0.2298
