In [None]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import Linear, ReLU
from torch_geometric.nn import HGTConv
from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected, RandomLinkSplit
from torch_geometric.utils import negative_sampling
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

dataset = 'icews14'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class HGTEncoder(torch.nn.Module):
    def __init__(self, in_channels_dict, hidden_channels, out_channels, metadata, num_heads=2):
        super().__init__()
        self.conv1 = HGTConv(in_channels_dict, hidden_channels, metadata, heads=num_heads)
        self.conv2 = HGTConv({k: hidden_channels for k in in_channels_dict}, out_channels, metadata, heads=num_heads)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {k: F.relu(v) for k, v in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

In [3]:
class LinkPredictor(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim=64):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x_i, x_j):
        return torch.sigmoid(self.mlp(x_i * x_j)).view(-1)

In [4]:
@torch.no_grad()
def evaluate(model, predictor, data, edge_type):
    model.eval()
    predictor.eval()

    z_dict = model(data.x_dict, data.edge_index_dict)

    pos_edge_index = data[edge_type].edge_label_index
    # neg_edge_index = data[edge_type].neg_edge_label_index
    

    neg_edge_index = negative_sampling(
        edge_index=data[edge_type].edge_index,
        num_nodes=data[edge_type[0]].num_nodes,
        num_neg_samples=data[edge_type].edge_label_index.size(1),
        method='sparse'
    )

    src_pos, dst_pos = pos_edge_index
    src_neg, dst_neg = neg_edge_index

    pos_pred = predictor(z_dict[edge_type[0]][src_pos], z_dict[edge_type[2]][dst_pos])
    neg_pred = predictor(z_dict[edge_type[0]][src_neg], z_dict[edge_type[2]][dst_neg])

    pred = torch.cat([pos_pred, neg_pred]).cpu()
    label = torch.cat([torch.ones_like(pos_pred), torch.zeros_like(neg_pred)]).cpu()

    auc = roc_auc_score(label, pred)
    ap = average_precision_score(label, pred)
    return auc, ap


def train(model, predictor, data, optimizer, edge_type):
    model.train()
    predictor.train()
    optimizer.zero_grad()

    z_dict = model(data.x_dict, data.edge_index_dict)

    pos_edge_index = data[edge_type].edge_label_index
    neg_edge_index = negative_sampling(
        edge_index=pos_edge_index,
        num_nodes=data[edge_type[0]].num_nodes,
        num_neg_samples=pos_edge_index.size(1)
    )

    src_pos, dst_pos = pos_edge_index
    src_neg, dst_neg = neg_edge_index

    pos_pred = predictor(z_dict[edge_type[0]][src_pos], z_dict[edge_type[2]][dst_pos])
    neg_pred = predictor(z_dict[edge_type[0]][src_neg], z_dict[edge_type[2]][dst_neg])

    pred = torch.cat([pos_pred, neg_pred])
    label = torch.cat([torch.ones_like(pos_pred), torch.zeros_like(neg_pred)])

    loss = F.binary_cross_entropy(pred, label)
    loss.backward()
    optimizer.step()
    return loss.item()

In [5]:
# def run_example():
#     # Build a toy hetero graph
#     data = HeteroData()
#     data['user'].x = torch.randn(100, 32)
#     data['item'].x = torch.randn(200, 32)

#     edge_index = torch.randint(0, 100, (2, 500))  # 500 user-item links
#     data['user', 'rates', 'item'].edge_index = edge_index

#     # Make it undirected and split
#     transform = ToUndirected()  # optional, depends on the task
#     data = transform(data)
#     split = RandomLinkSplit(
#         edge_types=[('user', 'rates', 'item')],
#         rev_edge_types=[('item', 'rev_rates', 'user')],
#         add_negative_train_samples=True
#     )
#     train_data, val_data, test_data = split(data)

#     metadata = train_data.metadata()
#     in_channels_dict = {k: v.size(-1) for k, v in train_data.x_dict.items()}

#     # Model
#     encoder = HGTEncoder(in_channels_dict, hidden_channels=64, out_channels=64, metadata=metadata).to('cpu')
#     predictor = LinkPredictor(in_dim=64).to('cpu')
#     optimizer = torch.optim.Adam(list(encoder.parameters()) + list(predictor.parameters()), lr=0.005)

#     # Training loop
#     edge_type = ('user', 'rates', 'item')
#     for epoch in range(1, 51):
#         loss = train(encoder, predictor, train_data, optimizer, edge_type)
#         if epoch % 10 == 0:
#             val_auc, val_ap = evaluate(encoder, predictor, val_data, edge_type)
#             print(f"Epoch {epoch:03d}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}, AP: {val_ap:.4f}")

#     test_auc, test_ap = evaluate(encoder, predictor, test_data, edge_type)
#     print(f"Test AUC: {test_auc:.4f}, Test AP: {test_ap:.4f}")

# run_example()




In [6]:
user_artist = pd.read_csv(f'../../../../data/raw/{dataset}/1-indexed/actor_actor.csv', encoding='utf-8', names=['userID','artistID', 'weight'],)
user_friend = pd.read_csv(f'../../../../data/raw/{dataset}/1-indexed/actor_action.csv', encoding='utf-8', names=['userID', 'friendID'])
user_tag = pd.read_csv(f'../../../../data/raw/{dataset}/1-indexed/actor_sector.csv', encoding='utf-8', names=['artistID', 'tagID'])

# indices = np.arange(len(user_artist))
# train_idx, test_idx = train_test_split(indices, test_size=0.15, random_state=42)
# val_idx, test_idx = train_test_split(test_idx, test_size=0.5, random_state=42)
# train_data, val_data, test_data = create_data(user_artist, train_idx, val_idx, test_idx)

num_actor1 = user_artist['userID'].max()+1
num_actor2 = user_artist['artistID'].max()+2
num_action = user_friend['friendID'].max()+1
num_sector = user_tag['tagID'].max()+1

In [7]:
import random
def canonicalize_edges(edge_list):
    return list(tuple(sorted(e)) for e in edge_list)

def split_train_test_edges_hetero(data, remove_fraction):
    random.seed(42)

    train_data, val_data, test_data = data

    for edge_type in train_data.edge_types:
        if 'edge_label_index' not in train_data[edge_type]:
            continue  # Skip if no label index (not involved in link prediction)

        # Extract positive edges only
        train_pos_mask = train_data[edge_type].edge_label == 1
        train_pos_edges = train_data[edge_type].edge_label_index[:, train_pos_mask]
        original_edges = canonicalize_edges(train_pos_edges.t().tolist())

        # Get test positives
        test_pos_mask = test_data[edge_type].edge_label == 1
        test_edges = set(canonicalize_edges(test_data[edge_type].edge_label_index[:, test_pos_mask].t().tolist()))
        
        # Remove overlap with test
        intersection_indices = [i for i, edge in enumerate(original_edges) if edge in test_edges]
        num_to_remove = int(len(intersection_indices) * remove_fraction)
        remove_indices = set(random.sample(intersection_indices, num_to_remove))
        filtered_edges = [edge for i, edge in enumerate(original_edges) if i not in remove_indices]

        # Repeat for val positives
        val_pos_mask = val_data[edge_type].edge_label == 1
        val_edges = set(canonicalize_edges(val_data[edge_type].edge_label_index[:, val_pos_mask].t().tolist()))
        intersection_indices = [i for i, edge in enumerate(filtered_edges) if edge in val_edges]
        num_to_remove = int(len(intersection_indices) * remove_fraction)
        remove_indices = set(random.sample(intersection_indices, num_to_remove))
        final_filtered_edges = [edge for i, edge in enumerate(filtered_edges) if i not in remove_indices]

        # Update train_data for this edge type
        all_edges = train_data[edge_type].edge_label_index
        all_labels = train_data[edge_type].edge_label

        neg_mask = all_labels == 0
        neg_edges = all_edges[:, neg_mask]

        # Make sure we have enough negatives
        num_pos = len(final_filtered_edges)
        if neg_edges.size(1) >= num_pos:
            sampled_neg_indices = random.sample(range(neg_edges.size(1)), num_pos)
            sampled_neg_edges = neg_edges[:, sampled_neg_indices]
        else:
            raise ValueError(f"Not enough negative edges ({neg_edges.size(1)}) to match positives ({num_pos})")

        final_pos_tensor = torch.tensor(final_filtered_edges).t()
        train_data[edge_type].edge_label_index = torch.cat([final_pos_tensor, neg_edges], dim=1)
        train_data[edge_type].edge_label = torch.cat([
            torch.ones(final_pos_tensor.size(1), dtype=torch.long),
            torch.zeros(sampled_neg_edges.size(1), dtype=torch.long)
        ])

    return train_data, val_data, test_data

In [8]:
data = HeteroData()
data['actor1'].x = torch.randn(num_actor1, 32)
data['actor2'].x = torch.randn(num_actor2, 32)
data['action'].x = torch.randn(num_action, 32)
data['sector'].x = torch.randn(num_sector, 32)

edge_index = torch.tensor(user_artist[[user_artist.columns[0], user_artist.columns[1]]].values.T, dtype=torch.long)

data['actor1', 'interacts', 'actor2'].edge_index = torch.tensor(user_artist[[user_artist.columns[0], user_artist.columns[1]]].values.T, dtype=torch.long)
data['actor1', 'involved', 'action'].edge_index = torch.tensor(user_friend[[user_friend.columns[0], user_friend.columns[1]]].values.T, dtype=torch.long)
data['actor1', 'belongs', 'sector'].edge_index = torch.tensor(user_tag[[user_tag.columns[0], user_tag.columns[1]]].values.T, dtype=torch.long)

transform = ToUndirected()  # optional, depends on the task
data = transform(data)

# split = RandomLinkSplit(
#     edge_types=[('actor1', 'interacts', 'actor2'),('actor1', 'involved', 'action'),('actor1', 'belongs', 'sector')],
#     rev_edge_types=[('actor2', 'rev_interacts', 'actor1'),('action', 'rev_involved', 'actor1'),('sector', 'rev_belongs', 'actor1')],
#     add_negative_train_samples=True
# )

# train_data, val_data, test_data = split(data)

edge_types = [('actor1', 'interacts', 'actor2'),('actor1', 'involved', 'action'),('actor1', 'belongs', 'sector')]
rev_edge_types = [('actor2', 'rev_interacts', 'actor1'),('action', 'rev_involved', 'actor1'),('sector', 'rev_belongs', 'actor1')]
t,v,te = RandomLinkSplit(
        num_val=0.1,
        num_test=0.1,
        add_negative_train_samples=True,
        edge_types=edge_types,
        rev_edge_types=rev_edge_types
    )(data)
train_data, val_data, test_data = split_train_test_edges_hetero([t,v,te], 1.0)

In [9]:
(te[('actor1', 'involved', 'action')].edge_label == 0).nonzero().size()

torch.Size([6877, 1])

In [10]:
train_data

HeteroData(
  actor1={ x=[7077, 32] },
  actor2={ x=[7077, 32] },
  action={ x=[200, 32] },
  sector={ x=[2227, 32] },
  (actor1, interacts, actor2)={
    edge_index=[2, 55025],
    edge_label=[60266],
    edge_label_index=[2, 85158],
  },
  (actor1, involved, action)={
    edge_index=[2, 55025],
    edge_label=[42118],
    edge_label_index=[2, 76084],
  },
  (actor1, belongs, sector)={
    edge_index=[2, 55025],
    edge_label=[18140],
    edge_label_index=[2, 64095],
  },
  (actor2, rev_interacts, actor1)={ edge_index=[2, 55025] },
  (action, rev_involved, actor1)={ edge_index=[2, 55025] },
  (sector, rev_belongs, actor1)={ edge_index=[2, 55025] }
)

In [11]:
def edge_set(data, etype):
    return set(map(tuple, data[etype].edge_label_index.t().tolist()))

etype = ('actor1', 'interacts', 'actor2')
train_edges = edge_set(train_data, etype)
val_edges = edge_set(val_data, etype)
test_edges = edge_set(test_data, etype)

print("Train ∩ Val:", len(train_edges & val_edges))
print("Train ∩ Test:", len(train_edges & test_edges))
print("Val ∩ Test:", len(val_edges & test_edges))

Train ∩ Val: 1
Train ∩ Test: 4
Val ∩ Test: 1057


In [12]:
def train_test():
    metadata = train_data.metadata()
    in_channels_dict = {k: v.size(-1) for k, v in train_data.x_dict.items()}

    # Model
    encoder = HGTEncoder(in_channels_dict, hidden_channels=64, out_channels=64, metadata=metadata).to('cpu')
    predictor = LinkPredictor(in_dim=64).to('cpu')
    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(predictor.parameters()), lr=0.005)

    # Training loop
    edge_type = ('actor1', 'interacts', 'actor2')
    for epoch in range(1, 51):
        loss = train(encoder, predictor, train_data, optimizer, edge_type)
        if epoch % 10 == 0:
            val_auc, val_ap = evaluate(encoder, predictor, val_data, edge_type)
            print(f"Epoch {epoch:03d}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}, AP: {val_ap:.4f}")

    test_auc, test_ap = evaluate(encoder, predictor, test_data, edge_type)
    print(f"Test AUC: {test_auc:.4f}, Test AP: {test_ap:.4f}")

In [13]:
train_test()

Epoch 010, Loss: 0.6917, Val AUC: 0.5885, AP: 0.5991
Epoch 020, Loss: 0.6712, Val AUC: 0.6443, AP: 0.6932
Epoch 030, Loss: 0.6562, Val AUC: 0.6664, AP: 0.7196
Epoch 040, Loss: 0.6489, Val AUC: 0.6723, AP: 0.7260
Epoch 050, Loss: 0.6455, Val AUC: 0.6743, AP: 0.7289
Test AUC: 0.6746, Test AP: 0.7294
