In [1]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import Linear, ReLU
from torch_geometric.nn import HGTConv
from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected, RandomLinkSplit
from torch_geometric.utils import negative_sampling
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

dataset = 'icews18'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class HGTEncoder(torch.nn.Module):
    def __init__(self, in_channels_dict, hidden_channels, out_channels, metadata, num_heads=2):
        super().__init__()
        self.conv1 = HGTConv(in_channels_dict, hidden_channels, metadata, heads=num_heads)
        self.conv2 = HGTConv({k: hidden_channels for k in in_channels_dict}, out_channels, metadata, heads=num_heads)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {k: F.relu(v) for k, v in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

In [3]:
class LinkPredictor(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim=64):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x_i, x_j):
        return torch.sigmoid(self.mlp(x_i * x_j)).view(-1)

In [4]:
@torch.no_grad()
def evaluate(model, predictor, data, edge_type):
    model.eval()
    predictor.eval()

    z_dict = model(data.x_dict, data.edge_index_dict)

    pos_edge_index = data[edge_type].edge_label_index
    # neg_edge_index = data[edge_type].neg_edge_label_index
    

    neg_edge_index = negative_sampling(
        edge_index=data[edge_type].edge_index,
        num_nodes=data[edge_type[0]].num_nodes,
        num_neg_samples=data[edge_type].edge_label_index.size(1),
        method='sparse'
    )

    src_pos, dst_pos = pos_edge_index
    src_neg, dst_neg = neg_edge_index

    pos_pred = predictor(z_dict[edge_type[0]][src_pos], z_dict[edge_type[2]][dst_pos])
    neg_pred = predictor(z_dict[edge_type[0]][src_neg], z_dict[edge_type[2]][dst_neg])

    pred = torch.cat([pos_pred, neg_pred]).cpu()
    label = torch.cat([torch.ones_like(pos_pred), torch.zeros_like(neg_pred)]).cpu()

    auc = roc_auc_score(label, pred)
    ap = average_precision_score(label, pred)
    return auc, ap


def train(model, predictor, data, optimizer, edge_type):
    model.train()
    predictor.train()
    optimizer.zero_grad()

    z_dict = model(data.x_dict, data.edge_index_dict)

    pos_edge_index = data[edge_type].edge_label_index
    neg_edge_index = negative_sampling(
        edge_index=pos_edge_index,
        num_nodes=data[edge_type[0]].num_nodes,
        num_neg_samples=pos_edge_index.size(1)
    )

    src_pos, dst_pos = pos_edge_index
    src_neg, dst_neg = neg_edge_index

    pos_pred = predictor(z_dict[edge_type[0]][src_pos], z_dict[edge_type[2]][dst_pos])
    neg_pred = predictor(z_dict[edge_type[0]][src_neg], z_dict[edge_type[2]][dst_neg])

    pred = torch.cat([pos_pred, neg_pred])
    label = torch.cat([torch.ones_like(pos_pred), torch.zeros_like(neg_pred)])

    loss = F.binary_cross_entropy(pred, label)
    loss.backward()
    optimizer.step()
    return loss.item()

In [5]:
# def run_example():
#     # Build a toy hetero graph
#     data = HeteroData()
#     data['user'].x = torch.randn(100, 32)
#     data['item'].x = torch.randn(200, 32)

#     edge_index = torch.randint(0, 100, (2, 500))  # 500 user-item links
#     data['user', 'rates', 'item'].edge_index = edge_index

#     # Make it undirected and split
#     transform = ToUndirected()  # optional, depends on the task
#     data = transform(data)
#     split = RandomLinkSplit(
#         edge_types=[('user', 'rates', 'item')],
#         rev_edge_types=[('item', 'rev_rates', 'user')],
#         add_negative_train_samples=True
#     )
#     train_data, val_data, test_data = split(data)

#     metadata = train_data.metadata()
#     in_channels_dict = {k: v.size(-1) for k, v in train_data.x_dict.items()}

#     # Model
#     encoder = HGTEncoder(in_channels_dict, hidden_channels=64, out_channels=64, metadata=metadata).to('cpu')
#     predictor = LinkPredictor(in_dim=64).to('cpu')
#     optimizer = torch.optim.Adam(list(encoder.parameters()) + list(predictor.parameters()), lr=0.005)

#     # Training loop
#     edge_type = ('user', 'rates', 'item')
#     for epoch in range(1, 51):
#         loss = train(encoder, predictor, train_data, optimizer, edge_type)
#         if epoch % 10 == 0:
#             val_auc, val_ap = evaluate(encoder, predictor, val_data, edge_type)
#             print(f"Epoch {epoch:03d}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}, AP: {val_ap:.4f}")

#     test_auc, test_ap = evaluate(encoder, predictor, test_data, edge_type)
#     print(f"Test AUC: {test_auc:.4f}, Test AP: {test_ap:.4f}")

# run_example()




In [6]:
user_artist = pd.read_csv(f'../../../../data/raw/{dataset}/1-indexed/actor_actor.csv', encoding='utf-8', names=['userID','artistID', 'weight'],)
user_friend = pd.read_csv(f'../../../../data/raw/{dataset}/1-indexed/actor_action.csv', encoding='utf-8', names=['userID', 'friendID'])
user_tag = pd.read_csv(f'../../../../data/raw/{dataset}/1-indexed/actor_sector.csv', encoding='utf-8', names=['artistID', 'tagID'])

# indices = np.arange(len(user_artist))
# train_idx, test_idx = train_test_split(indices, test_size=0.15, random_state=42)
# val_idx, test_idx = train_test_split(test_idx, test_size=0.5, random_state=42)
# train_data, val_data, test_data = create_data(user_artist, train_idx, val_idx, test_idx)

num_actor1 = user_artist['userID'].max()+1
num_actor2 = user_artist['artistID'].max()+2
num_action = user_friend['friendID'].max()+1
num_sector = user_tag['tagID'].max()+1

In [7]:
len(user_friend)

73510

In [8]:
data = HeteroData()
data['actor1'].x = torch.randn(num_actor1, 32)
data['actor2'].x = torch.randn(num_actor2, 32)
data['action'].x = torch.randn(num_action, 32)
data['sector'].x = torch.randn(num_sector, 32)

edge_index = torch.tensor(user_artist[[user_artist.columns[0], user_artist.columns[1]]].values.T, dtype=torch.long)

data['actor1', 'interacts', 'actor2'].edge_index = torch.tensor(user_artist[[user_artist.columns[0], user_artist.columns[1]]].values.T, dtype=torch.long)
data['actor1', 'involved', 'action'].edge_index = torch.tensor(user_friend[[user_friend.columns[0], user_friend.columns[1]]].values.T, dtype=torch.long)
data['actor1', 'belongs', 'sector'].edge_index = torch.tensor(user_tag[[user_tag.columns[0], user_tag.columns[1]]].values.T, dtype=torch.long)

transform = ToUndirected()  # optional, depends on the task
data = transform(data)

split = RandomLinkSplit(
    edge_types=[('actor1', 'interacts', 'actor2'),('actor1', 'involved', 'action'),('actor1', 'belongs', 'sector')],
    rev_edge_types=[('actor2', 'rev_interacts', 'actor1'),('action', 'rev_involved', 'actor1'),('sector', 'rev_belongs', 'actor1')],
    add_negative_train_samples=True
)

train_data, val_data, test_data = split(data)


In [9]:
def edge_set(data, etype):
    return set(map(tuple, data[etype].edge_label_index.t().tolist()))

etype = ('actor1', 'interacts', 'actor2')
train_edges = edge_set(train_data, etype)
val_edges = edge_set(val_data, etype)
test_edges = edge_set(test_data, etype)

print("Train ∩ Val:", len(train_edges & val_edges))
print("Train ∩ Test:", len(train_edges & test_edges))
print("Val ∩ Test:", len(val_edges & test_edges))

Train ∩ Val: 2848
Train ∩ Test: 4634
Val ∩ Test: 1704


In [10]:
def train_test():
    metadata = train_data.metadata()
    in_channels_dict = {k: v.size(-1) for k, v in train_data.x_dict.items()}

    # Model
    encoder = HGTEncoder(in_channels_dict, hidden_channels=64, out_channels=64, metadata=metadata).to('cpu')
    predictor = LinkPredictor(in_dim=64).to('cpu')
    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(predictor.parameters()), lr=0.005)

    # Training loop
    edge_type = ('actor1', 'interacts', 'actor2')
    for epoch in range(1, 51):
        loss = train(encoder, predictor, train_data, optimizer, edge_type)
        if epoch % 10 == 0:
            val_auc, val_ap = evaluate(encoder, predictor, val_data, edge_type)
            print(f"Epoch {epoch:03d}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}, AP: {val_ap:.4f}")

    test_auc, test_ap = evaluate(encoder, predictor, test_data, edge_type)
    print(f"Test AUC: {test_auc:.4f}, Test AP: {test_ap:.4f}")

In [11]:
train_test()

Epoch 010, Loss: 0.6691, Val AUC: 0.6216, AP: 0.6427
Epoch 020, Loss: 0.6104, Val AUC: 0.6842, AP: 0.7378
Epoch 030, Loss: 0.5806, Val AUC: 0.6962, AP: 0.7583
Epoch 040, Loss: 0.5684, Val AUC: 0.7049, AP: 0.7669
Epoch 050, Loss: 0.5617, Val AUC: 0.7040, AP: 0.7693
Test AUC: 0.7052, Test AP: 0.7695
