In [1]:
"""
Implements the link prediction task on the FB15k237 dataset according to the
`"Modeling Relational Data with Graph Convolutional Networks"
<https://arxiv.org/abs/1703.06103>`_ paper.

Caution: This script is executed in a full-batch fashion, and therefore needs
to run on CPU (following the experimental setup in the official paper).

Note: this notebook adapts that example to the ICEWS14 data and, unlike the
setup described above, selects a CUDA device whenever one is available.
"""
import os.path as osp
import time

import torch
import torch.nn.functional as F
from torch.nn import Parameter
from tqdm import tqdm

from torch_geometric.datasets import RelLinkPredDataset
from torch_geometric.nn import GAE, RGCNConv

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch_geometric.utils import negative_sampling
from torch_geometric.data import Data
from sklearn.metrics import roc_auc_score, average_precision_score
from utils.reorganized_preprocessing import get_edges_and_indices


# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): osp.dirname('data/RLPD') evaluates to 'data', so the dataset
# root passed below is 'data', not 'data/RLPD' -- confirm this is intended.
path = osp.join(osp.dirname('data/RLPD'))
dataset = RelLinkPredDataset(path, 'FB15k-237')
# Reference graph; `data` is rebuilt from the ICEWS CSVs further down.
data = dataset[0].to(device)

# From here on `dataset` no longer refers to the RelLinkPredDataset object: it
# is rebound to the raw-data folder name that the CSV-loading cell interpolates
# into its file paths.
dataset = 'icews14'


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the summary of the FB15k-237 graph loaded in the setup cell.
data

Data(edge_index=[2, 544230], num_nodes=14541, edge_type=[544230], train_edge_index=[2, 272115], train_edge_type=[272115], valid_edge_index=[2, 17535], valid_edge_type=[17535], test_edge_index=[2, 20466], test_edge_type=[20466])

Transform the ICEWS14 data so that it mirrors the structure of PyG's `RelLinkPredDataset`.

In [3]:
# Read the three raw, 1-indexed edge lists of the selected dataset. The CSV
# files carry no header row, so column names are supplied explicitly.
# NOTE(review): the user/artist/friend/tag column names do not match the
# actor_* files they label -- presumably carried over from another dataset.
user_artist = pd.read_csv(
    f'../../../../data/raw/{dataset}/1-indexed/actor_actor.csv',
    encoding='utf-8',
    names=['userID', 'artistID', 'weight'],
)
user_friend = pd.read_csv(
    f'../../../../data/raw/{dataset}/1-indexed/actor_action.csv',
    encoding='utf-8',
    names=['userID', 'friendID'],
)
artist_tag = pd.read_csv(
    f'../../../../data/raw/{dataset}/1-indexed/actor_sector.csv',
    encoding='utf-8',
    names=['artistID', 'tagID'],
)

Build a single heterogeneous graph: shift the ID columns so that the different node types do not share IDs.

In [4]:
# Keep only the two endpoint columns (the edge weight is dropped). The explicit
# .copy() gives an independent frame, so the in-place shifts below do not
# trigger pandas' SettingWithCopyWarning.
user_artist = user_artist[['userID', 'artistID']].copy()

# Shift each ID column past the largest ID assigned so far, so that every
# column group ends up in its own range of one global ID space.
# NOTE: these updates are in place, so re-running this cell without reloading
# the CSVs shifts the IDs a second time.
user_id_max = user_artist['userID'].max()
user_artist['artistID'] += user_id_max

# Uses the already shifted artistID column, exactly as the statement order implies.
user_friend['friendID'] += user_artist['artistID'].max()

artist_tag['artistID'] += user_id_max
artist_tag['tagID'] += user_friend['friendID'].max()

In [5]:
def df_to_edge_tensors(df, rel_id):
    """Turn an edge-list DataFrame into an ``(edge_index, edge_type)`` pair.

    Args:
        df: DataFrame whose rows are edges; its values are transposed so that
            each column of the result is one edge.
        rel_id: Integer relation id assigned to every edge of ``df``.

    Returns:
        Tuple ``(edge_index, edge_type)`` of ``torch.long`` tensors:
        ``edge_index`` is the transposed frame (``[2, num_edges]`` for a
        two-column frame) and ``edge_type`` has one entry per edge, all equal
        to ``rel_id``.
    """
    endpoints = df.to_numpy().T
    edge_index = torch.tensor(endpoints, dtype=torch.long)
    num_edges = edge_index.shape[1]
    # new_full inherits dtype (long) and device from edge_index.
    edge_type = edge_index.new_full((num_edges,), rel_id)
    return edge_index, edge_type

In [6]:
# Integer id of each relation, named after the CSV file it is loaded from.
RELATION_DICT = {
'actor_actor': 0,
'actor_action': 1,
'actor_sector': 2
}

# One (edge_index, edge_type) pair per relation.
edge_index_1, edge_type_1 = df_to_edge_tensors(user_artist, RELATION_DICT['actor_actor'])
edge_index_2, edge_type_2 = df_to_edge_tensors(user_friend, RELATION_DICT['actor_action'])
edge_index_3, edge_type_3 = df_to_edge_tensors(artist_tag, RELATION_DICT['actor_sector'])

# Concatenate all edge indices and types. The actor_actor edges come first, so
# column i of `edge_index` is row i of `user_artist` for i < len(user_artist).
edge_index = torch.cat([edge_index_1, edge_index_2, edge_index_3], dim=1)
edge_type = torch.cat([edge_type_1, edge_type_2, edge_type_3], dim=0)

# Infer number of nodes from the largest node id (ids are used as row indices).
num_nodes = edge_index.max().item() + 1

# NOTE(review): `data.edge_index` holds every edge, including the columns that
# are selected as valid/test edges below. The training and evaluation cells
# encode with `data.edge_index`, so held-out edges are part of the
# message-passing graph -- confirm this is intended.
data = Data(
    edge_index=edge_index,
    edge_type=edge_type,
    num_nodes=num_nodes
)

# NOTE(review): `edge_array` and `df_edges` are not used by any later cell
# visible in this notebook.
edge_array = edge_index.cpu().numpy().T  # Ensure it's on CPU before converting

df_edges = pd.DataFrame(edge_array, columns=["head1", "head2"])
# The split is computed on the actor_actor frame only.
# NOTE(review): presumably the returned *_idx values are positional row indices
# into `user_artist`; they line up with the combined `edge_index` only because
# those edges were concatenated first -- TODO confirm against the helper.
train_data, val_data, test_data, train_idx, val_idx, test_idx = get_edges_and_indices(user_artist, remove_fraction=1.0)

# Attach the split edges under the attribute names used by the training and
# evaluation cells (train_/valid_/test_ edge_index and edge_type).
data.train_edge_index = edge_index[:, train_idx]
data.train_edge_type = edge_type[train_idx]

data.valid_edge_index = edge_index[:, val_idx]
data.valid_edge_type = edge_type[val_idx]

data.test_edge_index = edge_index[:, test_idx]
data.test_edge_type = edge_type[test_idx]




In [7]:
# Display the summary of the rebuilt graph, including the train/valid/test edge splits.
data

Data(edge_index=[2, 206337], edge_type=[206337], num_nodes=16577, train_edge_index=[2, 37472], train_edge_type=[37472], valid_edge_index=[2, 29033], valid_edge_type=[29033], test_edge_index=[2, 28994], test_edge_type=[28994])

In [8]:
class RGCNEncoder(torch.nn.Module):
    """Two-layer R-GCN encoder on top of a learned node-embedding table.

    The graph carries no input features: a trainable ``[num_nodes,
    hidden_channels]`` matrix is used as the layer-0 representation and refined
    by two ``RGCNConv`` layers (each built with ``num_blocks=5``).

    Args:
        num_nodes: Number of rows of the embedding table.
        hidden_channels: Embedding size, also the width of both conv layers.
        num_relations: Number of relation types passed to both conv layers.
    """

    def __init__(self, num_nodes, hidden_channels, num_relations):
        super().__init__()
        self.node_emb = Parameter(torch.empty(num_nodes, hidden_channels))
        # Both layers share the same configuration.
        conv_args = (hidden_channels, hidden_channels, num_relations)
        self.conv1 = RGCNConv(*conv_args, num_blocks=5)
        self.conv2 = RGCNConv(*conv_args, num_blocks=5)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the embedding table (Xavier uniform) and both conv layers."""
        torch.nn.init.xavier_uniform_(self.node_emb)
        for conv in (self.conv1, self.conv2):
            conv.reset_parameters()

    def forward(self, edge_index, edge_type):
        """Return node embeddings computed over the given typed edges.

        Args:
            edge_index: Edge endpoints handed to both conv layers.
            edge_type: Relation id of each edge, handed to both conv layers.

        Returns:
            The output of the second conv layer (no activation applied).
        """
        hidden = self.conv1(self.node_emb, edge_index, edge_type).relu_()
        # Dropout is only active in training mode.
        hidden = F.dropout(hidden, p=0.2, training=self.training)
        return self.conv2(hidden, edge_index, edge_type)


class DistMultDecoder(torch.nn.Module):
    """DistMult-style scorer with one learned vector per relation.

    Args:
        num_relations: Number of relation embeddings to allocate.
        hidden_channels: Size of each relation embedding (must match ``z``).
    """

    def __init__(self, num_relations, hidden_channels):
        super().__init__()
        self.rel_emb = Parameter(torch.empty(num_relations, hidden_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the relation embeddings with Xavier-uniform values."""
        torch.nn.init.xavier_uniform_(self.rel_emb)

    def forward(self, z, edge_index, edge_type):
        """Score every edge as ``sum(z[src] * rel_emb[type] * z[dst])``.

        Args:
            z: Node embedding matrix, indexed by node id.
            edge_index: Row 0 holds source ids, row 1 destination ids.
            edge_type: Relation id of each edge.

        Returns:
            One raw (un-normalised) score per edge.
        """
        src_ids = edge_index[0]
        dst_ids = edge_index[1]
        relation = self.rel_emb[edge_type]
        triple_product = z[src_ids] * relation * z[dst_ids]
        return triple_product.sum(dim=1)

In [9]:
# Embedding width shared by the encoder and the decoder.
HIDDEN_CHANNELS = 500
NUM_RELATIONS = len(RELATION_DICT)

# NOTE(review): the encoder is given twice as many relation slots as
# RELATION_DICT defines, while the edge types built in this notebook only use
# the ids listed in RELATION_DICT -- presumably kept from a setup with inverse
# relations; TODO confirm.
encoder = RGCNEncoder(data.num_nodes, HIDDEN_CHANNELS, NUM_RELATIONS * 2)
decoder = DistMultDecoder(NUM_RELATIONS, HIDDEN_CHANNELS)
model = GAE(encoder, decoder).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def negative_sampling(edge_index, num_nodes):
    """Create one corrupted copy of every edge for use as a negative example.

    For each edge, either the head (row 0) or the tail (row 1) is replaced,
    with equal probability, by a node id drawn uniformly from
    ``[0, num_nodes)``; the other endpoint is kept. The drawn id is not checked
    against the original, so a "negative" can coincide with a true edge.

    All random tensors are created directly on ``edge_index.device``, which
    avoids indexing a GPU tensor with a CPU mask and needs no host-side counts.

    NOTE: this definition replaces the ``negative_sampling`` imported from
    ``torch_geometric.utils`` at the top of the notebook.

    Args:
        edge_index: Integer tensor whose rows 0 and 1 hold head and tail ids.
            It is not modified.
        num_nodes: Exclusive upper bound for the sampled node ids.

    Returns:
        A new tensor with the same shape, dtype and device as ``edge_index``.
    """
    dev = edge_index.device
    num_edges = edge_index.size(1)

    # True -> corrupt the head, False -> corrupt the tail.
    corrupt_head = torch.rand(num_edges, device=dev) < 0.5
    random_nodes = torch.randint(num_nodes, (num_edges,),
                                 dtype=edge_index.dtype, device=dev)

    neg_edge_index = edge_index.clone()
    neg_edge_index[0] = torch.where(corrupt_head, random_nodes, edge_index[0])
    neg_edge_index[1] = torch.where(corrupt_head, edge_index[1], random_nodes)
    return neg_edge_index


In [10]:
@torch.no_grad()
def eval_link_predictor(data_edge_, data):
    """Compute ROC-AUC and average precision for the labelled edges of ``data``.

    Uses the module-level ``model``. ``data_edge_`` is accepted but never read,
    and the ``data`` parameter shadows the module-level ``data``.

    NOTE(review): this helper is not called anywhere in the visible notebook
    and does not match the model built here. ``RGCNEncoder.forward`` takes
    ``(edge_index, edge_type)`` and ``DistMultDecoder.forward`` takes
    ``(z, edge_index, edge_type)``, whereas the calls below pass
    ``(data.x, data.edge_index)`` and ``(z, data.edge_label_index)``. The graph
    object built in this notebook is also never given ``x``, ``edge_label`` or
    ``edge_label_index``. Presumably left over from a different setup --
    confirm before use.

    Returns:
        Tuple ``(roc_auc, average_precision)``.
    """

    model.eval()
    z = model.encode(data.x, data.edge_index)
    # Sigmoid turns the raw decoder scores into probabilities for the metrics.
    out = model.decode(z, data.edge_label_index).view(-1).sigmoid()

    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy()), average_precision_score(data.edge_label.cpu().numpy(), out.cpu().numpy())

@torch.no_grad()
def compute_rank(ranks):
    """Return the tie-aware rank of the first score among all given scores.

    ``ranks[0]`` is the score of the true candidate. Ties are resolved by
    averaging the best-case rank (only strictly higher scores count as ahead)
    and the worst-case rank (every score at least as high counts, the true
    candidate included).

    Args:
        ranks: 1-D tensor of scores whose first element is the true candidate.

    Returns:
        A 0-dim float tensor holding the averaged rank (1 is best).
    """
    target = ranks[0]
    strictly_higher = (ranks > target).sum()
    at_least_as_high = (ranks >= target).sum()
    best_case = strictly_higher + 1
    return (best_case + at_least_as_high).float() * 0.5


@torch.no_grad()
def compute_mrr(z, edge_index, edge_type):
    """Compute the filtered mean reciprocal rank (MRR) of the given triplets.

    For every triplet ``(src, rel, dst)`` the true tail is ranked against all
    candidate tails and the true head against all candidate heads, so each
    triplet contributes two ranks. Candidates that form a triplet found in the
    train, valid or test split of the module-level ``data`` are removed before
    ranking. Scores come from the module-level ``model.decode`` and ranks from
    ``compute_rank``.

    Every helper tensor is created on ``z.device``. The previous version built
    its masks on the CPU and indexed them with tensors taken from ``data``,
    which fails once ``data`` has been moved to a GPU.

    Args:
        z: Node embedding matrix, indexed by node id.
        edge_index: Tensor whose rows 0 and 1 hold the head and tail ids of the
            triplets to rank.
        edge_type: Relation id of each triplet.

    Returns:
        A 0-dim float tensor on the CPU with the mean of ``1 / rank`` over all
        collected ranks (NaN when there are no triplets).
    """
    dev = z.device
    num_nodes = data.num_nodes

    # Known true triplets of all splits, gathered once instead of per iteration.
    splits = [
        (data.train_edge_index, data.train_edge_type),
        (data.valid_edge_index, data.valid_edge_type),
        (data.test_edge_index, data.test_edge_type),
    ]
    known_heads = torch.cat([index[0] for index, _ in splits]).to(dev)
    known_tails = torch.cat([index[1] for index, _ in splits]).to(dev)
    known_types = torch.cat([types for _, types in splits]).to(dev)

    edge_index = edge_index.to(dev)
    edge_type = edge_type.to(dev)
    all_nodes = torch.arange(num_nodes, device=dev)

    ranks = []
    for i in tqdm(range(edge_type.numel())):
        src, dst, rel = edge_index[0, i], edge_index[1, i], edge_type[i]
        same_rel = known_types == rel

        # Try all nodes as tails, but delete true triplets:
        tail_mask = torch.ones(num_nodes, dtype=torch.bool, device=dev)
        tail_mask[known_tails[(known_heads == src) & same_rel]] = False

        # The true tail goes first because compute_rank ranks element 0.
        tail = torch.cat([dst.view(1), all_nodes[tail_mask]])
        head = src.expand_as(tail)
        eval_edge_index = torch.stack([head, tail], dim=0)
        eval_edge_type = rel.expand_as(tail)

        out = model.decode(z, eval_edge_index, eval_edge_type)
        ranks.append(compute_rank(out))

        # Try all nodes as heads, but delete true triplets:
        head_mask = torch.ones(num_nodes, dtype=torch.bool, device=dev)
        head_mask[known_heads[(known_tails == dst) & same_rel]] = False

        head = torch.cat([src.view(1), all_nodes[head_mask]])
        tail = dst.expand_as(head)
        eval_edge_index = torch.stack([head, tail], dim=0)
        eval_edge_type = rel.expand_as(head)

        out = model.decode(z, eval_edge_index, eval_edge_type)
        ranks.append(compute_rank(out))

    if not ranks:
        # Mean over an empty set of ranks is undefined.
        return torch.tensor(float('nan'))
    return torch.stack(ranks).float().reciprocal().mean().cpu()


In [11]:
def train():
    """Run one full-batch optimisation step and report training metrics.

    Encodes the graph in ``data.edge_index``, scores the training edges and one
    corrupted copy of each, and minimises binary cross-entropy plus a small L2
    penalty on the node embeddings and the decoder's relation embeddings.
    Gradients are clipped to norm 1 before the optimizer step.

    Returns:
        Tuple ``(loss, auc, ap)``: the loss as a float, and ROC-AUC / average
        precision of this step's positive-vs-negative scores.
    """
    model.train()
    optimizer.zero_grad()

    z = model.encode(data.edge_index, data.edge_type)

    pos_edges, rel_ids = data.train_edge_index, data.train_edge_type
    pos_out = model.decode(z, pos_edges, rel_ids)

    # Negatives reuse the relation id of the edge they were corrupted from.
    neg_edges = negative_sampling(pos_edges, data.num_nodes)
    neg_out = model.decode(z, neg_edges, rel_ids)

    logits = torch.cat([pos_out, neg_out])
    targets = torch.cat([torch.ones_like(pos_out), torch.zeros_like(neg_out)])

    bce = F.binary_cross_entropy_with_logits(logits, targets)
    l2_penalty = z.pow(2).mean() + model.decoder.rel_emb.pow(2).mean()
    loss = bce + 1e-2 * l2_penalty

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
    optimizer.step()

    # Metrics are computed on the scores produced before this step's update.
    scores = logits.sigmoid().detach().cpu().numpy()
    y_true = targets.detach().cpu().numpy()
    auc = roc_auc_score(y_true, scores)
    ap = average_precision_score(y_true, scores)

    return float(loss), auc, ap


@torch.no_grad()
def test():
    """Evaluate the model with ``compute_mrr`` on the validation and test edges.

    Returns:
        Tuple ``(valid_mrr, test_mrr)`` as returned by ``compute_mrr``.
    """
    model.eval()
    z = model.encode(data.edge_index, data.edge_type)

    # Validation first, then test -- the same order as the returned tuple.
    held_out = [
        (data.valid_edge_index, data.valid_edge_type),
        (data.test_edge_index, data.test_edge_type),
    ]
    valid_mrr, test_mrr = [
        compute_mrr(z, split_index, split_type)
        for split_index, split_type in held_out
    ]
    return valid_mrr, test_mrr


@torch.no_grad()
def test_auc_ap():
    """Score the test edges against sampled negatives and report AUC / AP.

    Each test edge is paired with one corrupted copy from
    ``negative_sampling``; both are scored by the decoder and the sigmoid of
    the scores is compared with the 1/0 labels. The result is printed.

    Returns:
        Tuple ``(auc, ap)`` with ROC-AUC and average precision.
    """
    model.eval()
    z = model.encode(data.edge_index, data.edge_type)

    pos_edges, rel_ids = data.test_edge_index, data.test_edge_type
    pos_out = model.decode(z, pos_edges, rel_ids)

    # Negatives are re-drawn on every call, so the metrics vary between runs
    # unless the random state is fixed.
    neg_edges = negative_sampling(pos_edges, data.num_nodes)
    neg_out = model.decode(z, neg_edges, rel_ids)

    logits = torch.cat([pos_out, neg_out])
    targets = torch.cat([torch.ones_like(pos_out), torch.zeros_like(neg_out)])

    scores = logits.sigmoid().detach().cpu().numpy()
    y_true = targets.detach().cpu().numpy()
    auc = roc_auc_score(y_true, scores)
    ap = average_precision_score(y_true, scores)

    print(f'Test evaluation: AUC : {auc:.6f}, AP : {ap:.6f}')
    return auc, ap

In [12]:
# Use the device chosen in the setup cell instead of a hard-coded 'cuda', so
# the notebook also runs on machines without a GPU.
model.to(device)
data.to(device)

# Wall-clock duration of every epoch, in seconds.
times = []
for epoch in range(1, 51):
    start = time.time()
    loss, auc, ap = train()
    print(f'Epoch: {epoch:05d}, Loss: {loss:.4f}, AUC : {auc:.6f}, AP : {ap:.6f}')
    # NOTE(review): with 50 epochs this condition is never true, so the MRR
    # evaluation below is not executed -- lower the interval or raise the
    # epoch count if MRR is wanted.
    if (epoch % 100) == 0:
        valid_mrr, test_mrr = test()
        print(f'Val MRR: {valid_mrr:.4f}, Test MRR: {test_mrr:.4f}')
    times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")

Epoch: 00001, Loss: 0.6932, AUC : 0.504459, AP : 0.484171
Epoch: 00002, Loss: 0.6925, AUC : 0.656592, AP : 0.577853
Epoch: 00003, Loss: 0.6810, AUC : 0.839945, AP : 0.842165
Epoch: 00004, Loss: 0.5951, AUC : 0.841805, AP : 0.829332
Epoch: 00005, Loss: 0.5451, AUC : 0.895407, AP : 0.876474
Epoch: 00006, Loss: 0.4002, AUC : 0.900100, AP : 0.879686
Epoch: 00007, Loss: 0.4142, AUC : 0.899795, AP : 0.872289
Epoch: 00008, Loss: 0.4609, AUC : 0.895369, AP : 0.857722
Epoch: 00009, Loss: 0.3837, AUC : 0.906366, AP : 0.873253
Epoch: 00010, Loss: 0.4040, AUC : 0.906375, AP : 0.872243
Epoch: 00011, Loss: 0.3724, AUC : 0.906335, AP : 0.871334
Epoch: 00012, Loss: 0.3811, AUC : 0.906476, AP : 0.873070
Epoch: 00013, Loss: 0.3462, AUC : 0.915383, AP : 0.885476
Epoch: 00014, Loss: 0.3812, AUC : 0.908394, AP : 0.874472
Epoch: 00015, Loss: 0.3401, AUC : 0.921838, AP : 0.893324
Epoch: 00016, Loss: 0.3509, AUC : 0.922443, AP : 0.895472
Epoch: 00017, Loss: 0.3292, AUC : 0.925171, AP : 0.897176
Epoch: 00018, 

In [13]:
# Report AUC / AP of the trained model on the test edges versus sampled negatives.
test_auc_ap()

Test evaluation: AUC : 0.961456, AP : 0.954352


(0.961455667609898, 0.9543523083560672)