In [1]:
from sklearn.metrics import roc_auc_score, average_precision_score
from torch_geometric.utils import negative_sampling
import torch
from torch_geometric.nn.conv import GCNConv
from torch_geometric.data import Data
import torch_geometric.transforms as T

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from utils.reorganized_preprocessing import get_edges_and_indices


# Name of the dataset folder that the data-loading cell reads from
# (it is interpolated into the data/raw/<dataset>/... paths).
dataset = 'icews14'

# Seed the global RNGs once, up front, so that torch-based randomness further
# down (random node features, model weight initialisation) is repeatable on a
# fresh "Restart & Run All". This is best-effort: libraries that keep their
# own RNG state are not covered by these two calls.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Net(torch.nn.Module):
    """Two-layer GCN encoder paired with an inner-product edge decoder."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        """Apply both graph convolutions (ReLU in between) and return node embeddings."""
        hidden = self.conv1(x, edge_index)
        hidden = hidden.relu()
        return self.conv2(hidden, edge_index)

    def decode(self, z, edge_label_index):
        """Score every candidate edge as the dot product of its two endpoint embeddings."""
        src_emb = z[edge_label_index[0]]
        dst_emb = z[edge_label_index[1]]
        return (src_emb * dst_emb).sum(dim=-1)

    def decode_all(self, z):
        """Return the index pairs (one pair per column) whose dot-product score is > 0."""
        scores = torch.matmul(z, z.t())
        is_positive = scores > 0
        return torch.nonzero(is_positive, as_tuple=False).t()
    

In [15]:
@torch.no_grad()
def eval_link_predictor(model, data):
    """Return ``(ROC-AUC, average precision)`` of ``model`` on the labelled edges of ``data``.

    The positive and negative candidate edges stored on ``data`` are scored
    together; ``data.edge_index`` is the graph used for message passing.
    The model is switched to eval mode as a side effect.
    """
    model.eval()

    # Positives first, negatives second -- the same order for indices and labels.
    candidates = torch.cat(
        (data.pos_edge_label_index, data.neg_edge_label_index), dim=-1
    )
    targets = torch.cat((data.pos_edge_label, data.neg_edge_label), dim=0)

    embeddings = model.encode(data.x, data.edge_index)
    probs = torch.sigmoid(model.decode(embeddings, candidates).view(-1))

    # sklearn works on host-side numpy arrays.
    y_true = targets.cpu().numpy()
    y_score = probs.cpu().numpy()
    auc = roc_auc_score(y_true, y_score)
    ap = average_precision_score(y_true, y_score)
    return auc, ap

In [4]:
def train_link_predictor(
    model, train_data, val_data, optimizer, criterion, n_epochs=50, log_every=10
):
    """Train ``model`` for link prediction, one full-graph gradient step per epoch.

    Args:
        model: Module exposing ``encode(x, edge_index)`` and
            ``decode(z, edge_label_index)``.
        train_data: Graph object providing ``x``, ``edge_index`` and the
            ``pos_edge_label_index`` / ``neg_edge_label_index`` /
            ``pos_edge_label`` / ``neg_edge_label`` tensors.
        val_data: Graph object handed to ``eval_link_predictor`` for the
            periodic validation report.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        n_epochs: Number of epochs (one optimisation step each).
        log_every: Print train loss and validation AUC/AP every this many
            epochs. ``0`` or ``None`` disables the report.

    Returns:
        The same ``model`` instance, trained in place.
    """
    # The supervision edges are fixed (the negatives stored on train_data are
    # reused; they are not resampled per epoch), so concatenate them once
    # instead of rebuilding identical tensors on every iteration.
    edge_label_index = torch.cat(
        [train_data.pos_edge_label_index, train_data.neg_edge_label_index],
        dim=-1,
    )
    edge_label = torch.cat(
        [train_data.pos_edge_label, train_data.neg_edge_label], dim=0
    )

    for epoch in range(1, n_epochs + 1):
        model.train()
        optimizer.zero_grad()

        z = model.encode(train_data.x, train_data.edge_index)
        out = model.decode(z, edge_label_index).view(-1)
        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()

        # Validation metrics are only ever used for the report, so compute
        # them on reporting epochs only rather than on every epoch.
        if log_every and epoch % log_every == 0:
            val_auc, val_ap = eval_link_predictor(model, val_data)
            print(f"Epoch: {epoch:06d}, Train Loss: {loss.item():.6f}, Val AUC: {val_auc:.6f}, Val AP: {val_ap:.6f}")

    if n_epochs >= 1:
        # Previously the per-epoch validation call left the model in eval
        # mode on return; keep that regardless of where the last report fell.
        model.eval()

    return model

In [5]:
def create_edge_index_label(x, edge_index, data):
    """Attach positive/negative supervision edges and their labels to ``data`` in place.

    Positives are a copy of ``edge_index`` (label 1.0). Negatives are drawn with
    ``negative_sampling`` over ``x.size(0)`` nodes, requesting as many as there
    are positives (label 0.0).

    The edges are stored twice:
      * separately, as ``pos_edge_label_index`` / ``pos_edge_label`` and
        ``neg_edge_label_index`` / ``neg_edge_label`` -- the attribute names
        that ``train_link_predictor`` and ``eval_link_predictor`` read;
      * concatenated (positives first), as ``edge_label_index`` / ``edge_label``.

    Args:
        x: Node feature matrix; only its number of rows is used.
        edge_index: Index tensor of the positive edges; indexed as ``(2, E)``.
        data: Object that receives the attributes listed above.

    Returns:
        None. ``data`` is modified in place.
    """
    pos_edge_index = edge_index.clone()
    num_pos = pos_edge_index.size(1)
    pos_edge_label = torch.ones(num_pos, dtype=torch.float)

    neg_edge_index = negative_sampling(
        edge_index=edge_index,
        num_nodes=x.size(0),
        num_neg_samples=num_pos,
        method='sparse',
    )
    # Sized from what the sampler actually returned rather than from num_pos.
    neg_edge_label = torch.zeros(neg_edge_index.size(1), dtype=torch.float)

    # Split form, matching what the training / evaluation helpers consume.
    data.pos_edge_label_index = pos_edge_index
    data.pos_edge_label = pos_edge_label
    data.neg_edge_label_index = neg_edge_index
    data.neg_edge_label = neg_edge_label

    # Concatenated form, kept for existing users of these two attributes.
    data.edge_label_index = torch.cat([pos_edge_index, neg_edge_index], dim=1)
    data.edge_label = torch.cat([pos_edge_label, neg_edge_label], dim=0)


def create_data(df, train_idx, val_idx, test_idx):
    """Build train/val/test graph objects from the first two columns of ``df``.

    The rows selected by each index array become that split's ``edge_index``;
    all three splits share one random 512-dimensional node-feature matrix, and
    each is passed through ``create_edge_index_label``.

    Args:
        df: DataFrame whose first two columns hold the edge endpoints (node ids).
        train_idx, val_idx, test_idx: Positional row indices (used with
            ``df.iloc``) for the three splits.

    Returns:
        ``(train_data, val_data, test_data)``.
    """
    head1, head2 = df.columns[:2]

    def _edges(row_idx):
        # Endpoint columns of the selected rows as a (2, E) long tensor.
        return torch.tensor(
            df.iloc[row_idx][[head1, head2]].values.T, dtype=torch.long
        )

    split_edges = [_edges(idx) for idx in (train_idx, val_idx, test_idx)]

    # Size the feature matrix from the largest node id in ANY split. Sizing it
    # from the training edges alone leaves ids that occur only in val/test
    # without a feature row (out-of-range indexing later on).
    num_nodes = 1 + max(
        (int(edges.max()) for edges in split_edges if edges.numel() > 0),
        default=-1,
    )
    x = torch.randn(num_nodes, 512)

    # NOTE(review): each split's edge_index holds that split's own edges, and
    # eval_link_predictor encodes with data.edge_index -- so the val/test
    # graphs contain the very edges being predicted. Confirm this is intended.
    splits = []
    for edges in split_edges:
        split = Data(edge_index=edges, x=x)
        create_edge_index_label(x, edges, split)
        splits.append(split)

    train_data, val_data, test_data = splits
    return train_data, val_data, test_data


def create_x(train_data, val_data, test_data, feature_dim=512):
    """Attach one shared random node-feature matrix ``x`` to all three splits.

    The number of rows covers every node id referenced by any index tensor
    (``edge_index``, ``edge_label_index``, ``pos_edge_label_index``,
    ``neg_edge_label_index``) on any of the three objects, and is never smaller
    than the previous ``train_data.edge_index.max() + 3`` sizing.

    Args:
        train_data, val_data, test_data: Graph objects; ``train_data`` must
            have ``edge_index``, the other index attributes are optional.
        feature_dim: Number of feature columns (default 512).

    Returns:
        The same three objects, each with ``.x`` set to the same tensor.
    """
    index_fields = (
        "edge_index",
        "edge_label_index",
        "pos_edge_label_index",
        "neg_edge_label_index",
    )

    def _largest_node_id(data):
        # Largest id found in any index tensor present on `data`; -1 if none.
        largest = -1
        for field in index_fields:
            index = getattr(data, field, None)
            if index is not None and index.numel() > 0:
                largest = max(largest, int(index.max()))
        return largest

    train_edges = train_data.edge_index
    train_max = int(train_edges.max()) if train_edges.numel() > 0 else -1
    # Historical sizing, kept as a lower bound so x never shrinks.
    legacy_rows = train_max + 3
    # What is actually required: one row per id in 0..max over all splits.
    required_rows = 1 + max(
        _largest_node_id(split) for split in (train_data, val_data, test_data)
    )

    x = torch.randn(max(legacy_rows, required_rows), feature_dim)
    train_data.x = x
    val_data.x = x
    test_data.x = x

    return train_data, val_data, test_data
    

In [6]:
# Raw 1-indexed tables for the selected dataset. The files are read as
# header-less CSVs, with column names supplied here.
# NOTE(review): the userID / artistID / friendID / tagID names do not match the
# actor_* file names -- they look carried over from another dataset; confirm.
raw_dir = f'../../../../data/raw/{dataset}/1-indexed'

user_artist = pd.read_csv(
    f'{raw_dir}/actor_actor.csv',
    encoding='utf-8',
    names=['userID', 'artistID', 'weight'],
)
user_friend = pd.read_csv(
    f'{raw_dir}/actor_action.csv',
    encoding='utf-8',
    names=['userID', 'friendID'],
)
artist_tag = pd.read_csv(
    f'{raw_dir}/actor_sector.csv',
    encoding='utf-8',
    names=['artistID', 'tagID'],
)

# Largest id seen in each id column.
num_user = user_artist.loc[:, 'userID'].max()
num_artist = user_artist.loc[:, 'artistID'].max()
num_tag = artist_tag.loc[:, 'tagID'].max()

# An earlier variant (now disabled) drew a random 85 / 7.5 / 7.5 row split with
# train_test_split(random_state=42) and built the graphs with create_data.
# The split now comes from the project helper below.
splits = get_edges_and_indices(user_artist, remove_fraction=1.0)
train_data, val_data, test_data, train_idx, val_idx, test_idx = splits

# Give all three splits the same random node-feature matrix.
train_data, val_data, test_data = create_x(train_data, val_data, test_data)




In [7]:
# Inspect the training split: attribute names and tensor shapes after create_x.
train_data

Data(edge_index=[2, 51024], pos_edge_label=[16184], pos_edge_label_index=[2, 16184], neg_edge_label=[25512], neg_edge_label_index=[2, 25512], x=[7078, 512])

In [12]:
# Inspect the validation split: attribute names and tensor shapes after create_x.
val_data

Data(edge_index=[2, 51024], pos_edge_label=[3189], pos_edge_label_index=[2, 3189], neg_edge_label=[3189], neg_edge_label_index=[2, 3189], x=[7078, 512])

In [9]:
# Training supervision edges: positives followed by negatives, joined along
# the last dimension.
edge_label_index = torch.cat(
    (train_data.pos_edge_label_index, train_data.neg_edge_label_index), dim=-1
)

In [10]:
# Matching labels, in the same positives-then-negatives order.
edge_label = torch.cat(
    (train_data.pos_edge_label, train_data.neg_edge_label), dim=0
)

In [12]:
# Optional alternative (disabled): build the graph from the training edges only and let RandomLinkSplit carve out the validation and test edges.


# head1,head2 = user_artist.columns[:2]
# edge_index = torch.tensor(user_artist.iloc[train_idx][[head1, head2]].values.T, dtype=torch.long)

# edge_index

# x = torch.randn(edge_index.max().item(), 512)
# graph = Data(edge_index=edge_index, x=x)
# split = T.RandomLinkSplit(
#     num_val=0.05,
#     num_test=0.1,
#     is_undirected=True,
#     add_negative_train_samples=False,
#     neg_sampling_ratio=1.0,
# )

# train_data, val_data, test_data = split(graph)

In [16]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 512 input features (the width of the node features built by create_x),
# 256 hidden units, 128-dimensional node embeddings.
model = Net(512, 256, 128).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
# The decoder returns raw dot-product scores (no sigmoid), hence the
# with-logits form of binary cross-entropy.
criterion = torch.nn.BCEWithLogitsLoss()

model = train_link_predictor(
    model, train_data.to(device), val_data.to(device), optimizer, criterion
)

Epoch: 000010, Train Loss: 0.736446, Val AUC: 0.657779, Val AP: 0.618671
Epoch: 000020, Train Loss: 0.573247, Val AUC: 0.734911, Val AP: 0.707837
Epoch: 000030, Train Loss: 0.403932, Val AUC: 0.797983, Val AP: 0.805176
Epoch: 000040, Train Loss: 0.280593, Val AUC: 0.807113, Val AP: 0.822304
Epoch: 000050, Train Loss: 0.203862, Val AUC: 0.810769, Val AP: 0.822999


In [17]:
# Evaluate on the held-out split, on whichever device the trained model lives
# on, instead of assuming CUDA.
model_device = next(model.parameters()).device
test_auc, test_ap = eval_link_predictor(model, test_data.to(model_device))

print(f"Test: AUC : {test_auc:.6f}, AP : {test_ap:.6f}")

Test: AUC : 0.831602, AP : 0.841418
