In [9]:
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.nn import SAGEConv
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import negative_sampling
from sklearn.metrics import roc_auc_score, average_precision_score
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.data import Data
from utils.reorganized_preprocessing import get_edges_and_indices
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

dataset = 'icews14'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [10]:
def create_edge_index_label(x, edge_index, data):
    pos_edge_index = edge_index.clone()
    num_pos = pos_edge_index.size(1)
    pos_edge_label = torch.ones(num_pos, dtype=torch.float)

    neg_edge_index = negative_sampling(
        edge_index=edge_index,
        num_nodes=x.size(0),
        num_neg_samples=num_pos,
        method='sparse'
    )
    neg_edge_label = torch.zeros(neg_edge_index.size(1), dtype=torch.float)

    
    data.pos_edge_label_index = pos_edge_index
    data.neg_edge_label_index = neg_edge_index
    
    data.edge_label_index = torch.cat([pos_edge_index, neg_edge_index], dim=1)
    data.edge_label = torch.cat([pos_edge_label, neg_edge_label], dim=0)

def create_data(df, train_idx, val_idx, test_idx):
    head1,head2 = df.columns[:2]
    edge_index = torch.tensor(df.iloc[train_idx][[head1, head2]].values.T, dtype=torch.long)
    x = torch.randn(edge_index.max().item()+1, 512)
    train_data = Data(edge_index=edge_index, x=x)
    create_edge_index_label(x, edge_index, train_data)

    edge_index = torch.tensor(df.iloc[val_idx][[head1, head2]].values.T, dtype=torch.long)
    val_data = Data(edge_index=edge_index, x=x)
    create_edge_index_label(x, edge_index, val_data)

    edge_index = torch.tensor(df.iloc[test_idx][[head1, head2]].values.T, dtype=torch.long)
    test_data = Data(edge_index=edge_index,x=x)
    create_edge_index_label(x, edge_index, test_data)

    return train_data, val_data, test_data

def create_x(train_data, val_data, test_data):
    num_node_features = 512
    x = torch.randn(train_data.edge_index.max().item()+3, num_node_features)
    train_data.x = x
    train_data.num_node_features = num_node_features
    val_data.x = x
    val_data.num_node_features = num_node_features
    test_data.x = x
    test_data.num_node_features = num_node_features

    return train_data, val_data, test_data
    

In [11]:
user_artist = pd.read_csv(f'../../../../data/raw/{dataset}/1-indexed/actor_actor.csv', encoding='utf-8', names=['userID','artistID', 'weight'],)
user_friend = pd.read_csv(f'../../../../data/raw/{dataset}/1-indexed/actor_action.csv', encoding='utf-8', names=['userID', 'friendID'])
artist_tag = pd.read_csv(f'../../../../data/raw/{dataset}/1-indexed/actor_sector.csv', encoding='utf-8', names=['artistID', 'tagID'])

indices = np.arange(len(user_artist))
train_idx, test_idx = train_test_split(indices, test_size=0.15, random_state=42)
val_idx, test_idx = train_test_split(test_idx, test_size=0.5, random_state=42)

train_data, val_data, test_data = create_data(user_artist, train_idx, val_idx, test_idx)
num_user = user_artist['userID'].max()
num_artist = user_artist['artistID'].max()
num_tag = artist_tag['tagID'].max()

In [12]:
train_data

Data(x=[7077, 512], edge_index=[2, 58462], pos_edge_label_index=[2, 58462], neg_edge_label_index=[2, 58462], edge_label_index=[2, 116924], edge_label=[116924])

In [13]:
val_data

Data(x=[7077, 512], edge_index=[2, 5158], pos_edge_label_index=[2, 5158], neg_edge_label_index=[2, 5158], edge_label_index=[2, 10316], edge_label=[10316])

In [14]:
class GraphSAGEEncoder(nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class LinkPredictor(nn.Module):
    def __init__(self, hidden_channels, dropout=0.5):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_channels, 1)
        )

    def forward(self, x_i, x_j):
        return torch.sigmoid(self.mlp(x_i * x_j)).view(-1)

In [15]:
@torch.no_grad()
def test(model, predictor, data):
    model.eval()
    predictor.eval()

    z = model(data.x, data.edge_index)

    pos_edge = data.pos_edge_label_index
    neg_edge = data.neg_edge_label_index

    edge = torch.cat([pos_edge, neg_edge], dim=1)
    x_i, x_j = z[edge[0]], z[edge[1]]
    labels = torch.cat([
        torch.ones(pos_edge.size(1)),
        torch.zeros(neg_edge.size(1))
    ]).to(device)

    pred = predictor(x_i, x_j).detach().cpu()
    labels = labels.detach().cpu()

    auc = roc_auc_score(labels, pred)
    ap = average_precision_score(labels, pred)
    return auc, ap


def train(model, predictor, train_data, optimizer):
    model.train()
    predictor.train()

    z = model(train_data.x, train_data.edge_index)

    pos_edge = train_data.pos_edge_label_index
    neg_edge = negative_sampling(
        edge_index=train_data.edge_index,
        num_nodes=train_data.num_nodes,
        num_neg_samples=pos_edge.size(1)
    )

    edge = torch.cat([pos_edge, neg_edge], dim=1)
    x_i, x_j = z[edge[0]], z[edge[1]]
    labels = torch.cat([
        torch.ones(pos_edge.size(1)),
        torch.zeros(neg_edge.size(1))
    ]).to(device)

    pred = predictor(x_i, x_j)
    loss = F.binary_cross_entropy(pred, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return loss.item()




In [16]:
model = GraphSAGEEncoder(train_data.num_node_features, 64).to(device)
predictor = LinkPredictor(64).to(device)
optimizer = torch.optim.Adam(
    list(model.parameters()) + list(predictor.parameters()),
    lr=0.01
)


In [17]:
for epoch in range(1, 51):
    loss = train(model, predictor, train_data.to(device), optimizer)
    if epoch % 10 == 0:
        val_auc, val_ap = test(model, predictor, val_data.to(device))
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}, AP: {val_ap:.4f}')

# Final test evaluation
test_auc, test_ap = test(model, predictor, test_data.to(device))
print(f'Test AUC: {test_auc:.4f}, Test AP: {test_ap:.4f}')

Epoch: 010, Loss: 0.4481, Val AUC: 0.8135, AP: 0.8577
Epoch: 020, Loss: 0.3079, Val AUC: 0.9014, AP: 0.9184
Epoch: 030, Loss: 0.2365, Val AUC: 0.9428, AP: 0.9480
Epoch: 040, Loss: 0.1782, Val AUC: 0.9481, AP: 0.9531
Epoch: 050, Loss: 0.1492, Val AUC: 0.9524, AP: 0.9565
Test AUC: 0.9534, Test AP: 0.9589
