## Link Prediction using Graph Neural Networks

Graph에서는 node classification, graph classification, link prediction 등의 다양한 task가 존재합니다. 이번 튜토리얼에서는 그 중 Link Preidiction에 대해서 다룹니다. 

In [1]:
import dgl 
import dgl.function as fn 

import torch 
import torch.nn as nn 

import torch.optim as optim 
import torch.nn.functional as F 
import itertools 
import numpy as np 
import scipy.sparse as sp

from sklearn.metrics import roc_auc_score

## Overview of Link Prediction with GNN 

social recoomendation, item recommendation, knowledge graph 등 다양한 application 연구가 존재합니다. 이때, 각 node들이 연결되어 있는지 아닌지를 예측하는 것이 바로 Link Prediction 입니다. 

본 튜토리얼에서는 citation network data를 사용해서 두 논문 간의 인용 관계를 예측하고자 합니다.

## Loading graph and features 

In [2]:
import dgl.data 

dataset = dgl.data.CoraGraphDataset()
g = dataset[0]

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


## Prepare traning and testing sets 

본 튜토리얼에서는 edge의 10%만 추출해서 test set으로 사용하고 나머지는 training set으로 사용합니다. 

In [3]:
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids)*0.1)

train_size = g.number_of_edges() - test_size 
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]


# negative edge
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy()))) # 인접행렬을 생성합니다. coo_marix(data, (row, col)) row와 col에 동시에 출현하는 index는 1로 mapping하는 것입니다.
adj_neg = 1 - adj.todense() # negative edge를 찾는 것이기 때문에 1을 빼서 adj_neg를 생성합니다.
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges()//2)
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [4]:
# `dgl.remove_edges`로 test의 edge를 삭제하고 이를 예측하는 형태로 변환한 후 학습을 진행합니다.
train_g = dgl.remove_edges(g, eids[:test_size])

## Define a GraphSAGE model

In [5]:
from dgl.nn import SAGEConv 

class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()

        self.in_feats = in_feats 
        self.h_feats = h_feats 

        self.conv1 = SAGEConv(self.in_feats, self.h_feats, aggregator_type = 'mean')
        self.conv2 = SAGEConv(self.h_feats, self.h_feats, aggregator_type = 'mean')
        self.relu = nn.ReLU()

    def forward(self, g, in_feats):
        h = self.conv1(g, in_feats)
        h = self.relu(h)
        h = self.conv2(g, h)
        return h 

In [6]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())


In [57]:

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h 
            g.apply_edges(fn.u_dot_v('h', 'h', 'score')) # `h`: source node, `h` destination node
            return g.edata['score'][:, 0]

In [58]:
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats*2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        h = torch.cat([edges.src['h'], edges.dst['h'], 1])
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}
    
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

## Training Loop 

In [59]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)

pred = DotPredictor()

In [66]:

def criterion(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def accuracy(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [69]:
all_logits = []

optimizer = optim.Adam(model.parameters())
for epoch in range(100):
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = criterion(pos_score, neg_score)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 5 == 0 :
        print(f'In epoch {epoch+1}, loss: {loss:.4f}')


with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', accuracy(pos_score, neg_score))

In epoch 5, loss: 0.2586
In epoch 10, loss: 0.2549
In epoch 15, loss: 0.2514
In epoch 20, loss: 0.2477
In epoch 25, loss: 0.2441
In epoch 30, loss: 0.2405
In epoch 35, loss: 0.2369
In epoch 40, loss: 0.2333
In epoch 45, loss: 0.2296
In epoch 50, loss: 0.2260
In epoch 55, loss: 0.2223
In epoch 60, loss: 0.2186
In epoch 65, loss: 0.2149
In epoch 70, loss: 0.2111
In epoch 75, loss: 0.2073
In epoch 80, loss: 0.2035
In epoch 85, loss: 0.1997
In epoch 90, loss: 0.1959
In epoch 95, loss: 0.1920
In epoch 100, loss: 0.1881
AUC 0.8853808315177107
