## Stochastic Training of GNN for Link Prediction on Large Graphs 

본 튜토리얼은 KDD20 자료를 참고하여 작성했습니다. 어떻게 GraphSAGE를 학습하는지에 대한 내용을 다룹니다. 

## Link Prediction Overview 

Link Prediction을 수행할 때 우리는 두 노드 $u$, $v$ 사이의 link 점수를 $s_{uv} = \phi(h^{(l)}_u, h^{(l)}_v)$로 표현합니다. 이때 $s_{uv}$는 두 노드가 연결되어 있을 가능성을 나타내는 점수이며, $\sigma(s_{uv})$를 두 노드가 연결되어 있을 확률로 해석합니다.

이때 negative sampling을 사용해 실제로 연결된 노드 쌍(positive)의 점수와 연결되지 않은 노드 쌍(negative)의 점수를 비교하며, 손실함수는 아래와 같이 정의합니다.

$$ \mathcal{L} = -\log \sigma (s_{uv}) - Q \, \mathbb{E}_{v^- \sim P^- (v)} \left[ \log \sigma ( - s_{uv^-}) \right] $$


## Load Dataset

In [None]:
import tqdm
import dgl 
import torch 
import torch.nn as nn 
import torch.optim as optim 
import numpy as np 
import utils_KDD
import pickle

# Load the preprocessed dataset bundle from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only use a data.pkl that comes from a trusted source.
with open('../data.pkl', 'rb') as f:
    data = pickle.load(f)

# The bundle unpacks into six items; going by the names: the graph, the node
# features, the node labels and the train/validation/test node-ID splits.
graph, node_features, node_labels, train_nids, valid_nids, test_nids = data 
# NOTE(review): presumably materialises the graph's sparse formats up front so
# the dataloader worker processes do not each rebuild them -- confirm against
# the DGL documentation for `create_formats_`.
graph.create_formats_()

DGL 패키지는 edge classification, link prediction task를 수행하기 위한 `EdgeDataLoader`를 제공합니다.

edge prediction을 수행하기 위해서는 먼저 negative sampling을 사용하여야 하기 때문에 해당 함수를 먼저 정의합니다. 

In [1]:
class NegativeSampler(object):
    """Negative-edge sampler for link prediction.

    For every positive edge it keeps the edge's source node and draws ``k``
    destination nodes at random, with probability proportional to
    ``in_degree ** 0.75``.

    Args:
        g: Graph whose ``in_degrees()`` define the sampling weights.
        k: Number of negative destinations drawn per positive edge.
    """

    def __init__(self, g, k):
        self.k = k
        # Unnormalised sampling weights; nodes with zero in-degree get weight
        # 0 and are therefore never drawn as a negative destination.
        self.weights = g.in_degrees().float() ** 0.75

    def __call__(self, g, eids):
        """Return ``(src, dst)`` node-ID tensors describing the negative edges.

        Args:
            g: Graph the positive edges belong to.
            eids: IDs of the positive edges in the current mini-batch.

        Returns:
            A pair of 1-D tensors of length ``len(eids) * k``. ``src`` holds
            each positive edge's source node repeated ``k`` times in a row;
            ``dst`` holds the sampled destinations. No check is made that a
            sampled pair is not an existing edge.
        """
        src, _ = g.find_edges(eids)
        # Fix: the original called the non-existent `pepeat_interleave`.
        src = src.repeat_interleave(self.k)
        # Sample with replacement so the number of draws may exceed the
        # number of nodes.
        dst = self.weights.multinomial(len(src), replacement=True)
        return src, dst

In [None]:
# Neighbour sampler configured with three fanout entries.
# NOTE(review): presumably one entry per GNN layer, each sampling up to 4
# neighbours -- confirm against the DGL MultiLayerNeighborSampler docs.
sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4, 4])
# Handed to NegativeSampler below as its `k` argument, i.e. how many negative
# destinations are drawn for each positive edge.
k = 5 
# Mini-batch loader over every edge ID of the graph (0 .. number_of_edges - 1),
# shuffled, 1024 edges per batch, keeping the final partial batch.
train_dataloader = dgl.dataloading.EdgeDataLoader(
    graph, torch.arange(graph.number_of_edges()), sampler, 
    negative_sampler = NegativeSampler(graph, k),
    batch_size = 1024, 
    shuffle = True, 
    drop_last = False, 
    num_workers = 4
)

In [None]:
# Pull a single mini-batch from the training loader and print it as-is to
# inspect its structure.
example_minibatch = next(iter(train_dataloader))
print(example_minibatch)

In [None]:
# A mini-batch unpacks into four items: the input node IDs, a graph of the
# positive edges, a graph of the sampled negative edges, and the list of
# bipartite graphs that is later fed to the GNN layers.
input_nodes, pos_graph, neg_graph, bipartites = example_minibatch 
# Report the size of each component.
print('Number of input nodes:', len(input_nodes))
print('Positive graph # nodes:', pos_graph.number_of_nodes(), '# edges:', pos_graph.number_of_edges())
print('Negative graph # nodes:', neg_graph.number_of_nodes(), '# edges:', neg_graph.number_of_edges())
print(bipartites)

## Defining Model for Node Representation

In [None]:
import torch.nn as nn 
import torch.nn.functional as F 
import dgl.nn as dglnn

class GraphSAGE(nn.Module):
    """Stack of mean-aggregator ``SAGEConv`` layers producing node embeddings.

    Args:
        in_feats: Size of the input node features.
        n_hidden: Output size of every layer (also the embedding size).
        n_layers: Number of ``SAGEConv`` layers to stack.
    """

    def __init__(self, in_feats, n_hidden, n_layers):
        super(GraphSAGE, self).__init__()
        self.in_feats = in_feats
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        self.layers = nn.ModuleList()
        # The first layer maps the raw features to the hidden size ...
        self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean'))
        # ... and the remaining layers keep the hidden size.
        # Fix: the original passed the constructor arguments straight to
        # `ModuleList.append` instead of building a SAGEConv layer.
        for _ in range(1, self.n_layers):
            self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))

    def forward(self, bipartites, x):
        """Apply the layers in order, one bipartite graph per layer.

        Args:
            bipartites: Sequence of per-layer bipartite graphs; it is zipped
                with the layers, so iteration stops at the shorter of the two.
            x: Features of the input nodes of the first bipartite graph.

        Returns:
            The output of the last applied layer. ReLU is applied after every
            layer except the one at index ``n_layers - 1``.
        """
        for l, (layer, bipartite) in enumerate(zip(self.layers, bipartites)):
            # Fix: pass this layer's own graph, not the whole list.
            x = layer(bipartite, x)
            if l != self.n_layers - 1:
                x = F.relu(x)

        return x

## Obtaining Node Representation from GNN 

In [None]:
def inference(model, graph, in_feats, batch_size):
    """Compute representations for all nodes of ``graph``, layer by layer.

    Rather than running the whole model per mini-batch, every layer in
    ``model.layers`` is applied to all nodes (in batches) before moving on to
    the next one; the output of one layer becomes the input of the next.

    Args:
        model: Module exposing ``layers``, ``n_layers`` and ``n_hidden``.
        graph: Graph to run inference on.
        in_feats: Input features, indexable by node ID.
        batch_size: Number of output nodes processed per batch.

    Returns:
        A CPU tensor of shape ``(graph.number_of_nodes(), model.n_hidden)``
        holding the output of the last layer.
    """
    num_nodes = graph.number_of_nodes()
    final_layer = model.n_layers - 1

    # The sampler gets a single fanout entry, so only the first bipartite
    # graph of each batch is used below.
    # NOTE(review): a fanout of None presumably means "take every neighbour"
    # -- confirm against the DGL docs.
    loader = dgl.dataloading.NodeDataLoader(
        graph,
        torch.arange(num_nodes),
        dgl.dataloading.MultiLayerNeighborSampler([None]),
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=0,
    )

    with torch.no_grad():
        for depth, conv in enumerate(model.layers):
            # Fresh CPU buffer for this layer's output over all nodes.
            output_features = torch.zeros(num_nodes, model.n_hidden)
            for src_nids, dst_nids, blocks in tqdm.tqdm(loader):
                block = blocks[0].to('cuda')
                h = conv(block, in_feats[src_nids].to('cuda'))
                # Same activation rule as in the model's forward pass.
                if depth != final_layer:
                    h = F.relu(h)
                output_features[dst_nids] = h.cpu()
            # This layer's output feeds the next layer.
            in_feats = output_features
    return output_features

In [None]:
class ScorePredictor(nn.Module):
    """Score each edge of a graph by the dot product of its endpoint features."""

    def forward(self, subgraph, x):
        """Return the per-edge scores of ``subgraph``.

        Args:
            subgraph: Graph whose edges are to be scored.
            x: Node features, assigned to ``subgraph.ndata['x']``.

        Returns:
            The ``'score'`` edge data computed from source and destination
            ``'x'`` features.
        """
        dot_product = dgl.function.u_dot_v('x', 'x', 'score')
        # Work inside a local scope, as the original does, while writing the
        # temporary 'x' / 'score' fields.
        with subgraph.local_scope():
            subgraph.ndata['x'] = x
            subgraph.apply_edges(dot_product)
            edge_scores = subgraph.edata['score']
        return edge_scores


## Evaluate Performance of the Learned Embedding

GraphSAGE 논문에서는 LSTM, Linear 등 다양한 classifier를 사용해서 classification을 수행하지만, 본 튜토리얼에서는 Linear Classifier를 사용해서 분류를 진행합니다.

In [None]:
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics

def evaluate(emb, label, train_nids, valid_nids, test_nids):
    """Fit a logistic-regression probe on the embeddings and report accuracy.

    Args:
        emb: Node embeddings, indexable by node ID.
        label: Node labels, indexable by node ID.
        train_nids: Node IDs used to fit the classifier.
        valid_nids: Node IDs used for the validation accuracy.
        test_nids: Node IDs used for the test accuracy.

    Returns:
        A ``(valid_acc, test_acc)`` tuple of accuracy scores.
    """
    probe = LogisticRegression(
        solver='lbfgs',
        multi_class='multinomial',
        verbose=1,
        max_iter=1000,
    )
    probe.fit(emb[train_nids], label[train_nids])

    def _accuracy(nids):
        # Accuracy of the fitted probe on the given node subset.
        predicted = probe.predict(emb[nids])
        return metrics.accuracy_score(label[nids], predicted)

    return _accuracy(valid_nids), _accuracy(test_nids)

## Defining Training Loop 

In [None]:
# Three-layer GraphSAGE with 128 hidden units; the input size is taken from
# the second dimension of the node features. The layer count matches the three
# fanout entries given to the training neighbour sampler.
model = GraphSAGE(node_features.shape[1], 128, 3).to('cuda')
# Dot-product edge scorer; it defines no parameters of its own.
predictor = ScorePredictor().to('cuda')
# Adam with its default settings over the parameters of both modules.
optimizer = optim.Adam(list(model.parameters()) + list(predictor.parameters()))

In [None]:
best_accuracy = 0 
best_model_path = 'model.pt'
for epoch in range(10):
    model.train()
    with tqdm.tqdm(train_dataloader) as tq:
        for step, (input_nodes, pos_graph, neg_graph, bipartites) in enumerate(tq):
            bipartites = [b.to('cuda') for b in bipartites]
            pos_graph = pos_graph.to('cuda')
            neg_graph = neg_graph.to('cuda')
            inputs = node_features[input_nodes].to('cuda')
            outputs = model(bipartites, inputs)
            pos_score = predictor(pos_graph, outputs)
            neg_score = predictor(neg_graph, outputs)