In [1]:
import os

In [2]:
os.chdir('../..')

In [3]:
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn.functional as F
import dgl
from ogb.nodeproppred import DglNodePropPredDataset, Evaluator
import faiss

from dgl.nn.pytorch.conv import SAGEConv

Using backend: pytorch


In [4]:
dataset = DglNodePropPredDataset(name='ogbn-arxiv', 
                                 root='data/dataset_dgl/')
dataset

DglNodePropPredDataset(1)

In [5]:
torch.cuda.is_available()

True

In [6]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device)

In [7]:
graph = dataset[0][0]
graph

Graph(num_nodes=169343, num_edges=1166243,
      ndata_schemes={'year': Scheme(shape=(1,), dtype=torch.int64), 'feat': Scheme(shape=(128,), dtype=torch.float32)}
      edata_schemes={})

In [8]:
torch.mean(graph.out_degrees() + graph.in_degrees() * 1.)

tensor(13.7737)

## Build Graph from graph sage embedding distances

In [10]:
output_emb_file = 'models/graphsage_link_pred/full_graphsage_linkpred_46_h.npy'
max_k = 1000
min_k = 1
min_cossim = 0.9

In [11]:
embeddings = np.load(output_emb_file)

In [12]:
emb_norm = embeddings / np.linalg.norm(embeddings, axis=1)[:, None]

In [13]:
index_cos = faiss.IndexFlatIP(emb_norm.shape[1])

In [14]:
index_cos.train(emb_norm)
index_cos.add(emb_norm)

In [15]:
index_cos.ntotal

169343

In [16]:
distances, indices = index_cos.search(emb_norm, max_k + 1)

In [17]:
mask = (distances > min_cossim)[:, 1:]
mask[:, :min_k] = True

In [18]:
mask.sum(axis=1)

array([938,   6, 255, ...,  43, 805,  29])

In [19]:
mask.sum(axis=1).min()

1

In [20]:
u = np.repeat(indices[:, 0], mask.sum(axis=1))
print(u.shape)
u

(12221268,)


array([     0,      0,      0, ..., 169342, 169342, 169342])

In [21]:
v = indices[:,1:].reshape(-1)[mask.reshape(-1)]
print(v.shape)
v

(12221268,)


array([ 79872,  10839, 158210, ...,  97498,  90378,  89648])

In [22]:
graph.remove_edges(graph.edge_ids(graph.edges()[0], graph.edges()[1]))

In [23]:
graph.edges()

(tensor([], dtype=torch.int64), tensor([], dtype=torch.int64))

In [24]:
graph.add_edges(u, v)

In [25]:
graph.edges()

(tensor([     0,      0,      0,  ..., 169342, 169342, 169342]),
 tensor([ 79872,  10839, 158210,  ...,  97498,  90378,  89648]))

## Model

In [26]:
split_idx = dataset.get_idx_split()

In [27]:
torch.cuda.set_device(device)

In [28]:
features = graph.ndata['feat'].cuda()
labels = dataset.labels.cuda()
train_mask = split_idx['train'].cuda()
val_mask = split_idx['valid'].cuda()
test_mask = split_idx['test'].cuda()

In [29]:
train_nid = train_mask.nonzero().squeeze()
val_nid = val_mask.nonzero().squeeze()
test_nid = test_mask.nonzero().squeeze()

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  train_nid = train_mask.nonzero().squeeze()


In [30]:
n_edges = graph.number_of_edges()

In [31]:
graph = graph.int().to(device)

In [32]:
n_iters = 3000
epochs = 1000
log_steps = 100
input_dim = features.shape[1]
hidden_channels = input_dim * 2
output_dim = dataset.num_classes
lr_rate = 0.001

In [33]:
class ThreeLayer(torch.nn.Module):
    def __init__(self, input_dim, hidden_channels, output_dim, dropout):
        super(ThreeLayer, self).__init__()
        
        self.convs = torch.nn.ModuleList()
        self.bns = torch.nn.ModuleList()
        
        self.convs.append(SAGEConv(input_dim, hidden_channels, 'mean'))
        self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
        
        self.convs.append(SAGEConv(hidden_channels, hidden_channels, 'mean'))
        self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
        
        self.convs.append(SAGEConv(hidden_channels, output_dim, 'mean'))
        
        self.dropout = dropout
        

    def forward(self, graph, x):
        x = self.convs[0](graph, x)
        x = self.bns[0](x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        x = self.convs[1](graph, x)
        x = self.bns[1](x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        x = self.convs[2](graph, x)
        return x.log_softmax(dim=-1)

In [34]:
model = ThreeLayer(input_dim,
                   hidden_channels,
                   output_dim,
                   0.5).cuda()

In [35]:
def train(model, graph, features, train_mask, optimizer):
    model.train()

    optimizer.zero_grad()
    out = model(graph, features)[train_mask]
    loss = F.nll_loss(out, labels.squeeze(1)[train_mask])
    loss.backward()
    optimizer.step()

    return loss.item()

In [36]:
@torch.no_grad()
def test(model, graph, features, labels, train_mask, val_mask, test_mask, evaluator):
    model.eval()

    out = model(graph, features)
    y_pred = out.argmax(dim=-1, keepdim=True)

    train_acc = evaluator.eval({
        'y_true': labels[train_mask],
        'y_pred': y_pred[train_mask],
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': labels[val_mask],
        'y_pred': y_pred[val_mask],
    })['acc']
    test_acc = evaluator.eval({
        'y_true': labels[test_mask],
        'y_pred': y_pred[test_mask],
    })['acc']

    return train_acc, valid_acc, test_acc

In [37]:
evaluator = Evaluator(name='ogbn-arxiv')

In [38]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)
for epoch in range(1, 1 + epochs):
    loss = train(model, graph, features, train_mask, optimizer)
    result = test(model, graph, features, labels, train_mask, val_mask, test_mask, evaluator)

    if epoch % log_steps == 0:
        train_acc, valid_acc, test_acc = result
        print(f'Epoch: {epoch:02d}, '
              f'Loss: {loss:.4f}, '
              f'Train: {100 * train_acc:.2f}%, '
              f'Valid: {100 * valid_acc:.2f}% '
              f'Test: {100 * test_acc:.2f}%')

Epoch: 100, Loss: 1.4220, Train: 66.24%, Valid: 65.66% Test: 63.98%
Epoch: 200, Loss: 1.2235, Train: 68.88%, Valid: 67.38% Test: 66.01%
Epoch: 300, Loss: 1.1276, Train: 70.44%, Valid: 68.03% Test: 66.81%
Epoch: 400, Loss: 1.0681, Train: 71.54%, Valid: 68.38% Test: 67.23%
Epoch: 500, Loss: 1.0217, Train: 72.55%, Valid: 68.64% Test: 67.51%
Epoch: 600, Loss: 0.9900, Train: 73.50%, Valid: 68.91% Test: 67.60%
Epoch: 700, Loss: 0.9555, Train: 74.36%, Valid: 69.16% Test: 68.05%
Epoch: 800, Loss: 0.9255, Train: 75.22%, Valid: 69.27% Test: 68.05%
Epoch: 900, Loss: 0.8994, Train: 76.03%, Valid: 69.61% Test: 68.31%
Epoch: 1000, Loss: 0.8777, Train: 76.84%, Valid: 69.58% Test: 68.33%


In [40]:
extra_epochs = 1000
for epoch in range(1 + epochs, 1 + epochs + extra_epochs):
    loss = train(model, graph, features, train_mask, optimizer)
    result = test(model, graph, features, labels, train_mask, val_mask, test_mask, evaluator)

    if epoch % log_steps == 0:
        train_acc, valid_acc, test_acc = result
        print(f'Epoch: {epoch:02d}, '
              f'Loss: {loss:.4f}, '
              f'Train: {100 * train_acc:.2f}%, '
              f'Valid: {100 * valid_acc:.2f}% '
              f'Test: {100 * test_acc:.2f}%')

Epoch: 1100, Loss: 0.8497, Train: 77.55%, Valid: 69.74% Test: 68.29%
Epoch: 1200, Loss: 0.8392, Train: 78.26%, Valid: 69.67% Test: 68.37%
Epoch: 1300, Loss: 0.8177, Train: 78.92%, Valid: 69.74% Test: 68.28%
Epoch: 1400, Loss: 0.7998, Train: 79.57%, Valid: 69.57% Test: 68.24%
Epoch: 1500, Loss: 0.7869, Train: 80.07%, Valid: 69.61% Test: 68.26%
Epoch: 1600, Loss: 0.7718, Train: 80.61%, Valid: 69.51% Test: 68.21%
Epoch: 1700, Loss: 0.7582, Train: 81.07%, Valid: 69.49% Test: 67.97%
Epoch: 1800, Loss: 0.7443, Train: 81.54%, Valid: 69.66% Test: 68.15%
Epoch: 1900, Loss: 0.7349, Train: 81.91%, Valid: 69.55% Test: 68.12%
Epoch: 2000, Loss: 0.7276, Train: 82.31%, Valid: 69.84% Test: 68.52%
