In [42]:
import DeviceDir

# Resolve the working directories through the project-local DeviceDir helper.
# DIR is used further down as the dataset root and as the checkpoint location.
DIR, RESULTS_DIR = DeviceDir.get_directory()
# Select the compute device; the helper also returns a processor count
# (presumably the CPU count it prints — confirm in DeviceDir).
device, NUM_PROCESSORS = DeviceDir.get_device()

There are 1 GPU(s) available.
We will use the GPU: NVIDIA A10
cuda
Cpu count:  32


In [43]:
import os

os.environ["DGLBACKEND"] = "pytorch"
import dgl
import numpy as np
import torch
import tqdm
import sklearn.metrics

In [44]:
from ogb.nodeproppred import DglNodePropPredDataset
# Load ogbn-arxiv through OGB's DGL wrapper, using DIR as the dataset root.
dataset = DglNodePropPredDataset(root = DIR, name="ogbn-arxiv")
# Indexing the dataset yields a (graph, labels) pair.
graph, node_labels = dataset[0]

# Display the raw pair: the graph summary and the label tensor.
dataset[0]

(Graph(num_nodes=169343, num_edges=1166243,
       ndata_schemes={'year': Scheme(shape=(1,), dtype=torch.int64), 'feat': Scheme(shape=(128,), dtype=torch.float32)}
       edata_schemes={}),
 tensor([[ 4],
         [ 5],
         [28],
         ...,
         [10],
         [ 4],
         [ 1]]))

In [45]:
# Unpack the graph and the label tensor from the dataset.
graph, node_labels = dataset[0]

# ogbn-arxiv is unidirectional, so add the reverse of every edge as well.
graph = dgl.add_reverse_edges(graph)
# Attach the labels to the graph as a flat per-node vector (column 0 of node_labels).
graph.ndata["label"] = node_labels[:, 0]
print(graph)
print(node_labels)

node_features = graph.ndata["feat"]
num_features = node_features.shape[1]
# Class count is derived as the largest label id plus one (assumes 0-based ids).
num_classes = node_labels.max().item() + 1
print(f"Number of classes: {num_classes}")

Graph(num_nodes=169343, num_edges=2332486,
      ndata_schemes={'year': Scheme(shape=(1,), dtype=torch.int64), 'feat': Scheme(shape=(128,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})
tensor([[ 4],
        [ 5],
        [28],
        ...,
        [10],
        [ 4],
        [ 1]])
Number of classes: 40


In [46]:
# Node-ID splits provided by the dataset, keyed by "train" / "valid" / "test".
idx_split = dataset.get_idx_split()
train_nids, valid_nids, test_nids = (
    idx_split["train"],
    idx_split["valid"],
    idx_split["test"],
)

# One shape per line, in train / valid / test order.
print(train_nids.shape, valid_nids.shape, test_nids.shape, sep="\n")

torch.Size([90941])
torch.Size([29799])
torch.Size([48603])


In [47]:
# Neighbor sampler configured with two fanout entries; the model below consumes
# two MFGs per minibatch (mfgs[0] and mfgs[1]).
# NOTE(review): presumably each entry is the max number of neighbors sampled
# per node for that layer — confirm against the DGL NeighborSampler docs.
sampler = dgl.dataloading.NeighborSampler([4, 4])

# Minibatch loader over the training node IDs; each item it yields unpacks as
# (input_nodes, output_nodes, mfgs).
train_dataloader = dgl.dataloading.DataLoader(
    # The following arguments are specific to DGL's DataLoader.
    graph,  # The graph
    train_nids,  # The node IDs to iterate over in minibatches
    sampler,  # The neighbor sampler
    device=device,  # Put the sampled MFGs on CPU or GPU
    # The following arguments are inherited from PyTorch DataLoader.
    batch_size=1024,  # Batch size
    shuffle=True,  # Whether to shuffle the nodes for every epoch
    drop_last=False,  # Whether to drop the last incomplete batch
    num_workers=0,  # Number of sampler processes
)

In [48]:
# Pull a single minibatch from the training loader to inspect its structure.
example_minibatch = next(iter(train_dataloader))
# A minibatch unpacks into input node IDs, output node IDs and the list of MFGs.
input_nodes, output_nodes, mfgs = example_minibatch

print(example_minibatch)

print(
    f"To compute {len(output_nodes)} nodes' outputs, "
    f"we need {len(input_nodes)} nodes' input features"
)



[tensor([ 10559, 142966, 158330,  ..., 123986,  90269,  32594], device='cuda:0'), tensor([ 10559, 142966, 158330,  ..., 134556, 140985, 112807], device='cuda:0'), [Block(num_src_nodes=12638, num_dst_nodes=4048, num_edges=14587), Block(num_src_nodes=4048, num_dst_nodes=1024, num_edges=3247)]]
To compute 1024 nodes' outputs, we need 12638 nodes' input features


In [49]:
# Show only the MFG list of the example minibatch.
print(mfgs)

[Block(num_src_nodes=12638, num_dst_nodes=4048, num_edges=14587), Block(num_src_nodes=4048, num_dst_nodes=1024, num_edges=3247)]


In [50]:
# Inspect the per-node data attached to the source nodes of the first MFG.
mfgs[0].srcdata

{'year': tensor([[2013],
        [2012],
        [2016],
        ...,
        [2016],
        [2015],
        [2018]], device='cuda:0'), 'feat': tensor([[-0.1217,  0.2442, -0.3062,  ...,  0.2842, -0.1261, -0.0599],
        [-0.0203,  0.1895, -0.3183,  ..., -0.1323, -0.1415, -0.3087],
        [-0.1652, -0.1308, -0.2963,  ...,  0.1601, -0.1070, -0.1039],
        ...,
        [-0.1589, -0.0485, -0.0630,  ...,  0.1198,  0.1243, -0.1605],
        [-0.1419, -0.2851, -0.4028,  ...,  0.0036, -0.2797, -0.2738],
        [-0.1922,  0.0355, -0.2366,  ...,  0.0691, -0.0317, -0.2648]],
       device='cuda:0'), 'label': tensor([24, 28, 34,  ..., 16, 24, 16], device='cuda:0'), '_ID': tensor([ 10559, 142966, 158330,  ..., 123986,  90269,  32594], device='cuda:0')}

In [51]:
# Node IDs (stored under dgl.NID) of the first MFG's source and destination nodes.
mfg_0_src = mfgs[0].srcdata[dgl.NID]
mfg_0_dst = mfgs[0].dstdata[dgl.NID]
print(mfg_0_src)
print(mfg_0_dst)
# Check that the destination nodes are exactly the leading entries of the
# source nodes; Model.forward relies on this ordering when it slices the
# destination features out of the source features.
print(torch.equal(mfg_0_src[: mfgs[0].num_dst_nodes()], mfg_0_dst))

tensor([ 10559, 142966, 158330,  ..., 123986,  90269,  32594], device='cuda:0')
tensor([ 10559, 142966, 158330,  ...,  92565,  59255,   9371], device='cuda:0')
True


In [52]:
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import SAGEConv


class Model(nn.Module):
    """Two-layer GraphSAGE network (mean aggregation) that runs on a minibatch of MFGs.

    Args:
        in_feats: Size of each input node feature vector.
        h_feats: Output size of the first layer and input size of the second.
        num_classes: Output size of the second layer.
    """

    def __init__(self, in_feats, h_feats, num_classes):
        super().__init__()
        self.h_feats = h_feats
        # Layers are created in conv1 -> conv2 order.
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type="mean")
        self.conv2 = SAGEConv(h_feats, num_classes, aggregator_type="mean")

    def forward(self, mfgs, x):
        """Run both layers over the first two MFGs.

        Args:
            mfgs: Indexable collection of MFGs; ``mfgs[0]`` feeds the first
                layer and ``mfgs[1]`` the second.
            x: Features of the source nodes of ``mfgs[0]``.

        Returns:
            The output of the second layer; no activation is applied to it.
        """
        # Each layer receives a (source, destination) feature pair, where the
        # destination features are the leading rows of the source features.
        block = mfgs[0]
        hidden = self.conv1(block, (x, x[: block.num_dst_nodes()]))
        hidden = F.relu(hidden)

        block = mfgs[1]
        return self.conv2(block, (hidden, hidden[: block.num_dst_nodes()]))


# Build the network with a hidden size of 128 and place it on the selected device.
model = Model(num_features, 128, num_classes).to(device)

In [53]:
def train(epochs=10):
    """Train the global ``model`` and checkpoint its best validation epoch.

    Runs ``epochs`` passes over ``train_dataloader`` using Adam with its
    default hyper-parameters, and scores the model on the validation nodes
    after every pass. Whenever the validation accuracy improves on the best
    value seen during this call, the model's ``state_dict`` is written to
    ``arxiv-model.pt`` inside ``DIR``.

    Relies on notebook globals: ``model``, ``graph``, ``sampler``,
    ``train_dataloader``, ``valid_nids``, ``device`` and ``DIR``.

    Args:
        epochs: Number of training epochs to run.

    Returns:
        The best validation accuracy observed during this call (``0`` if no
        epoch ran or none scored above zero).
    """
    opt = torch.optim.Adam(model.parameters())

    # Validation reuses the training `sampler`, so it is scored on sampled
    # neighborhoods as well; node order is kept fixed (shuffle=False).
    valid_dataloader = dgl.dataloading.DataLoader(
        graph,
        valid_nids,
        sampler,
        batch_size=1024,
        shuffle=False,
        drop_last=False,
        num_workers=0,
        device=device,
    )

    best_accuracy = 0
    # os.path.join gives the same path as plain concatenation when DIR already
    # ends with a separator, and still lands inside DIR when it does not.
    best_model_path = os.path.join(DIR, "arxiv-model.pt")

    for epoch in range(epochs):
        model.train()

        with tqdm.tqdm(train_dataloader) as tq:
            for input_nodes, output_nodes, mfgs in tq:
                # feature copy from CPU to GPU takes place here
                inputs = mfgs[0].srcdata["feat"]
                labels = mfgs[-1].dstdata["label"]

                predictions = model(mfgs, inputs)

                loss = F.cross_entropy(predictions, labels)
                opt.zero_grad()
                loss.backward()
                opt.step()

                # Accuracy on the current training batch; only shown in the
                # progress bar.
                accuracy = sklearn.metrics.accuracy_score(
                    labels.cpu().numpy(),
                    predictions.argmax(1).detach().cpu().numpy(),
                )

                tq.set_postfix(
                    {"loss": "%.03f" % loss.item(), "acc": "%.03f" % accuracy},
                    refresh=False,
                )

        model.eval()

        # Collect per-batch labels and predictions under separate names so the
        # training-loop tensors above are not rebound to lists/arrays.
        valid_labels = []
        valid_predictions = []
        with tqdm.tqdm(valid_dataloader) as tq, torch.no_grad():
            for input_nodes, output_nodes, mfgs in tq:
                inputs = mfgs[0].srcdata["feat"]
                valid_labels.append(mfgs[-1].dstdata["label"].cpu().numpy())
                valid_predictions.append(model(mfgs, inputs).argmax(1).cpu().numpy())

        valid_accuracy = sklearn.metrics.accuracy_score(
            np.concatenate(valid_labels), np.concatenate(valid_predictions)
        )
        print("Epoch {} Validation Accuracy {}".format(epoch, valid_accuracy))

        # Keep only the weights of the best-scoring epoch on disk.
        if best_accuracy < valid_accuracy:
            best_accuracy = valid_accuracy
            torch.save(model.state_dict(), best_model_path)

    return best_accuracy


In [54]:
# Run training with the default of 10 epochs.
train()

100%|██████████| 89/89 [00:00<00:00, 156.29it/s, loss=1.877, acc=0.522]
100%|██████████| 30/30 [00:00<00:00, 122.63it/s]


Epoch 0 Validation Accuracy 0.5567300916138126


100%|██████████| 89/89 [00:00<00:00, 157.78it/s, loss=1.465, acc=0.584]
100%|██████████| 30/30 [00:00<00:00, 182.54it/s]


Epoch 1 Validation Accuracy 0.6106580757743548


100%|██████████| 89/89 [00:00<00:00, 146.45it/s, loss=1.350, acc=0.591]
100%|██████████| 30/30 [00:00<00:00, 223.32it/s]


Epoch 2 Validation Accuracy 0.624316252223229


100%|██████████| 89/89 [00:00<00:00, 107.60it/s, loss=1.209, acc=0.616]
100%|██████████| 30/30 [00:00<00:00, 199.70it/s]


Epoch 3 Validation Accuracy 0.6344508204973321


100%|██████████| 89/89 [00:00<00:00, 161.01it/s, loss=1.138, acc=0.653]
100%|██████████| 30/30 [00:00<00:00, 233.64it/s]


Epoch 4 Validation Accuracy 0.6455250176180408


100%|██████████| 89/89 [00:00<00:00, 150.37it/s, loss=1.196, acc=0.656]
100%|██████████| 30/30 [00:00<00:00, 226.88it/s]


Epoch 5 Validation Accuracy 0.6534111882949092


100%|██████████| 89/89 [00:00<00:00, 132.53it/s, loss=1.052, acc=0.700]
100%|██████████| 30/30 [00:00<00:00, 151.76it/s]


Epoch 6 Validation Accuracy 0.6560958421423537


100%|██████████| 89/89 [00:00<00:00, 162.66it/s, loss=1.183, acc=0.653]
100%|██████████| 30/30 [00:00<00:00, 198.62it/s]


Epoch 7 Validation Accuracy 0.6576395181046344


100%|██████████| 89/89 [00:00<00:00, 132.67it/s, loss=1.162, acc=0.626]
100%|██████████| 30/30 [00:00<00:00, 231.77it/s]


Epoch 8 Validation Accuracy 0.6581093325279371


100%|██████████| 89/89 [00:00<00:00, 172.28it/s, loss=1.035, acc=0.702]
100%|██████████| 30/30 [00:00<00:00, 228.90it/s]

Epoch 9 Validation Accuracy 0.662773918587872





In [55]:
# NOTE(review): looks like a leftover scratch cell — nothing else in view uses
# this tensor; consider removing it from the finished notebook.
torch.zeros(5,5)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])