In [1]:
!pip install Ninja

Collecting Ninja
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/307.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m297.0/307.2 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ninja
Successfully installed Ninja-1.11.1.1


In [2]:
import torch

#CUDA SetUp

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [4]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-yjns9cst
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-yjns9cst
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 5741c522547756ac4bb7a16df32106a15efb8a57
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10742 sha256=655723c0d2f4841dd76e45e772f96358a5fda83aba7888a669bf1eb2bb64257b
  Stored in directory: /tmp/pip-ephem-wheel-cache-h1a35k5u/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bu

In [5]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpkuyzir7_".


In [6]:
!git clone "https://github.com/charlifu/TLPGNN.git"

Cloning into 'TLPGNN'...
remote: Enumerating objects: 52, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 52 (delta 9), reused 16 (delta 5), pack-reused 32[K
Receiving objects: 100% (52/52), 59.75 MiB | 14.10 MiB/s, done.
Resolving deltas: 100% (20/20), done.


In [7]:
%cd TLPGNN

/content/TLPGNN


In [9]:
import argparse, time
import numpy as np
import networkx as nx
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.profiler as profiler
import scipy.sparse as sp
from torch.utils.cpp_extension import load_inline

def read_data(dataset):
    data_path = "/content/TLPGNN/data/citeseer/"
    ret = {}
    ret['features'] = np.load(data_path+'features.npy')
    ret['graph'] = sp.load_npz(data_path+'csr.npz').tocsc()
    ret['graph'].sort_indices()
    return ret

cpp_source = '''
#include <vector>


std::vector<torch::Tensor> gcn_conv_cuda_forward(
        torch::Tensor features,
        torch::Tensor col_starts,
        torch::Tensor rows);

std::vector<torch::Tensor> gcn_conv_cuda_backward(
        torch::Tensor features,
        torch::Tensor grad,
        torch::Tensor indegs,
        torch::Tensor row_starts,
        torch::Tensor cols);

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor> gcn_conv_forward(
        torch::Tensor features,
        torch::Tensor col_starts,
        torch::Tensor rows)
{
    CHECK_INPUT(features);
    CHECK_INPUT(col_starts);
    CHECK_INPUT(rows);

    return gcn_conv_cuda_forward(features, col_starts, rows);
}

std::vector<torch::Tensor> gcn_conv_backward(
        torch::Tensor features,
        torch::Tensor grad,
        torch::Tensor indegs,
        torch::Tensor row_starts,
        torch::Tensor cols)
{
    CHECK_INPUT(features);
    CHECK_INPUT(grad);
    CHECK_INPUT(indegs);
    CHECK_INPUT(row_starts);
    CHECK_INPUT(cols);

    return gcn_conv_cuda_backward(features, grad, indegs, row_starts, cols);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &gcn_conv_forward, "GCN conv forward (CUDA)");
    m.def("backward", &gcn_conv_backward, "GCN conv backward (CUDA)");
}
'''

cuda_source = open("/content/TLPGNN/gcn/naive_kernel.cu").read()

gcn_module = load_inline(name="cn",
        cpp_sources=[cpp_source],
        cuda_sources=[cuda_source],
        extra_cuda_cflags=['-Xptxas -O3 -m 64'],
        verbose=False)


def main(dataset="citeseer", size=32, gpu=0):
    th.cuda.set_device(gpu)
    data = read_data(dataset)
    features = torch.tensor(data['features'][:], dtype=torch.float32, device='cuda')
    #features = th.cuda.FloatTensor(data['features'][:,0:size])
    indptr = data['graph'].indptr
    #indegs = th.cuda.FloatTensor([indptr[i+1] - indptr[i] for i in range(len(indptr)-1)])
    col_starts = th.cuda.IntTensor(indptr)
    rows = th.cuda.IntTensor(data['graph'].indices)
    #row_starts = th.cuda.IntTensor(data['graph'].tocsr().indptr)
    #cols = th.cuda.IntTensor(data['graph'].tocsr().indices)

    gcn_module.forward(features, col_starts, rows)
    th.cuda.synchronize()

    run_time = 0.0
    for _ in range(10):
        start_run = time.perf_counter()
        rst = gcn_module.forward(features, col_starts, rows)
        print(rst[0].size())
        th.cuda.synchronize()
        run_time += (time.perf_counter() - start_run)

    print('Time (ms): {:.3f}'.format(run_time*1e3/10))

    return run_time * 1e3 / 10
    # t = time.time()
    # rst = gcn_module.backward(features, th.ones_like(features), indegs, row_starts, cols)
    # th.cuda.synchronize()
    # print(time.time() - t)
    # print(rst[0])

if __name__ == "__main__":
    main()


torch.Size([3327, 3703])
torch.Size([3327, 3703])
torch.Size([3327, 3703])
torch.Size([3327, 3703])
torch.Size([3327, 3703])
torch.Size([3327, 3703])
torch.Size([3327, 3703])
torch.Size([3327, 3703])
torch.Size([3327, 3703])
torch.Size([3327, 3703])
Time (ms): 1.995


  col_starts = th.cuda.IntTensor(indptr)


## GAT

In [10]:
import argparse, time
import numpy as np
import networkx as nx
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.profiler as profiler
import scipy.sparse as sp
from torch.utils.cpp_extension import load_inline

def read_data(dataset):
    data_path = "/content/TLPGNN/data/citeseer/"
    ret = {}
    ret['features'] = np.load(data_path+'features.npy')
    ret['graph'] = sp.load_npz(data_path+'csr.npz').tocsc()
    ret['graph'].sort_indices()
    return ret

cpp_source = '''
#include <vector>

std::vector<torch::Tensor> gat_conv_cuda_forward(
        torch::Tensor features,
        torch::Tensor el,
        torch::Tensor er,
        torch::Tensor col_starts,
        torch::Tensor rows);

std::vector<torch::Tensor> gat_conv_cuda_backward(
        torch::Tensor features,
        torch::Tensor el,
        torch::Tensor er,
        torch::Tensor grad,
        torch::Tensor row_starts,
        torch::Tensor cols);

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor> gat_conv_forward(
        torch::Tensor features,
        torch::Tensor el,
        torch::Tensor er,
        torch::Tensor col_starts,
        torch::Tensor rows)
{
    CHECK_INPUT(features);
    CHECK_INPUT(col_starts);
    CHECK_INPUT(rows);
    CHECK_INPUT(el);
    CHECK_INPUT(er);

    return gat_conv_cuda_forward(features, el, er, col_starts, rows);
}

// std::vector<torch::Tensor> gat_conv_backward(
//         torch::Tensor features,
//         torch::Tensor el,
//         torch::Tensor er,
//         torch::Tensor grad,
//         torch::Tensor row_starts,
//         torch::Tensor cols)
// {
//     CHECK_INPUT(features);
//     CHECK_INPUT(row_starts);
//     CHECK_INPUT(cols);
//     CHECK_INPUT(el);
//     CHECK_INPUT(er);
//     CHECK_INPUT(grad);
//
//     return gat_conv_cuda_backward(features, el, er, grad, row_starts, cols);
// }

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &gat_conv_forward, "GAT aggregate forward (CUDA)");
    // m.def("backward", &gat_conv_backward, "GAT aggregate backward (CUDA)");
}
'''

cuda_source = open("/content/TLPGNN/gat/kernel.cu").read()

gat_module = load_inline(name="gat",
        cpp_sources=[cpp_source],
        cuda_sources=[cuda_source],
        extra_cuda_cflags=['-m 64'],
        verbose=False)


def main(dataset="citeseer", size=3703, gpu=0,heads=1):
    th.cuda.set_device(gpu)

    # load and preprocess dataset
    data = read_data(dataset)
    indptr = data['graph'].indptr
    col_starts = th.cuda.IntTensor(indptr)
    rows = th.cuda.IntTensor(data['graph'].indices)

    features = th.cuda.FloatTensor(data['features'][:,0:size*heads]).view(-1, heads, size)
    el = th.cuda.FloatTensor(data['features'][:,10:10+heads])#.unsqueeze(-1)
    er = th.cuda.FloatTensor(data['features'][:,20:20+heads])#.unsqueeze(-1)

    rst = gat_module.forward(features, el, er, col_starts, rows)
    th.cuda.synchronize()

    run_time = 0.0
    for _ in range(10):
        start_run = time.perf_counter()
        gat_module.forward(features, el, er, col_starts, rows)
        th.cuda.synchronize()
        run_time += (time.perf_counter() - start_run)

    print('Time (ms): {:.3f}'.format(run_time*1e3/10))
    return '{:.3f}'.format(run_time*1e3/10)

if __name__ == "__main__":
    main()


Time (ms): 3.286


## DGL Library GAT

In [31]:
!cp "/content/dgl/examples/pytorch/gat/train.py" "/content"

In [42]:
!python3 "/content/train_GAT.py" --dataset citeseer

Training with DGL built-in GATConv module.
  NumNodes: 3327
  NumEdges: 9228
  NumFeats: 3703
  NumClasses: 6
  NumTrainingSamples: 120
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
Training...
Epoch 00000 | Loss 4.1594 | Accuracy 0.1920 
Forward Time: 0.9447674751281738
Epoch 00001 | Loss 4.1468 | Accuracy 0.2400 
Forward Time: 0.0025954246520996094
Epoch 00002 | Loss 4.1321 | Accuracy 0.3000 
Forward Time: 0.0027472972869873047
Epoch 00003 | Loss 4.1196 | Accuracy 0.3400 
Forward Time: 0.003980159759521484
Epoch 00004 | Loss 4.1047 | Accuracy 0.3620 
Forward Time: 0.0026252269744873047
Epoch 00005 | Loss 4.0886 | Accuracy 0.3760 
Forward Time: 0.0021626949310302734
Epoch 00006 | Loss 4.0798 | Accuracy 0.4060 
Forward Time: 0.002307415008544922
Epoch 00007 | Loss 4.0638 | Accuracy 0.4040 
Forward Time: 0.0024874210357666016
Epoch 00008 | Loss 4.0503 | Accuracy 0.4220 
Forward Time: 0.0027971267700195312
Epoch 00009 | Loss 4.0365 | Accuracy 0.4

## DGL Library GCN

In [13]:
%cd /content

/content


In [21]:
!pip install  dgl -f https://data.dgl.ai/wheels/cu121/repo.html
!pip install  dglgo -f https://data.dgl.ai/wheels-test/repo.html

Looking in links: https://data.dgl.ai/wheels/cu121/repo.html
Collecting dgl
  Downloading https://data.dgl.ai/wheels/cu121/dgl-2.1.0%2Bcu121-cp310-cp310-manylinux1_x86_64.whl (467.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m467.5/467.5 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-2.1.0+cu121
Looking in links: https://data.dgl.ai/wheels-test/repo.html
Collecting dglgo
  Downloading dglgo-0.0.2-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting isort>=5.10.1 (from dglgo)
  Downloading isort-5.13.2-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.3/92.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autopep8>=1.6.0 (from dglgo)
  Downloading autopep8-2.1.0-py2.py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [14]:
!git clone "https://github.com/dmlc/dgl.git"

Cloning into 'dgl'...
remote: Enumerating objects: 53142, done.[K
remote: Counting objects: 100% (4307/4307), done.[K
remote: Compressing objects: 100% (448/448), done.[K
remote: Total 53142 (delta 4021), reused 3961 (delta 3852), pack-reused 48835[K
Receiving objects: 100% (53142/53142), 27.87 MiB | 11.68 MiB/s, done.
Resolving deltas: 100% (35889/35889), done.


In [None]:
%cd dgl

/content/dgl


In [41]:
!python3 "/content/train_GCN.py" --dataset citeseer

Training with DGL built-in GraphConv module.
  NumNodes: 3327
  NumEdges: 9228
  NumFeats: 3703
  NumClasses: 6
  NumTrainingSamples: 120
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
Training...
Epoch 00000 | Loss 1.7914 | Accuracy 0.4300 | Forward Time 0.5103
Epoch 00001 | Loss 1.7576 | Accuracy 0.4900 | Forward Time 0.0009
Epoch 00002 | Loss 1.7243 | Accuracy 0.5380 | Forward Time 0.0005
Epoch 00003 | Loss 1.6916 | Accuracy 0.5560 | Forward Time 0.0005
Epoch 00004 | Loss 1.6594 | Accuracy 0.5700 | Forward Time 0.0005
Epoch 00005 | Loss 1.6280 | Accuracy 0.5840 | Forward Time 0.0005
Epoch 00006 | Loss 1.5973 | Accuracy 0.5920 | Forward Time 0.0005
Epoch 00007 | Loss 1.5675 | Accuracy 0.6020 | Forward Time 0.0005
Epoch 00008 | Loss 1.5384 | Accuracy 0.6060 | Forward Time 0.0005
Epoch 00009 | Loss 1.5102 | Accuracy 0.5980 | Forward Time 0.0005
Time (ms): 51.509
Testing...
Test accuracy 0.6080
