In [None]:
!pip install Ninja



In [None]:
import torch

# CUDA Setup

In [None]:
# Print the version of the CUDA compiler driver (nvcc) available in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-9vj9kwo8
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-9vj9kwo8
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 5741c522547756ac4bb7a16df32106a15efb8a57
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Load the nvcc4jupyter IPython extension; presumably this is what provides the
# %%cuda cell magic used later in this notebook.
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpfzqkpl_2".


In [None]:
# Fetch the TLPGNN repository. Later cells read its dataset files and
# gcn/naive_kernel.cu through absolute /content/TLPGNN/... paths.
!git clone "https://github.com/charlifu/TLPGNN.git"

Cloning into 'TLPGNN'...
remote: Enumerating objects: 52, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 52 (delta 9), reused 16 (delta 5), pack-reused 32[K
Receiving objects: 100% (52/52), 59.75 MiB | 32.25 MiB/s, done.
Resolving deltas: 100% (20/20), done.


In [None]:
%cd TLPGNN

/content/TLPGNN


In [None]:
import argparse, time
import numpy as np
import networkx as nx
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.profiler as profiler
import scipy.sparse as sp
from torch.utils.cpp_extension import load_inline

In [None]:
import argparse, time
import numpy as np
import networkx as nx
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.profiler as profiler
import scipy.sparse as sp
from torch.utils.cpp_extension import load_inline

def read_data(dataset, data_root="/content/TLPGNN/data"):
    """Load the node features and adjacency structure of one dataset.

    Args:
        dataset: Name of the dataset sub-directory under ``data_root``
            (for example ``"citeseer"``).
        data_root: Directory that contains one sub-directory per dataset.
            Defaults to the data folder of the cloned TLPGNN repository.

    Returns:
        dict with two entries:
            ``'features'``: array loaded from ``features.npy``.
            ``'graph'``: the matrix stored in ``csr.npz`` converted to CSC
            format, with the indices of every column sorted.
    """
    import os  # local import: keeps this cell runnable without editing the import cell

    # The dataset name now selects the directory; it used to be ignored in
    # favour of a hard-coded citeseer path.
    data_dir = os.path.join(data_root, dataset)

    graph = sp.load_npz(os.path.join(data_dir, 'csr.npz')).tocsc()
    graph.sort_indices()

    return {
        'features': np.load(os.path.join(data_dir, 'features.npy')),
        'graph': graph,
    }

# C++ glue code handed to load_inline as the `cpp_sources` entry.
# It forward-declares the two CUDA entry points (gcn_conv_cuda_forward /
# gcn_conv_cuda_backward), wraps each in a host function that first checks that
# every tensor argument is a contiguous CUDA tensor, and exposes the wrappers
# to Python as `forward` and `backward` through PYBIND11_MODULE.
# NOTE(review): the string includes only <vector>; presumably load_inline
# supplies the torch extension header itself - confirm.
cpp_source = '''
#include <vector>


std::vector<torch::Tensor> gcn_conv_cuda_forward(
        torch::Tensor features,
        torch::Tensor col_starts,
        torch::Tensor rows);

std::vector<torch::Tensor> gcn_conv_cuda_backward(
        torch::Tensor features,
        torch::Tensor grad,
        torch::Tensor indegs,
        torch::Tensor row_starts,
        torch::Tensor cols);

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor> gcn_conv_forward(
        torch::Tensor features,
        torch::Tensor col_starts,
        torch::Tensor rows)
{
    CHECK_INPUT(features);
    CHECK_INPUT(col_starts);
    CHECK_INPUT(rows);

    return gcn_conv_cuda_forward(features, col_starts, rows);
}

std::vector<torch::Tensor> gcn_conv_backward(
        torch::Tensor features,
        torch::Tensor grad,
        torch::Tensor indegs,
        torch::Tensor row_starts,
        torch::Tensor cols)
{
    CHECK_INPUT(features);
    CHECK_INPUT(grad);
    CHECK_INPUT(indegs);
    CHECK_INPUT(row_starts);
    CHECK_INPUT(cols);

    return gcn_conv_cuda_backward(features, grad, indegs, row_starts, cols);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &gcn_conv_forward, "GCN conv forward (CUDA)");
    m.def("backward", &gcn_conv_backward, "GCN conv backward (CUDA)");
}
'''

# Read the CUDA kernel source from the cloned repository. The context manager
# closes the file handle deterministically instead of leaving it to the GC.
with open("/content/TLPGNN/gcn/naive_kernel.cu") as kernel_file:
    cuda_source = kernel_file.read()

# JIT-compile the C++ bindings together with the CUDA kernels into an
# importable extension module exposing `forward` and `backward`.
gcn_module = load_inline(name="cn",
        cpp_sources=[cpp_source],
        cuda_sources=[cuda_source],
        extra_cuda_cflags=['-Xptxas -O3 -m 64'],
        verbose=False)


def main(dataset="citeseer", size=32, gpu=0, n_runs=10):
    """Benchmark the forward GCN convolution kernel on one dataset.

    Args:
        dataset: Dataset name forwarded to ``read_data``.
        size: Number of leading feature columns to keep.
        gpu: CUDA device to run on.
        n_runs: Number of timed kernel invocations to average over.

    Returns:
        Average wall-clock time of one forward call, in milliseconds.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be a positive integer")

    th.cuda.set_device(gpu)
    device = th.device("cuda", th.cuda.current_device())

    data = read_data(dataset)
    graph = data['graph']

    # th.tensor replaces the legacy th.cuda.FloatTensor / IntTensor constructors.
    # .contiguous() guarantees the layout that the extension's CHECK_CONTIGUOUS expects.
    features = th.tensor(data['features'][:, 0:size],
                         dtype=th.float32, device=device).contiguous()
    col_starts = th.tensor(graph.indptr, dtype=th.int32, device=device)
    rows = th.tensor(graph.indices, dtype=th.int32, device=device)

    # Warm-up call, deliberately excluded from the measurement.
    gcn_module.forward(features, col_starts, rows)
    th.cuda.synchronize()

    run_time = 0.0
    for _ in range(n_runs):
        start_run = time.perf_counter()
        gcn_module.forward(features, col_starts, rows)
        # Kernel launches are asynchronous; wait for completion before stopping the clock.
        th.cuda.synchronize()
        run_time += time.perf_counter() - start_run

    avg_ms = run_time * 1e3 / n_runs
    print('Time (ms): {:.3f}'.format(avg_ms))

    return avg_ms


if __name__ == "__main__":
    main()


[tensor([[-0.2550,  1.2813, -0.2190,  ...,  0.6278, -0.5823,  1.4813],
        [-0.4947,  0.5090,  0.7407,  ...,  0.7368, -0.2246, -1.1289],
        [-0.7635, -0.5570,  0.0670,  ...,  0.9008,  0.0640,  2.0496],
        ...,
        [-0.0403,  0.3058,  1.1747,  ...,  0.7907,  0.1276, -0.8182],
        [ 0.6092,  0.6710, -1.5096,  ...,  0.6340, -0.5538, -1.3590],
        [-0.5272, -0.7489, -0.3915,  ..., -0.6998, -0.3895, -1.1157]],
       device='cuda:0')]
[tensor([[-0.2550,  1.2813, -0.2190,  ...,  0.6278, -0.5823,  1.4813],
        [-0.4947,  0.5090,  0.7407,  ...,  0.7368, -0.2246, -1.1289],
        [-0.7635, -0.5570,  0.0670,  ...,  0.9008,  0.0640,  2.0496],
        ...,
        [-0.0403,  0.3058,  1.1747,  ...,  0.7907,  0.1276, -0.8182],
        [ 0.6092,  0.6710, -1.5096,  ...,  0.6340, -0.5538, -1.3590],
        [-0.5272, -0.7489, -0.3915,  ..., -0.6998, -0.3895, -1.1157]],
       device='cuda:0')]
[tensor([[-0.2550,  1.2813, -0.2190,  ...,  0.6278, -0.5823,  1.4813],
        [

In [None]:
# Create data.json in the current directory if it does not exist yet.
# NOTE(review): `touch` only creates an empty file, and an empty file is not valid JSON;
# the CUDA program below reads "features" and "graph" from this file, so it
# presumably has to be filled with real data before that cell is run - confirm.
!touch data.json

In [None]:
%%cuda --compiler-args "--library json-c"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <json-c/json.h>
#include <cuda.h>

#define DATA_FILE "data.json"

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Host-side copy of the graph and its node features.
typedef struct {
    float *features;  // flat feature array, n_vertex * fsize values
    int *col_starts;  // per-vertex start offsets into rows, n_vertex + 1 entries
    int *rows;        // neighbour indices, n_edges entries
    int n_vertex;     // number of vertices (length of col_starts minus one)
    int fsize;        // number of features per vertex
    int n_edges;      // number of entries in rows
} Dataset;

// Print a message to stderr and terminate the program.
static void die(const char *message) {
    fprintf(stderr, "%s\n", message);
    exit(EXIT_FAILURE);
}

// Read a whole file into a freshly allocated, NUL-terminated buffer.
static char *read_text_file(const char *filename) {
    FILE *file = fopen(filename, "rb");
    if (!file) {
        fprintf(stderr, "Failed to open file %s\n", filename);
        exit(EXIT_FAILURE);
    }

    long file_size = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        file_size = ftell(file);
    }
    if (file_size < 0 || fseek(file, 0, SEEK_SET) != 0) {
        fclose(file);
        fprintf(stderr, "Failed to determine size of %s\n", filename);
        exit(EXIT_FAILURE);
    }

    char *text = (char *)malloc((size_t)file_size + 1);
    if (!text) {
        fclose(file);
        die("Memory allocation failed");
    }

    size_t n_read = fread(text, 1, (size_t)file_size, file);
    fclose(file);
    if (n_read != (size_t)file_size) {
        free(text);
        fprintf(stderr, "Failed to read file %s\n", filename);
        exit(EXIT_FAILURE);
    }
    text[file_size] = '\0';
    return text;
}

// Look up `key` in `parent` and insist that it holds a JSON array.
static struct json_object *require_array(struct json_object *parent, const char *key) {
    struct json_object *child = NULL;
    if (!json_object_object_get_ex(parent, key, &child) ||
        !json_object_is_type(child, json_type_array)) {
        fprintf(stderr, "Missing or non-array JSON field \"%s\"\n", key);
        exit(EXIT_FAILURE);
    }
    return child;
}

// Copy a JSON array of integers into a freshly allocated int buffer.
static int *copy_int_array(struct json_object *array, int count) {
    // Allocate at least one element so an empty array does not hit malloc(0).
    int *values = (int *)malloc((size_t)(count > 0 ? count : 1) * sizeof(int));
    if (!values) {
        die("Memory allocation failed");
    }
    for (int i = 0; i < count; ++i) {
        values[i] = json_object_get_int(json_object_array_get_idx(array, i));
    }
    return values;
}

// Load a dataset from a JSON file of the form
//   { "features": [f0, f1, ...], "graph": { "col_starts": [...], "rows": [...] } }
// where "features" is a flat array holding fsize values per vertex.
// Terminates the program with a message on any I/O, parse or consistency error.
// The caller owns (and must free) the three arrays in the returned Dataset.
Dataset load_dataset(const char *filename) {
    Dataset dataset;
    memset(&dataset, 0, sizeof(dataset));

    char *json_str = read_text_file(filename);
    struct json_object *root = json_tokener_parse(json_str);
    free(json_str);
    if (!root) {
        fprintf(stderr, "Failed to parse %s as JSON\n", filename);
        exit(EXIT_FAILURE);
    }

    struct json_object *graph_obj = NULL;
    if (!json_object_object_get_ex(root, "graph", &graph_obj) ||
        !json_object_is_type(graph_obj, json_type_object)) {
        die("Missing or non-object JSON field \"graph\"");
    }
    struct json_object *features_obj = require_array(root, "features");
    struct json_object *col_starts_obj = require_array(graph_obj, "col_starts");
    struct json_object *rows_obj = require_array(graph_obj, "rows");

    int features_len = (int)json_object_array_length(features_obj);
    int col_starts_len = (int)json_object_array_length(col_starts_obj);
    int rows_len = (int)json_object_array_length(rows_obj);

    // col_starts has one entry per vertex plus a terminating offset, so the
    // vertex count comes from it; the length of rows is the number of edges.
    if (col_starts_len < 2) {
        die("\"col_starts\" must contain at least two entries");
    }
    dataset.n_vertex = col_starts_len - 1;
    dataset.n_edges = rows_len;

    if (features_len == 0 || features_len % dataset.n_vertex != 0) {
        die("\"features\" length must be a non-zero multiple of the vertex count");
    }
    dataset.fsize = features_len / dataset.n_vertex;

    dataset.features = (float *)malloc((size_t)features_len * sizeof(float));
    if (!dataset.features) {
        die("Memory allocation failed");
    }
    for (int i = 0; i < features_len; ++i) {
        struct json_object *item = json_object_array_get_idx(features_obj, i);
        dataset.features[i] = (float)json_object_get_double(item);
    }

    dataset.col_starts = copy_int_array(col_starts_obj, col_starts_len);
    dataset.rows = copy_int_array(rows_obj, rows_len);

    // Reject structures that would make the kernel index out of bounds.
    if (dataset.col_starts[0] != 0 || dataset.col_starts[dataset.n_vertex] != rows_len) {
        die("\"col_starts\" must start at 0 and end at the length of \"rows\"");
    }
    for (int v = 0; v < dataset.n_vertex; ++v) {
        if (dataset.col_starts[v] > dataset.col_starts[v + 1]) {
            die("\"col_starts\" must be non-decreasing");
        }
    }
    for (int e = 0; e < rows_len; ++e) {
        if (dataset.rows[e] < 0 || dataset.rows[e] >= dataset.n_vertex) {
            die("\"rows\" contains a vertex index out of range");
        }
    }

    json_object_put(root);
    return dataset;
}

// For every destination vertex, average the feature vectors of the vertices
// listed in rows[col_starts[v] .. col_starts[v + 1]) and store the mean in result.
// threadIdx.y selects the vertex inside the block; the threads along x step
// through the feature dimension in strides of blockDim.x.
__global__ void gcn_conv_cuda_forward_kernel(
        const int n_vertex,
        const int fsize,
        float *features,
        int *col_starts,
        int *rows,
        float *result) {

    int des_v = blockIdx.x * blockDim.y + threadIdx.y;

    if (des_v < n_vertex)
    {
        int s_pos = col_starts[des_v];
        int e_pos = col_starts[des_v + 1];
        int degree = e_pos - s_pos;

        // A vertex without incoming edges gets a zero vector; dividing by its
        // degree would otherwise produce inf and then NaN (0 * inf).
        float scale = (degree > 0) ? 1.0f / (float)degree : 0.0f;
        float *des_p = result + des_v * fsize;

        for (int k = threadIdx.x; k < fsize; k += blockDim.x) {
            float acc = 0.0f;
            for (int i = s_pos; i < e_pos; ++i)
            {
                acc += __ldg(features + rows[i] * fsize + k);
            }
            des_p[k] = acc * scale;
        }
    }
}

int main() {
    // Load dataset
    Dataset dataset = load_dataset(DATA_FILE);

    // Print dataset info
    printf("Number of vertices: %d\n", dataset.n_vertex);
    printf("Feature size: %d\n", dataset.fsize);

    const size_t feature_bytes = (size_t)dataset.n_vertex * dataset.fsize * sizeof(float);
    const size_t col_bytes = ((size_t)dataset.n_vertex + 1) * sizeof(int);
    const size_t row_bytes = (size_t)dataset.n_edges * sizeof(int);
    // Never request a zero-byte device allocation for an edge-less graph.
    const size_t row_alloc_bytes = row_bytes > 0 ? row_bytes : sizeof(int);

    // Allocate device memory
    float *d_features = NULL, *d_result = NULL;
    int *d_col_starts = NULL, *d_rows = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_features, feature_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_result, feature_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_col_starts, col_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_rows, row_alloc_bytes));

    // Copy data from host to device
    CUDA_CHECK(cudaMemcpy(d_features, dataset.features, feature_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_col_starts, dataset.col_starts, col_bytes, cudaMemcpyHostToDevice));
    if (row_bytes > 0) {
        CUDA_CHECK(cudaMemcpy(d_rows, dataset.rows, row_bytes, cudaMemcpyHostToDevice));
    }

    // Launch kernel: each block handles `vertices_per_block` vertices (its y
    // dimension) with 32 threads along x per vertex.
    const int vertices_per_block = 4;
    const int num_blocks = (dataset.n_vertex + vertices_per_block - 1) / vertices_per_block;
    dim3 threads_per_block(32, vertices_per_block);
    gcn_conv_cuda_forward_kernel<<<num_blocks, threads_per_block>>>(dataset.n_vertex, dataset.fsize, d_features, d_col_starts, d_rows, d_result);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Free device memory
    CUDA_CHECK(cudaFree(d_features));
    CUDA_CHECK(cudaFree(d_result));
    CUDA_CHECK(cudaFree(d_col_starts));
    CUDA_CHECK(cudaFree(d_rows));

    // Free host memory
    free(dataset.features);
    free(dataset.col_starts);
    free(dataset.rows);

    return 0;
}


/usr/bin/ld: /tmp/tmpxft_000024f8_00000000-11_single_file.o: in function `load_dataset(char const*)':
tmpxft_000024f8_00000000-6_single_file.cudafe1.cpp:(.text+0x1ad): undefined reference to `json_tokener_parse'
/usr/bin/ld: tmpxft_000024f8_00000000-6_single_file.cudafe1.cpp:(.text+0x1d7): undefined reference to `json_object_object_get_ex'
/usr/bin/ld: tmpxft_000024f8_00000000-6_single_file.cudafe1.cpp:(.text+0x1f1): undefined reference to `json_object_object_get_ex'
/usr/bin/ld: tmpxft_000024f8_00000000-6_single_file.cudafe1.cpp:(.text+0x1fd): undefined reference to `json_object_array_length'
/usr/bin/ld: tmpxft_000024f8_00000000-6_single_file.cudafe1.cpp:(.text+0x239): undefined reference to `json_object_put'
/usr/bin/ld: tmpxft_000024f8_00000000-6_single_file.cudafe1.cpp:(.text+0x28a): undefined reference to `json_object_array_get_idx'
/usr/bin/ld: tmpxft_000024f8_00000000-6_single_file.cudafe1.cpp:(.text+0x29a): undefined reference to `json_object_get_double'
/usr/bin/ld: tmpxft_00

In [None]:
!pip install dgl

Collecting dgl
  Downloading dgl-2.1.0-cp310-cp310-manylinux1_x86_64.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
pip install dgl

Collecting dgl
  Downloading dgl-2.1.0-cp310-cp310-manylinux1_x86_64.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2->torchdata>=0.5.0->dgl)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
!git clone "https://github.com/dmlc/dgl.git"

Cloning into 'dgl'...
remote: Enumerating objects: 53072, done.[K
remote: Counting objects: 100% (4403/4403), done.[K
remote: Compressing objects: 100% (412/412), done.[K
remote: Total 53072 (delta 4150), reused 4071 (delta 3985), pack-reused 48669[K
Receiving objects: 100% (53072/53072), 27.91 MiB | 8.44 MiB/s, done.
Resolving deltas: 100% (35872/35872), done.


In [None]:
%cd dgl

/content/dgl


In [None]:
# Run DGL's PyTorch GCN example script on the cora dataset.
# NOTE(review): the recorded output ends in a traceback at `g = g.int().to(device)`;
# presumably the installed DGL build cannot place graphs on the selected device - confirm.
!python3 "/content/dgl/examples/pytorch/gcn/train.py" --dataset cora

DGL backend not selected or invalid.  Assuming PyTorch for now.
Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)
Training with DGL built-in GraphConv module.
Downloading /root/.dgl/cora_v2.zip from https://data.dgl.ai/dataset/cora_v2.zip...
/root/.dgl/cora_v2.zip: 100% 132k/132k [00:00<00:00, 424kB/s] 
Extracting file to /root/.dgl/cora_v2_d697a464
Finished data loading and preprocessing.
  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Traceback (most recent call last):
  File "/content/dgl/examples/pytorch/gcn/train.py", line 98, in <module>
    g = g.int().to(device)
  File "/usr/local/lib/python3.10/dist-packages/dgl/heterograph.py", line 5714, in to
    ret._graph = self._graph.copy_to(utils.to_dgl_con

In [None]:
    def forward(self, g, features):
        """Apply every layer in ``self.layers`` and time each layer call.

        Dropout (``self.dropout``) is applied to the input of every layer
        except the first one.

        Args:
            g: Graph object passed unchanged to every layer.
            features: Input node features for the first layer.

        Returns:
            Tuple ``(h, run_times)``: ``h`` is the output of the last layer and
            ``run_times`` holds one entry per layer, the profiled CPU time of
            that layer call in milliseconds.
        """
        h = features
        run_times = []
        for i, layer in enumerate(self.layers):
            if i != 0:
                h = self.dropout(h)
            with torch.autograd.profiler.profile(use_cuda=True) as prof:
                # Label the layer call explicitly. Without this label no profiler
                # event is named 'GCNConv', so the filter below matched nothing
                # and every reported time was 0.
                with torch.autograd.profiler.record_function('GCNConv'):
                    h = layer(g, h)
            # Use the inclusive time of the labelled range (its self time excludes
            # the operators it wraps) and accumulate in case of several matches.
            # The profiler reports microseconds; convert to milliseconds.
            run_time = sum(
                event.cpu_time_total
                for event in prof.function_events
                if event.name == 'GCNConv'
            ) / 1000.0
            run_times.append(run_time)
        return h, run_times

dgl  my.cu  sample_data  TLPGNN


In [None]:
# Profile a single layer call and print the per-operator averages,
# sorted by self CPU time.
# NOTE(review): `layer`, `g` and `h` are not defined by any cell shown in this notebook;
# presumably this snippet was run inside the training script - confirm.
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    h = layer(g, h)

print(prof.key_averages().table(sort_by="self_cpu_time_total"))
Epoch 00170 | Loss 0.3201 | Accuracy 0.7780
-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                         Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                    aten::mul        49.99%       4.399ms        49.99%       4.399ms       2.200ms       4.432ms        50.17%       4.432ms       2.216ms             2
                     aten::mm        30.91%       2.720ms        30.98%       2.726ms       2.726ms       2.733ms        30.94%       2.744ms       2.744ms             1
              cudaEventRecord         3.88%     341.000us         3.88%     341.000us       1.894us       0.000us         0.00%       0.000us       0.000us           180
                        GSpMM         3.11%     274.000us         4.36%     384.000us     384.000us     279.000us         3.16%     392.000us     392.000us             1
                    aten::pow         1.88%     165.000us         2.20%     194.000us      97.000us     100.000us         1.13%     204.000us     102.000us             2
                 aten::arange         1.58%     139.000us         2.88%     253.000us      42.167us     147.000us         1.66%     280.000us      46.667us             6
               aten::_to_copy         1.36%     120.000us         2.19%     193.000us      38.600us     119.000us         1.35%     214.000us      42.800us             5
                    aten::add         0.60%      53.000us         0.60%      53.000us      53.000us      61.000us         0.69%      61.000us      61.000us             1
             aten::as_strided         0.52%      46.000us         0.52%      46.000us       5.750us      85.000us         0.96%      85.000us      10.625us             8
                     aten::to         0.51%      45.000us         2.85%     251.000us      27.889us      75.000us         0.85%     289.000us      32.111us             9
                    aten::min         0.50%      44.000us         1.08%      95.000us      95.000us      43.000us         0.49%      98.000us      98.000us             1
                  aten::clamp         0.41%      36.000us         0.56%      49.000us      24.500us      58.000us         0.66%      67.000us      33.500us             2
                    aten::any         0.40%      35.000us         0.51%      45.000us      45.000us      38.000us         0.43%      50.000us      50.000us             1
                     aten::eq         0.36%      32.000us         1.00%      88.000us      88.000us      37.000us         0.42%      94.000us      94.000us             1
                  aten::copy_         0.35%      31.000us         0.35%      31.000us       6.200us      55.000us         0.62%      55.000us      11.000us             5
                  aten::fill_         0.34%      30.000us         0.34%      30.000us      10.000us      43.000us         0.49%      43.000us      14.333us             3
                    aten::sum         0.33%      29.000us         0.80%      70.000us      70.000us      29.000us         0.33%      74.000us      74.000us             1
              aten::clamp_min         0.32%      28.000us         0.32%      28.000us      28.000us      54.000us         0.61%      54.000us      54.000us             1
                   aten::relu         0.31%      27.000us         0.91%      80.000us      80.000us      34.000us         0.38%      88.000us      88.000us             1
                   aten::item         0.27%      24.000us         0.41%      36.000us      12.000us      31.000us         0.35%      47.000us      15.667us             3
                 aten::matmul         0.26%      23.000us        31.32%       2.756ms       2.756ms      17.000us         0.19%       2.761ms       2.761ms             1
                aten::reshape         0.24%      21.000us         0.44%      39.000us      19.500us      27.000us         0.31%      49.000us      24.500us             2
                  aten::zeros         0.23%      20.000us         0.80%      70.000us      70.000us      19.000us         0.22%      74.000us      74.000us             1
             aten::unsqueeze_         0.17%      15.000us         0.26%      23.000us      11.500us      17.000us         0.19%      29.000us      14.500us             2
               aten::squeeze_         0.17%      15.000us         0.23%      20.000us      10.000us      17.000us         0.19%      26.000us      13.000us             2
                  aten::empty         0.16%      14.000us         0.16%      14.000us       3.500us      34.000us         0.38%      34.000us       8.500us             4
        cudaDeviceSynchronize         0.16%      14.000us         0.16%      14.000us      14.000us       0.000us         0.00%       0.000us       0.000us             1
                   aten::view         0.14%      12.000us         0.14%      12.000us       6.000us      22.000us         0.25%      22.000us      11.000us             2
          aten::empty_strided         0.13%      11.000us         0.13%      11.000us       2.200us      40.000us         0.45%      40.000us       8.000us             5
             aten::is_nonzero         0.11%      10.000us         0.34%      30.000us      30.000us      15.000us         0.17%      36.000us      36.000us             1
                  aten::zero_         0.11%      10.000us         0.47%      41.000us      41.000us      11.000us         0.12%      45.000us      45.000us             1
                aten::resize_         0.08%       7.000us         0.08%       7.000us       2.333us      22.000us         0.25%      22.000us       7.333us             3
    aten::_local_scalar_dense         0.05%       4.000us         0.05%       4.000us       1.333us      16.000us         0.18%      16.000us       5.333us             3
            aten::as_strided_         0.05%       4.000us         0.05%       4.000us       1.000us      21.000us         0.24%      21.000us       5.250us             4
            aten::result_type         0.01%       1.000us         0.01%       1.000us       0.500us      92.000us         1.04%      92.000us      46.000us             2
           aten::resolve_conj         0.00%       0.000us         0.00%       0.000us       0.000us      11.000us         0.12%      11.000us       3.667us             3
-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
Self CPU time total: 8.799ms
Self CUDA time total: 8.834ms