In [1]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import Sequential, Linear, ReLU, CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data.dataset import random_split

import GCL.augmentors as A
import GCL.losses as L
from GCL.models import DualBranchContrast

from torch_geometric.nn import GINConv, global_add_pool
from torch_geometric.loader import DataLoader
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import dropout_edge, add_random_edge

from tqdm import tqdm
import itertools
import warnings
import sys
from sklearn.model_selection import StratifiedKFold
import numpy as np
import os.path as osp

In [2]:
from MyGCL import GIN, Encoder, LogReg, train, train_classifier, eval_encoder

In [3]:
dataset_name = 'PROTEINS'
train_multiple_classifiers = True

# Hyperparams
lr = 0.01
num_layers = 3
epochs = 20
print(f'======The hyperparams: lr={lr}, num_layers={num_layers}, epochs={epochs}. On dataset:{dataset_name}======')

# Prefer the GPU, but fall back to CPU so the cell still starts on machines
# without CUDA.
# NOTE(review): the MyGCL helpers (train, get_embeddings) are not passed a
# device here — confirm they do not hard-code 'cuda' internally.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
path = osp.join(osp.expanduser('~'), 'datasets')
dataset = TUDataset(path, name=dataset_name)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)
# Use at least one input channel, even when the dataset has no node features.
num_features = max(dataset.num_features, 1)
if dataset.num_features == 0:
    print("No node feature, paddings of 1 will be used in GIN when forwarding.")

# Two views per graph: the unmodified graph and one of three perturbations.
aug1 = A.Identity()
aug2 = A.RandomChoice([A.RWSampling(num_seeds=1000, walk_length=86),
                       A.EdgeRemoving(pe=0.2),
                       A.NodeDropping(pn=0.2)], 1)

# The graph neural network backbone model to use
gconv = GIN(num_features=num_features, dim=32, num_gc_layers=num_layers).to(device)

encoder_model = Encoder(encoder=gconv, augmentor=(aug1, aug2)).to(device)
contrast_model = DualBranchContrast(loss=L.InfoNCE(tau=0.2), mode='G2G').to(device)
optimizer = Adam(encoder_model.parameters(), lr=lr)


# Train the encoder on the full dataset, without labels, using contrastive learning.
# The bar length follows `epochs`, so it stays correct when the hyperparameter changes.
with tqdm(total=epochs, desc='(T)') as pbar:
    for epoch in range(1, epochs + 1):
        loss = train(encoder_model, contrast_model, dataloader, optimizer)
        pbar.set_postfix({'loss': loss})
        pbar.update()

# Save the trained model parameters for reproducibility if needed
# torch.save(encoder_model.state_dict(), 'Savings/model_params/model.pt')

# Split the dataset into two parts: one for training the classifier and one for
# the final evaluation. train_set can be further divided into training and
# validation parts.
# NOTE(review): the split is unseeded, so eval_set differs between runs.
train_set, eval_set = random_split(dataset, [0.9, 0.1])

# Get embeddings for the train_set
encoder_model.eval()
dataloader_train = DataLoader(train_set, batch_size=128, shuffle=True)
embedding_global, y = encoder_model.encoder.get_embeddings(dataloader_train)



(T): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:06<00:00,  3.19it/s, loss=15.9]


In [4]:
# Fit the downstream classifier on the train_set embeddings and labels
# returned by get_embeddings in the previous cell.
classifier = train_classifier(embedding_global, y)
# Accuracy on the clean evaluation data.
# NOTE(review): `mask` is later used to filter eval_set graph by graph —
# presumably it flags the correctly classified graphs; confirm in MyGCL.
# NOTE(review): a later cell binds a single value (`acc = eval_encoder(...)`),
# which does not match this two-value unpacking; one of the two is stale.
acc_clean, mask = eval_encoder(encoder_model, classifier, eval_set)

In [7]:
len(eval_set)

111

In [15]:
mask.sum()

tensor(88, device='cuda:0')

In [13]:
# Keep only the evaluation graphs whose mask entry is truthy. The mask is
# converted to a plain Python list once, so the filter does not trigger one
# device-to-host synchronisation per graph when `mask` lives on the GPU.
# NOTE(review): this assumes `mask` has one boolean entry per graph in
# eval_set. A recorded output elsewhere in this notebook shows `mask` holding
# indices instead; in that case use `[eval_set[i] for i in mask.tolist()]`.
selected_data = [data for data, mask_value in zip(eval_set, mask.tolist()) if mask_value]

In [19]:
# Batch the filtered graphs in their list order (shuffle is not requested).
# NOTE(review): the `_adv` suffix suggests this loader feeds the perturbed /
# adversarial evaluation — confirm against the downstream cells.
eval_loader_adv = DataLoader(selected_data, batch_size=64)

In [21]:
# Shape check on the first batch only: x, y, the batch vector and the
# edge_index, printed in that order.
for batch in eval_loader_adv:
    for field_name in ('x', 'y', 'batch', 'edge_index'):
        print(getattr(batch, field_name).shape)
    break

torch.Size([2313, 3])
torch.Size([64])
torch.Size([2313])
torch.Size([2, 8954])


In [22]:
# Edge-dropping smoke test: perturb only the first selected graph and print
# its edge_index shape before and after the change.
for one_graph in selected_data[:1]:
    print(one_graph.edge_index.shape)
    # Randomly drop edges with probability 0.5; the second return value of
    # dropout_edge is discarded.
    updated_edge_index, _ = dropout_edge(one_graph.edge_index, p=0.5)
    # Write the thinned edge list back onto the graph object itself, so the
    # entry stored in selected_data is modified in place.
    one_graph.put_edge_index(updated_edge_index, layout='coo')
    print(one_graph.edge_index.shape)

torch.Size([2, 76])
torch.Size([2, 38])


In [23]:
selected_data[0].edge_index.shape

torch.Size([2, 38])

In [68]:
mask.unsqueeze(1)

tensor([[  0],
        [  1],
        [  2],
        [  3],
        [  4],
        [  5],
        [  6],
        [  7],
        [  8],
        [  9],
        [ 10],
        [ 11],
        [ 12],
        [ 13],
        [ 15],
        [ 16],
        [ 17],
        [ 18],
        [ 19],
        [ 20],
        [ 21],
        [ 22],
        [ 24],
        [ 28],
        [ 29],
        [ 31],
        [ 32],
        [ 33],
        [ 34],
        [ 36],
        [ 37],
        [ 38],
        [ 39],
        [ 40],
        [ 43],
        [ 45],
        [ 46],
        [ 47],
        [ 48],
        [ 49],
        [ 50],
        [ 51],
        [ 52],
        [ 53],
        [ 56],
        [ 57],
        [ 61],
        [ 62],
        [ 63],
        [ 64],
        [ 66],
        [ 67],
        [ 70],
        [ 71],
        [ 77],
        [ 78],
        [ 79],
        [ 82],
        [ 88],
        [ 89],
        [ 91],
        [ 92],
        [ 93],
        [ 94],
        [ 97],
        [102],
        [1

In [92]:
# Standard case
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3],
                           [1, 0, 2, 1, 3, 2]])
edge_index, added_edges = add_random_edge(edge_index, p=1, force_undirected=True)

In [93]:
edge_index

tensor([[0, 1, 1, 2, 2, 3, 2, 3],
        [1, 0, 2, 1, 3, 2, 3, 2]])

In [94]:
added_edges

tensor([[2, 3],
        [3, 2]])

In [8]:
classifier = train_classifier(embedding_global, y)

In [9]:
classifier

LogReg(
  (fc): Linear(in_features=96, out_features=1002, bias=True)
)

In [10]:
acc = eval_encoder(encoder_model, classifier, eval_set)

In [11]:
acc

tensor(0.7117, device='cuda:0')

In [16]:
# Peek at the first mini-batch only: dump its attribute dict, then report the
# Python type of x, edge_index and batch.
for data in dataloader:
    print(data.__dict__)
    labelled_fields = (
        ('type(data.x): ', data.x),
        ('type(data.edge_index): ', data.edge_index),
        ('type(data.batch): ', data.batch),
    )
    for label, value in labelled_fields:
        print(label, type(value))
    break

{'_edge_attr_cls': <class 'torch_geometric.data.data.DataEdgeAttr'>, '_tensor_attr_cls': <class 'torch_geometric.data.data.DataTensorAttr'>, '_store': {'edge_index': tensor([[    0,     0,     1,  ..., 59848, 59849, 59850],
        [  139,   340,   228,  ..., 58533, 58533, 58533]]), 'y': tensor([2, 3, 2, 2, 1, 3, 2, 4, 1, 4, 4, 2, 1, 0, 0, 0, 4, 2, 4, 4, 3, 2, 0, 2,
        4, 3, 2, 1, 3, 4, 2, 1, 3, 4, 2, 4, 4, 1, 4, 2, 0, 0, 0, 4, 4, 1, 2, 0,
        4, 1, 3, 3, 3, 4, 1, 4, 0, 2, 1, 4, 3, 2, 1, 0, 0, 1, 1, 3, 1, 4, 4, 1,
        3, 0, 1, 2, 2, 2, 4, 4, 3, 4, 4, 3, 1, 3, 1, 3, 4, 4, 3, 4, 4, 4, 0, 1,
        0, 2, 1, 2, 4, 4, 4, 3, 0, 3, 3, 0, 4, 3, 0, 1, 2, 0, 2, 3, 0, 1, 1, 3,
        2, 2, 2, 2, 3, 1, 4, 1]), 'num_nodes': 59851, 'batch': tensor([  0,   0,   0,  ..., 127, 127, 127]), 'ptr': tensor([    0,   417,   527,   792,  1005,  2123,  2243,  2715,  2912,  3316,
         3443,  3884,  4183,  4598,  4803,  5551,  6454,  6601,  7020,  7544,
         8065,  8467,  8818,  9124,  92

First, play with the dataset to learn about its data structure.

In [55]:
dataloader.__dict__

{'follow_batch': None,
 'exclude_keys': None,
 'dataset': PROTEINS(1113),
 'num_workers': 0,
 'prefetch_factor': 2,
 'pin_memory': False,
 'pin_memory_device': '',
 'timeout': 0,
 'worker_init_fn': None,
 '_DataLoader__multiprocessing_context': None,
 '_dataset_kind': 0,
 'batch_size': 128,
 'drop_last': False,
 'sampler': <torch.utils.data.sampler.RandomSampler at 0x7f57dc5e5c60>,
 'batch_sampler': <torch.utils.data.sampler.BatchSampler at 0x7f57dc5e61d0>,
 'generator': None,
 'collate_fn': <torch_geometric.loader.dataloader.Collater at 0x7f57dc6a2500>,
 'persistent_workers': False,
 '_DataLoader__initialized': True,
 '_IterableDataset_len_called': None,
 '_iterator': None}

In [56]:
# Peek at the first mini-batch only: dump its attribute dict, then report the
# Python type of x, edge_index and batch. `data` stays bound afterwards and is
# printed by the following cell.
for data in dataloader:
    print(data.__dict__)
    labelled_fields = (
        ('type(data.x): ', data.x),
        ('type(data.edge_index): ', data.edge_index),
        ('type(data.batch): ', data.batch),
    )
    for label, value in labelled_fields:
        print(label, type(value))
    break


{'_edge_attr_cls': <class 'torch_geometric.data.data.DataEdgeAttr'>, '_tensor_attr_cls': <class 'torch_geometric.data.data.DataTensorAttr'>, '_store': {'edge_index': tensor([[   0,    0,    1,  ..., 4428, 4428, 4428],
        [   1,   24,    0,  ..., 4414, 4424, 4426]]), 'x': tensor([[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        ...,
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.]]), 'y': tensor([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
        1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
        0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
        0, 1, 0, 1, 0, 1, 1, 1]), 'batch': tensor([  0,   0,   0,  ..., 127, 127, 127]), 'ptr': tensor([   0,   42,   80,  213,  277,  354,  411,  434,  456,  472,  506,  

In [12]:
print(data)

DataBatch(edge_index=[2, 17418], x=[4766, 3], y=[128], batch=[4766], ptr=[129])


Play with the dataset from DGL

In [13]:
from grabnel.src.attack.data import Data


In [16]:
# Load PROTEINS through grabnel's Data wrapper and keep the second element
# returned by adversarial_dataloaders().
# NOTE(review): the recorded run of this cell ended in a NumPy ValueError
# ("inhomogeneous shape after 1 dimensions"), so dataset_c_loader may never
# have been bound — confirm before relying on it.
dataset_DGL = Data("PROTEINS")
dataset_c_loader = dataset_DGL.adversarial_dataloaders()[1]

  array = numpy.asarray(array, order=order, dtype=dtype)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (1113,) + inhomogeneous part.

In [6]:
from dgl.data import MiniGCDataset, TUDataset

In [25]:
# TUDataset here is the DGL class imported from dgl.data in the preceding
# cell, not the torch_geometric class imported at the top of the notebook.
# NOTE(review): that import rebinds the name TUDataset, so re-running the PyG
# setup cell (TUDataset(path, name=...)) afterwards would call the DGL class.
dataset_DGL = TUDataset("PROTEINS")

TypeError: TUDataset.__init__() got an unexpected keyword argument 'device'

In [8]:
dataset_DGL.__dict__

{'_name': 'PROTEINS',
 '_url': 'https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS.zip',
 '_force_reload': False,
 '_verbose': False,
 '_hash_key': (),
 '_hash': 'f9065fa7',
 '_transform': None,
 '_raw_dir': '/nfs/homedirs/yzi/.dgl',
 '_save_dir': '/nfs/homedirs/yzi/.dgl',
 'graph_lists': [Graph(num_nodes=42, num_edges=162,
        ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64), 'node_attr': Scheme(shape=(1,), dtype=torch.float64), 'node_labels': Scheme(shape=(1,), dtype=torch.int64)}
        edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
  Graph(num_nodes=27, num_edges=92,
        ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64), 'node_attr': Scheme(shape=(1,), dtype=torch.float64), 'node_labels': Scheme(shape=(1,), dtype=torch.int64)}
        edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
  Graph(num_nodes=10, num_edges=34,
        ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64), 'node_attr': Scheme(shape=(1,), dtype=torch.

In [9]:
graphs_DGL = dataset_DGL.graph_lists

In [10]:
type(graphs_DGL[0])

dgl.heterograph.DGLGraph

In [11]:
# Take the first DGL graph of the dataset and dump its raw attribute dict to
# see how DGL stores node and edge data internally.
graph_sample_DGL = graphs_DGL[0]
print(graph_sample_DGL.__dict__)

{'_graph': <dgl.heterograph_index.HeteroGraphIndex object at 0x7f4dfc2ca6c0>, '_canonical_etypes': [('_N', '_E', '_N')], '_batch_num_nodes': None, '_batch_num_edges': None, '_ntypes': ['_N'], '_is_unibipartite': False, '_srctypes_invmap': {'_N': 0}, '_dsttypes_invmap': {'_N': 0}, '_etypes': ['_E'], '_etype2canonical': {'_E': ('_N', '_E', '_N')}, '_etypes_invmap': {('_N', '_E', '_N'): 0}, '_node_frames': [{'_ID': tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41]), 'node_attr': tensor([[23.],
        [10.],
        [25.],
        [ 7.],
        [12.],
        [11.],
        [ 5.],
        [ 7.],
        [ 9.],
        [ 3.],
        [ 6.],
        [22.],
        [ 8.],
        [26.],
        [ 7.],
        [12.],
        [11.],
        [ 5.],
        [ 7.],
        [ 8.],
        [ 3.],
        [ 6.],
        [ 3.],
        [ 9.],
        [10.],
  

In [12]:
from torch.utils.data import DataLoader
from dgl import add_self_loop
import dgl

def collate(samples, add_selfloops=True):
    """Collate function for DGL dataloaders.

    Args:
        samples: sequence of ``(graph, label)`` pairs, as yielded by the dataset.
        add_selfloops: when true, every graph is passed through
            ``add_self_loop`` before batching.

    Returns:
        A tuple ``(batched_graph, labels)``: the result of ``dgl.batch`` over
        the graphs and a tensor built from the labels, in sample order.
    """
    # Transpose the list of pairs into one list of graphs and one of labels.
    graphs, labels = (list(column) for column in zip(*samples))
    if add_selfloops:
        graphs = list(map(add_self_loop, graphs))
    return dgl.batch(graphs), torch.tensor(labels)

In [13]:
dataset_DGL.__dict__

{'_name': 'PROTEINS',
 '_url': 'https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS.zip',
 '_force_reload': False,
 '_verbose': False,
 '_hash_key': (),
 '_hash': 'f9065fa7',
 '_transform': None,
 '_raw_dir': '/nfs/homedirs/yzi/.dgl',
 '_save_dir': '/nfs/homedirs/yzi/.dgl',
 'graph_lists': [Graph(num_nodes=42, num_edges=162,
        ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64), 'node_attr': Scheme(shape=(1,), dtype=torch.float64), 'node_labels': Scheme(shape=(1,), dtype=torch.int64)}
        edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
  Graph(num_nodes=27, num_edges=92,
        ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64), 'node_attr': Scheme(shape=(1,), dtype=torch.float64), 'node_labels': Scheme(shape=(1,), dtype=torch.int64)}
        edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
  Graph(num_nodes=10, num_edges=34,
        ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64), 'node_attr': Scheme(shape=(1,), dtype=torch.

In [24]:
# DataLoader here is torch.utils.data.DataLoader (re-imported in the collate
# cell), not torch_geometric's loader; batching of the (graph, label) samples
# is delegated to the custom `collate` function.
# NOTE(review): that re-import shadows the PyG DataLoader used by the earlier
# cells, so re-running them afterwards would use the torch class instead.
dataloader_DGL = DataLoader(dataset_DGL, batch_size=128, shuffle=True, collate_fn=collate)

TypeError: DataLoader.__init__() got an unexpected keyword argument 'device'

In [15]:
# Show the index, batched graph and label tensor of the first DGL batch only.
# `graphs` and `labels` stay bound afterwards and are inspected by later cells.
for i, (graphs, labels) in enumerate(dataloader_DGL):
    print(i, graphs, labels, sep='\n')
    break

0
Graph(num_nodes=5303, num_edges=24913,
      ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64), 'node_attr': Scheme(shape=(1,), dtype=torch.float64), 'node_labels': Scheme(shape=(1,), dtype=torch.int64)}
      edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
tensor([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
        0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
        1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
        1, 1, 0, 0, 0, 1, 1, 1])


In [16]:
type(graphs)

dgl.heterograph.DGLGraph

In [17]:
len(graphs.ndata['node_attr'])

5303

In [18]:
type(graphs.ndata['node_attr'])

torch.Tensor

Now try a forward pass of GraphCL with this type of data.

In [19]:
from PyGCL.examples.GraphCL_dgl import GConv, Encoder

In [20]:
# Build the DGL GraphCL encoder. input_dim=1 matches the (1,)-shaped
# 'node_attr' feature shown in the dataset dump above.
# NOTE(review): this rebinds `gconv`, which earlier held the PyG GIN backbone.
gconv = GConv(input_dim=1, hidden_dim=32, num_layers=3).to(device)

In [26]:
# Forward the batched DGL graph through the encoder, moving it to `device` first.
# NOTE(review): the recorded run failed with "Device API cuda is not enabled" —
# the installed DGL build is CPU-only, so graphs.to(device) cannot move the
# graph to CUDA. Install a CUDA build of DGL or run this experiment on CPU.
z, g = gconv(graphs.to(device))

DGLError: [23:03:56] /opt/dgl/src/runtime/c_runtime_api.cc:82: Check failed: allow_missing: Device API cuda is not enabled. Please install the cuda version of dgl.
Stack trace:
  [bt] (0) /nfs/homedirs/yzi/miniconda3/envs/lsml/lib/python3.10/site-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x75) [0x7f4e44511e55]
  [bt] (1) /nfs/homedirs/yzi/miniconda3/envs/lsml/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::DeviceAPIManager::GetAPI(std::string, bool)+0x1f2) [0x7f4e448915f2]
  [bt] (2) /nfs/homedirs/yzi/miniconda3/envs/lsml/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::DeviceAPI::Get(DGLContext, bool)+0x1e1) [0x7f4e4488bba1]
  [bt] (3) /nfs/homedirs/yzi/miniconda3/envs/lsml/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::Empty(std::vector<long, std::allocator<long> >, DGLDataType, DGLContext)+0x13b) [0x7f4e448aeacb]
  [bt] (4) /nfs/homedirs/yzi/miniconda3/envs/lsml/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyTo(DGLContext const&) const+0xc3) [0x7f4e448e8e23]
  [bt] (5) /nfs/homedirs/yzi/miniconda3/envs/lsml/lib/python3.10/site-packages/dgl/libdgl.so(dgl::UnitGraph::CSR::CopyTo(DGLContext const&) const+0x288) [0x7f4e44a05868]
  [bt] (6) /nfs/homedirs/yzi/miniconda3/envs/lsml/lib/python3.10/site-packages/dgl/libdgl.so(dgl::UnitGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DGLContext const&)+0xd1) [0x7f4e449f6481]
  [bt] (7) /nfs/homedirs/yzi/miniconda3/envs/lsml/lib/python3.10/site-packages/dgl/libdgl.so(dgl::HeteroGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DGLContext const&)+0xf6) [0x7f4e448fa286]
  [bt] (8) /nfs/homedirs/yzi/miniconda3/envs/lsml/lib/python3.10/site-packages/dgl/libdgl.so(+0x52cbb6) [0x7f4e44909bb6]



In [23]:
graphs.device

device(type='cpu')