In [1]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import Sequential, Linear, ReLU, CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data.dataset import random_split

import GCL.augmentors as A
import GCL.losses as L
from GCL.models import DualBranchContrast

from torch_geometric.nn import GINConv, global_add_pool
from torch_geometric.loader import DataLoader
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import dropout_edge, add_random_edge

from tqdm import tqdm
import itertools
import warnings
import sys
from sklearn.model_selection import StratifiedKFold
import numpy as np
import os.path as osp

In [2]:
from gin import *

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
dataset_name = 'PROTEINS'
train_multiple_classifiers = False

# Hyperparams
lrs = [0.01]
num_layers = [3,5]
hidden_dims = [16, 32] # {16, 32} for bio-graphs and 64 for social graphs
dropouts = [0, 0.5]
batch_sizes = [32, 128]
epochs = 20
# print(f'======The hyperparams: lr={lr}, num_layers={num_layers}, epochs={epochs}. On dataset:{dataset_name}======')

device = torch.device('cuda')
path = osp.join(osp.expanduser('~'), 'datasets')
dataset = TUDataset(path, name=dataset_name)

# Split the dataset into two part for training classifier and final evaluation, train_val_set can be further divided into training and validation parts
train_val_set, eval_set = random_split(dataset, [0.9, 0.1])


num_features = max(dataset.num_features, 1)
num_classes = dataset.num_classes
if dataset.num_features==0 :
    print("No node feature, paddings of 1 will be used in GIN when forwarding.")


# Do grid search on hyperparams with K-Fold validation
best_hyperparams = {}
best_acc_val = 0

In [5]:
for lr, num_layer, hidden_dim, dropout, batch_size in itertools.product(lrs, num_layers, hidden_dims, dropouts, batch_sizes):
    # Define model
    encoder_model = GIN(num_features=num_features, dim=hidden_dim, num_gc_layers=num_layer, dropout=dropout).to(device)
    classifier = LogReg(hidden_dim * num_layer, num_classes).to(device)
    model = GCL_classifier(encoder_model, classifier)
    optimizer = Adam(model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=50, gamma=0.5)

    # K-Fold
    splitor = StratifiedKFold(n_splits=10, shuffle=True)

    dataset_train_val = dataset[train_val_set.indices] # This is necessary to construct a dataset object
    n_samples = len(dataset_train_val)

    for i, (train_index, val_index) in enumerate(splitor.split(np.zeros(n_samples), dataset_train_val.y.cpu().numpy())):
        # Further split the dataset into train and val set
        dataloader_train = DataLoader(dataset_train_val[train_index], batch_size=batch_size, shuffle=True)
        dataloader_eval = DataLoader(dataset_train_val[val_index], batch_size=batch_size, shuffle=False)
        # Train the model
        with tqdm(total=epochs, desc='(T)') as pbar:
            for epoch in range(1, epochs + 1):
                loss = train(model, dataloader_train, optimizer, scheduler)
                pbar.set_postfix({'K-Fold-loss': loss})
                pbar.update()
        # Get the val accuracy
        acc_val, _ = eval_encoder(model, dataloader_eval, device=device)
        break
    break

(T): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:06<00:00,  3.21it/s, K-Fold-loss=17]


In [15]:
dataset_train_val[val_index].y.shape

torch.Size([101])

In [7]:
print(lr, num_layer, hidden_dim, dropout, batch_size)

0.01 3 16 0 32


In [15]:
lr, num_layer, hidden_dim, dropout, batch_size = (1, 2, 3, 4, 5)

In [16]:
dictionary = {str(var): var for var in [lr, num_layer, hidden_dim, dropout, batch_size]}

In [17]:
dictionary

{'1': 1, '2': 2, '3': 3, '4': 4, '5': 5}

In [20]:
a,b,c,d,e = dictionary.values()

In [21]:
d

4

In [25]:
dataset.__dict__

{'name': 'PROTEINS',
 'cleaned': False,
 'root': '/nfs/homedirs/yzi/datasets',
 'transform': None,
 'pre_transform': None,
 'pre_filter': None,
 'log': True,
 '_indices': None,
 '_data': Data(x=[43471, 3], edge_index=[2, 162088], y=[1113]),
 'slices': {'edge_index': tensor([     0,    162,    254,  ..., 161922, 161940, 162088]),
  'x': tensor([    0,    42,    69,  ..., 43426, 43431, 43471]),
  'y': tensor([   0,    1,    2,  ..., 1111, 1112, 1113])},
 '_data_list': [Data(edge_index=[2, 162], x=[42, 3], y=[1]),
  Data(edge_index=[2, 92], x=[27, 3], y=[1]),
  Data(edge_index=[2, 34], x=[10, 3], y=[1]),
  Data(edge_index=[2, 92], x=[24, 3], y=[1]),
  Data(edge_index=[2, 42], x=[11, 3], y=[1]),
  Data(edge_index=[2, 1632], x=[336, 3], y=[1]),
  Data(edge_index=[2, 486], x=[108, 3], y=[1]),
  Data(edge_index=[2, 562], x=[154, 3], y=[1]),
  Data(edge_index=[2, 66], x=[19, 3], y=[1]),
  Data(edge_index=[2, 38], x=[11, 3], y=[1]),
  Data(edge_index=[2, 64], x=[20, 3], y=[1]),
  Data(edge_inde

In [30]:
dataset[train_set.indices].y.shape

torch.Size([1002])

In [31]:
train_dataset = dataset[train_set.indices]

In [35]:
train_dataset.y.cpu().numpy().shape

(1002,)

In [36]:
len(train_set)

[autoreload of gin failed: Traceback (most recent call last):
  File "/nfs/homedirs/yzi/miniconda3/envs/lsml_new/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 273, in check
    superreload(m, reload, self.old_objects)
  File "/nfs/homedirs/yzi/miniconda3/envs/lsml_new/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 471, in superreload
    module = reload(module)
  File "/nfs/homedirs/yzi/miniconda3/envs/lsml_new/lib/python3.10/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 619, in _exec
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/nfs/homedirs/yzi/resources/MyGCL/gin.py", line 5, in <module>
    from torch.optim import Adam, StepLR
ImportError: cannot import name 'StepLR' from 'torch.optim' (/nfs/homedirs/yzi/miniconda3/envs/lsml_new/lib/python3.10/sit

1002

In [45]:
tril_indices = torch.tril_indices(row=20, col=20, offset=-1).to(device)

In [49]:
tril_indices.device

device(type='cuda', index=0)

In [51]:
modified_adj = PGDAttacker.get_modified_adj(ori_adj)

cuda:0 cuda:0 cpu


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [None]:
# Standard case
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3],
                           [1, 0, 2, 1, 3, 2]])
edge_index, added_edges = add_random_edge(edge_index, p=1, force_undirected=True)

In [None]:
edge_index

In [None]:
added_edges

In [None]:
classifier = train_classifier(embedding_global, y)

In [None]:
classifier

In [None]:
acc = eval_encoder(encoder_model, classifier, eval_set)

In [None]:
acc

In [None]:
for data in dataloader:
    print(data.__dict__)
    print('type(data.x): ',type(data.x))
    print('type(data.edge_index): ',type(data.edge_index))
    print('type(data.batch): ',type(data.batch))
    
    
    break

First play with the dataset and learn about the data structure

In [None]:
dataloader.__dict__

In [None]:
for data in dataloader:
    print(data.__dict__)
    print('type(data.x): ',type(data.x))
    print('type(data.edge_index): ',type(data.edge_index))
    print('type(data.batch): ',type(data.batch))
    
    
    break


In [None]:
print(data)

Play with the dataset from DGL

In [None]:
from grabnel.src.attack.data import Data


In [None]:
dataset_DGL = Data("PROTEINS")
dataset_c_loader = dataset_DGL.adversarial_dataloaders()[1]

In [None]:
from dgl.data import MiniGCDataset, TUDataset

In [None]:
dataset_DGL = TUDataset("PROTEINS")

In [None]:
dataset_DGL.__dict__

In [None]:
graphs_DGL = dataset_DGL.graph_lists

In [None]:
type(graphs_DGL[0])

In [None]:
graph_sample_DGL = graphs_DGL[0]
print(graph_sample_DGL.__dict__)

In [None]:
from torch.utils.data import DataLoader
from dgl import add_self_loop
import dgl

def collate(samples, add_selfloops=True):
    """Used to create DGL dataloaders."""
    graphs, labels = map(list, zip(*samples))
    if add_selfloops:
        graphs = [add_self_loop(graph) for graph in graphs]
    batched_graph = dgl.batch(graphs)
    return batched_graph, torch.tensor(labels)

In [None]:
dataset_DGL.__dict__

In [None]:
dataloader_DGL = DataLoader(dataset_DGL, batch_size=128, shuffle=True, collate_fn=collate)

In [None]:
for i, (graphs, labels) in enumerate(dataloader_DGL):
    print(i)
    print(graphs)
    print(labels)
#     print('type(data.x): ',type(data.x))
#     print('type(data.edge_index): ',type(data.edge_index))
#     print('type(data.batch): ',type(data.batch))
    
    
    break

In [None]:
type(graphs)

In [None]:
len(graphs.ndata['node_attr'])

In [None]:
type(graphs.ndata['node_attr'])

Now try to forward GraphCL with this type of data

In [None]:
from PyGCL.examples.GraphCL_dgl import GConv, Encoder

In [None]:
gconv = GConv(input_dim=1, hidden_dim=32, num_layers=3).to(device)

In [None]:
z, g = gconv(graphs.to(device))

In [None]:
graphs.device