In [5]:
from NeuroGraph.datasets import NeuroGraphDataset

## Loading Benchmark Dataset

NeuroGraph provides two classes for loading static and dynamic benchmark datastes. For this project we have decided to analyze the static ones, specifically we are targeting to improve scores obtained for the HPC-age.

In [6]:
dataset_gender = NeuroGraphDataset(root="data/", name= "HCPGender")
#dataset_task = NeuroGraphDataset(root="data/", name= "HCPActivity")
#dataset_age = NeuroGraphDataset(root="data/", name= "HCPAge")

In [3]:
import torch_geometric
#2.0.4
print(torch_geometric.__version__)
#!pip install torch-geometric==2.1.0

2.5.0


In [6]:
print("Gender")
print(dataset_gender.num_classes)
print(dataset_gender.num_features)

print("Activity")
print(dataset_task.num_classes)
print(dataset_task.num_features)

print("Age")
print(dataset_age.num_classes)
print(dataset_age.num_features)

Gender
2
1000
Activity
7
400
Age
3
1000


In [None]:
#for dynamic datasets we can use 
#data_obj = NeuroGraphDynamic(root="data/", name= "DynHCPGender")
#dataset = data_obj.dataset
#labels = data_obj.labels

In [7]:
print("Gender")
print(dataset_gender.num_classes)
print(dataset_gender.num_features)

Gender
2
1000


## Preprocessing

Here we follow NeuroGraph to preprocess the data to construct functional connectomes and generate corresponding graphs-based representations.

In [None]:
from NeuroGraph import utils

# fmri and regs could be numpy arrays
fc = utils.preprocess(fmri, regs, n_rois= 1000)

The corresponding Adjacency matrix and PyG data objects can be created from the functional_connectome as follows.

In [None]:
from NeuroGraph import utils

adj = utils.construct_adj(fc, threshold= 5) # construct the adjacency matrix
data = utils.construct_data(fc, label= 1,threshold = 5) # construct PyG data object

NG use correlation as node features while constructing data object from functional connectome.

The following is the source code for processing one fMRI scan with corresponding regressor using NG preprocessing pipeline.

In [None]:
from NeuroGraph import utils
import numpy as np
from nilearn.image import load_img

img = load_img("data/raw/1.nii.gz") # 1.nii.gz is fMRI scan
regs = np.loadtxt("data/raw/1.txt") # 1.txt is the movement regressor

fmri = img.get_fdata()
fc = utils.preprocess(fmri, regs, n_rois= 100)
adj = utils.construct_adj(fc, threshold= 5) # construct the adjacency matrix
data = utils.construct_data(fc, label = 1,threshold = 5) # construct torch Data object

## NG Model

In [24]:
import torch
from torch.nn import Linear
from torch import nn
from torch_geometric.nn import global_max_pool
from torch_geometric.nn import aggr
import torch.nn.functional as F
from torch_geometric.nn import APPNP, MLP, GCNConv, GINConv, SAGEConv, GraphConv, TransformerConv, ChebConv, GATConv, SGConv, GeneralConv
from torch.nn import Conv1d, MaxPool1d, ModuleList
import random
import math
softmax = torch.nn.LogSoftmax(dim=1)


class ResidualGNNs(torch.nn.Module):
    def __init__(self,args, train_dataset, hidden_channels,hidden, num_layers, GNN, k=0.6):
        super().__init__()
        self.convs = ModuleList()
        self.aggr = aggr.MeanAggregation()
        self.hidden_channels = hidden_channels
        num_features = train_dataset.num_features
        if args['model']=="ChebConv":
            if num_layers>0:
                self.convs.append(GNN(num_features, hidden_channels,K=5))
                for i in range(0, num_layers - 1):
                    self.convs.append(GNN(hidden_channels, hidden_channels,K=5))
        else:
            if num_layers>0:
                self.convs.append(GNN(num_features, hidden_channels))
                for i in range(0, num_layers - 1):
                    self.convs.append(GNN(hidden_channels, hidden_channels))
        
        input_dim1 = int(((num_features * num_features)/2)- (num_features/2)+(hidden_channels*num_layers))
        input_dim = int(((num_features * num_features)/2)- (num_features/2))
        self.bn = nn.BatchNorm1d(input_dim)
        self.bnh = nn.BatchNorm1d(hidden_channels*num_layers)
        # self.attention = Attention(input_dim1, hidden_channels)
        self.mlp = nn.Sequential(
            nn.Linear(input_dim1, hidden),
            nn.BatchNorm1d(hidden),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden, hidden//2),
            nn.BatchNorm1d(hidden//2),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden//2, hidden//2),
            nn.BatchNorm1d(hidden//2),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear((hidden//2), args['num_classes']),
        )

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        # for conv in self.convs:
        #     x = conv(x, edge_index).relu()
        
        xs = [x]        
        for conv in self.convs:
            xs += [conv(xs[-1], edge_index).tanh()]
        h = []
        for i, xx in enumerate(xs):
            if i== 0:
                xx = xx.reshape(data.num_graphs, x.shape[1],-1)
                x = torch.stack([t.triu().flatten()[t.triu().flatten().nonzero(as_tuple=True)] for t in xx])
                x = self.bn(x)
            else:
                # xx = xx.reshape(data.num_graphs, x.shape[1],-1)
                xx = self.aggr(xx,batch)
                # h.append(torch.stack([t.flatten() for t in xx]))
                h.append(xx)
        
        h = torch.cat(h,dim=1)
        h = self.bnh(h)
        # x = torch.stack(h, dim=0)
        x = torch.cat((x,h),dim=1)
        x = self.mlp(x)
        return x

## Applying NG

In [9]:
from NeuroGraph.datasets import NeuroGraphDataset
import argparse
import torch
import torch.nn.functional as F
from torch.optim import Adam
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader
import os,random
import os.path as osp
import sys
import time
from utils import *

In [10]:
args={'dataset':'HCPGender',
      'runs':1,
      'device':'cuda',
      'seed':123,
      'model':"GCNConv",
      'hidden':32,
      'hidden_mlp':64,
      'num_layers':3,
      'epochs':100,
      'echo_epoch':50,
      'batch_size':16,
      'early_stopping':50,
      'lr':1e-5,
      'weight_decay':0.0005,
      'dropout':0.5}

In [11]:
path = "base_params/"
res_path = "results/"
root = "data/"

In [12]:
if not os.path.isdir(path):
    os.mkdir(path)
if not os.path.isdir(res_path):
    os.mkdir(res_path)

In [13]:
def logger(info):
    f = open(os.path.join(res_path, 'results_new.csv'), 'a')
    print(info, file=f)

In [14]:
# fix seed
torch.manual_seed(args['seed'])
if torch.cuda.is_available():
    torch.cuda.manual_seed(args['seed'])
random.seed(args['seed'])
np.random.seed(args['seed'])

In [15]:
dataset=dataset_gender
#dataset = NeuroGraphDataset(root=root, name= args.dataset)
print(dataset.num_classes)
print(len(dataset))
print(dataset.num_features)

2
1078
1000


In [16]:
labels = [d.y.item() for d in dataset]
train_tmp, test_indices = train_test_split(list(range(len(labels))),
                        test_size=0.2, stratify=labels,random_state=123,shuffle= True)
tmp = dataset[train_tmp]
train_labels = [d.y.item() for d in tmp]

In [17]:
train_indices, val_indices = train_test_split(list(range(len(train_labels))),
 test_size=0.125, stratify=train_labels,random_state=123,shuffle = True)
train_dataset = tmp[train_indices]
val_dataset = tmp[val_indices]
test_dataset = dataset[test_indices]

In [18]:
print("dataset {} loaded with train {} val {} test {} splits".format(args['dataset'],len(train_dataset), len(val_dataset), len(test_dataset)))

dataset HCPGender loaded with train 754 val 108 test 216 splits


In [19]:
train_loader = DataLoader(train_dataset, args['batch_size'], shuffle=False)
val_loader = DataLoader(val_dataset, args['batch_size'], shuffle=False)
test_loader = DataLoader(test_dataset, args['batch_size'], shuffle=False)
args['num_features'],args['num_classes']= dataset.num_features,dataset.num_classes

In [20]:
criterion = torch.nn.CrossEntropyLoss()

def train(train_loader):
    model.train()
    total_loss = 0
    for data in train_loader:  
        data = data.to(args['device'])
        out = model(data)  # Perform a single forward pass.
        loss = criterion(out, data.y) 
        total_loss +=loss
        loss.backward()
        optimizer.step() 
        optimizer.zero_grad()
    return total_loss/len(train_loader.dataset)

In [21]:
def test(loader):
    model.eval()
    correct = 0
    for data in loader:  
        data = data.to(args['device'])
        out = model(data)  
        pred = out.argmax(dim=1)  # Use the class with highest probability.
        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  


In [22]:
val_acc_history, test_acc_history, test_loss_history = [],[],[]
seeds = [123,124]

In [25]:
for index in range(args['runs']):
    start = time.time()
    torch.manual_seed(seeds[index])
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seeds[index])
    random.seed(seeds[index])
    np.random.seed(seeds[index])
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    gnn = eval(args['model'])
    model = ResidualGNNs(args,train_dataset,args['hidden'],args['hidden_mlp'],args['num_layers'],gnn).to(args['device']) ## apply GNN*
    print(model)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total number of parameters is: {total_params}")
    # model.reset_parameters()
    optimizer = Adam(model.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])
    loss, test_acc = [],[]
    best_val_acc,best_val_loss = 0.0,0.0
    for epoch in range(args['epochs']):
        loss = train(train_loader)
        val_acc = test(val_loader)
        test_acc = test(test_loader)
        # if epoch%10==0:
        print("epoch: {}, loss: {}, val_acc:{}, test_acc:{}".format(epoch, np.round(loss.item(),6), np.round(val_acc,2),np.round(test_acc,2)))
        val_acc_history.append(val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            if epoch> int(args['epochs']/2):
                torch.save(model.state_dict(), path + args['dataset']+args['model']+'task-checkpoint-best-acc.pkl')
    
    #test the model   
    model.load_state_dict(torch.load(path + args['dataset']+args['model']+'task-checkpoint-best-acc.pkl'))
    model.eval()
    test_acc = test(test_loader)
    test_loss = train(test_loader).item()
    test_acc_history.append(test_acc)
    test_loss_history.append(test_loss)
        

ResidualGNNs(
  (convs): ModuleList(
    (0): GCNConv(1000, 32)
    (1-2): 2 x GCNConv(32, 32)
  )
  (aggr): MeanAggregation()
  (bn): BatchNorm1d(499500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bnh): BatchNorm1d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (mlp): Sequential(
    (0): Linear(in_features=499596, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=32, out_features=32, bias=True)
    (9): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.5, inplace=False)
    (12): Linear(in_features=32, out_features=2, bias

KeyboardInterrupt: 

In [55]:

#test the model   
model.load_state_dict(torch.load(path + args['dataset']+args['model']+'task-checkpoint-best-acc.pkl'))
model.eval()
test_acc = test(test_loader)
test_loss = train(test_loader).item()
test_acc_history.append(test_acc)
test_loss_history.append(test_loss)
print(test_acc_history, test_loss_history)

[0.8796296296296297] [0.03500416502356529]


Metrics obtained:
- Age: 0.5
- Gender 0.88

# EXPHORMER

In [None]:
# Running Exphormer for LRGB Datasets
%run main_exphormer.py --cfg configs/Exphormer_LRGB/peptides-struct-EX.yaml  wandb.use False

In [None]:
# Running Exphormer for Cifar10
%run main_exphormer.py --cfg configs/Exphormer/cifar10.yaml  wandb.use False

In [14]:
import torch_geometric as tg

print(torch.__version__)
#2.1.0

1.10.2


In [48]:
xform_args={
    'node_encoder':True,
    'node_encoder_name': 'LinearNode+EquivStableLapPE',
    'dim_inner':40,
    'node_encoder_bn':False,
    'edge_encoder':True,
    'dim_edge':16,
    'layer_type':'CustomGatedGCN+Exphormer',
    'edge_encoder_name':'LinearEdge',
    'dim_pe':cfg.posenc_ERE.dim_pe,
'layers_pre_mp':0,
'dim_hidden':40,
'layers':5,
'n_heads':4,
'pna_degrees':cfg.gt.pna_degrees,
'enable':True,
'dropout':0.1,
'attn_dropout':0.1,
'layer_norm':False,
'batch_norm':True,
'bigbird':cfg.gt.bigbird,
'head':'default'

}

In [49]:
def new_optimizer_config(cfg):
    return OptimizerConfig(optimizer=cfg.optim.optimizer,
                           base_lr=cfg.optim.base_lr,
                           weight_decay=cfg.optim.weight_decay,
                           momentum=cfg.optim.momentum)


def new_scheduler_config(cfg):
    return ExtendedSchedulerConfig(
        scheduler=cfg.optim.scheduler,
        steps=cfg.optim.steps, lr_decay=cfg.optim.lr_decay,
        max_epoch=cfg.optim.max_epoch, reduce_factor=cfg.optim.reduce_factor,
        schedule_patience=cfg.optim.schedule_patience, min_lr=cfg.optim.min_lr,
        num_warmup_epochs=cfg.optim.num_warmup_epochs,
        train_mode=cfg.train.mode, eval_period=cfg.train.eval_period)

In [36]:
import datetime
import os
import torch
import logging

import graphgps  # noqa, register custom modules
from graphgps.optimizer.extra_optimizers import ExtendedSchedulerConfig

from torch_geometric.graphgym.cmd_args import parse_args
#from torch_geometric.graphgym.config import (cfg, dump_cfg,
#                                             set_agg_dir, set_cfg, load_cfg,
#                                             makedirs_rm_exist)
from torch_geometric.graphgym.config import (cfg, dump_cfg,
                                             set_cfg, load_cfg,
                                             makedirs_rm_exist)
from torch_geometric.graphgym.loader import create_loader
from torch_geometric.graphgym.logger import set_printing
from torch_geometric.graphgym.optim import create_optimizer, \
    create_scheduler, OptimizerConfig
from torch_geometric.graphgym.model_builder import create_model
from torch_geometric.graphgym.train import train
from torch_geometric.graphgym.utils.agg_runs import agg_runs
from torch_geometric.graphgym.utils.comp_budget import params_count
from torch_geometric.graphgym.utils.device import auto_select_device
from torch_geometric.graphgym.register import train_dict
from torch_geometric import seed_everything

from graphgps.finetuning import load_pretrained_model_cfg, \
    init_model_from_pretrained
from graphgps.logger import create_logger



In [22]:
import torch
import torch_geometric.graphgym.register as register
from torch_geometric.graphgym.config import cfg
from torch_geometric.graphgym.models.gnn import GNNPreMP
from torch_geometric.graphgym.models.layer import (new_layer_config,
                                                   BatchNorm1dNode)
from torch_geometric.graphgym.register import register_network
from graphgps.encoder.ER_edge_encoder import EREdgeEncoder

from graphgps.layer.gps_layer import GPSLayer


class FeatureEncoder(torch.nn.Module):
    """
    Encoding node and edge features

    Args:
        dim_in (int): Input feature dimension
    """
    def __init__(self, dim_in):
        super(FeatureEncoder, self).__init__()
        self.dim_in = dim_in
        if cfg.dataset.node_encoder:
            # Encode integer node features via nn.Embeddings
            NodeEncoder = register.node_encoder_dict[
                cfg.dataset.node_encoder_name]
            self.node_encoder = NodeEncoder(cfg.gnn.dim_inner)
            if cfg.dataset.node_encoder_bn:
                self.node_encoder_bn = BatchNorm1dNode(
                    new_layer_config(cfg.gnn.dim_inner, -1, -1, has_act=False,
                                     has_bias=False, cfg=cfg))
            # Update dim_in to reflect the new dimension fo the node features
            self.dim_in = cfg.gnn.dim_inner
        if cfg.dataset.edge_encoder:
            # Hard-set edge dim for PNA.
            cfg.gnn.dim_edge = 16 if 'PNA' in cfg.gt.layer_type else cfg.gnn.dim_inner
            if cfg.dataset.edge_encoder_name == 'ER':
                self.edge_encoder = EREdgeEncoder(cfg.gnn.dim_edge)
            elif cfg.dataset.edge_encoder_name.endswith('+ER'):
                EdgeEncoder = register.edge_encoder_dict[
                    cfg.dataset.edge_encoder_name[:-3]]
                self.edge_encoder = EdgeEncoder(cfg.gnn.dim_edge - cfg.posenc_ERE.dim_pe)
                self.edge_encoder_er = EREdgeEncoder(cfg.posenc_ERE.dim_pe, use_edge_attr=True)
            else:
                EdgeEncoder = register.edge_encoder_dict[
                    cfg.dataset.edge_encoder_name]
                self.edge_encoder = EdgeEncoder(cfg.gnn.dim_edge)

            if cfg.dataset.edge_encoder_bn:
                self.edge_encoder_bn = BatchNorm1dNode(
                    new_layer_config(cfg.gnn.dim_edge, -1, -1, has_act=False,
                                    has_bias=False, cfg=cfg))

    def forward(self, batch):
        for module in self.children():
            batch = module(batch)
        return batch


@register_network('GPSModel')
class GPSModel(torch.nn.Module):
    """Multi-scale graph x-former.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.encoder = FeatureEncoder(dim_in)
        dim_in = self.encoder.dim_in

        if cfg.gnn.layers_pre_mp > 0:
            self.pre_mp = GNNPreMP(
                dim_in, cfg.gnn.dim_inner, cfg.gnn.layers_pre_mp)
            dim_in = cfg.gnn.dim_inner

        assert cfg.gt.dim_hidden == cfg.gnn.dim_inner == dim_in, \
            "The inner and hidden dims must match."

        try:
            local_gnn_type, global_model_type = cfg.gt.layer_type.split('+')
        except:
            raise ValueError(f"Unexpected layer type: {cfg.gt.layer_type}")
        layers = []
        for _ in range(cfg.gt.layers):
            layers.append(GPSLayer(
                dim_h=cfg.gt.dim_hidden,
                local_gnn_type=local_gnn_type,
                global_model_type=global_model_type,
                num_heads=cfg.gt.n_heads,
                pna_degrees=cfg.gt.pna_degrees,
                equivstable_pe=cfg.posenc_EquivStableLapPE.enable,
                dropout=cfg.gt.dropout,
                attn_dropout=cfg.gt.attn_dropout,
                layer_norm=cfg.gt.layer_norm,
                batch_norm=cfg.gt.batch_norm,
                bigbird_cfg=cfg.gt.bigbird,
            ))
        self.layers = torch.nn.Sequential(*layers)

        GNNHead = register.head_dict[cfg.gnn.head]
        self.post_mp = GNNHead(dim_in=cfg.gnn.dim_inner, dim_out=dim_out)

    def forward(self, batch):
        for module in self.children():
            batch = module(batch)
        return batch

In [50]:
#NEW 
class FeatureEncoder(torch.nn.Module):
    def __init__(self, dim_in):
        super(FeatureEncoder, self).__init__()
        self.dim_in = dim_in
        if xform_args['node_encoder']:
            # Encode integer node features via nn.Embeddings
            NodeEncoder = register.node_encoder_dict[
                xform_args['node_encoder_name']]
            self.node_encoder = NodeEncoder(xform_args['dim_inner'])
            if xform_args['node_encoder_bn']:
                self.node_encoder_bn = BatchNorm1dNode(
                    new_layer_config(xform_args['dim_inner'], -1, -1, has_act=False,
                                     has_bias=False, cfg=cfg))
            # Update dim_in to reflect the new dimension fo the node features
            self.dim_in = xform_args['dim_inner']
        if xform_args['edge_encoder']:
            # Hard-set edge dim for PNA.
            xform_args['dim_edge'] = 16 if 'PNA' in xform_args['layer_type'] else xform_args['dim_inner']
            if xform_args['edge_encoder_name'] == 'ER':
                self.edge_encoder = EREdgeEncoder(xform_args['dim_edge'])
            elif xform_args['edge_encoder_name'].endswith('+ER'):
                EdgeEncoder = register.edge_encoder_dict[
                    xform_args['edge_encoder_name'][:-3]]
                self.edge_encoder = EdgeEncoder(xform_args['dim_edge'] - xform_args['dim_pe'])
                self.edge_encoder_er = EREdgeEncoder(xform_args['dim_pe'], use_edge_attr=True)
            else:
                EdgeEncoder = register.edge_encoder_dict[
                    xform_args['edge_encoder_name']]
                self.edge_encoder = EdgeEncoder(xform_args['dim_edge'])

            if xform_args['edge_encoder_bn']:
                self.edge_encoder_bn = BatchNorm1dNode(
                    new_layer_config(xform_args['dim_edge'], -1, -1, has_act=False,
                                    has_bias=False, cfg=cfg))

    def forward(self, batch):
        for module in self.children():
            batch = module(batch)
        return batch


class GPSModel(torch.nn.Module):
    """Multi-scale graph x-former.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.encoder = FeatureEncoder(dim_in)
        dim_in = self.encoder.dim_in

        if cfg.gnn.layers_pre_mp > 0:
            self.pre_mp = GNNPreMP(
                dim_in, xform_args['dim_inner'], xform_args['layers_pre_mp'])
            dim_in = xform_args['dim_inner']

        assert xform_args['dim_hidden'] == xform_args['dim_inner'] == dim_in, \
            "The inner and hidden dims must match."

        try:
            local_gnn_type, global_model_type = xform_args['layer_type'].split('+')
        except:
            raise ValueError(f"Unexpected layer type: {xform_args['layer_type']}")
        layers = []
        for _ in range(xform_args['layers']):
            layers.append(GPSLayer(
                dim_h=xform_args['dim_hidden'],
                local_gnn_type=local_gnn_type,
                global_model_type=global_model_type,
                num_heads=xform_args['n_heads'],
                pna_degrees=xform_args['pna_degrees'],
                equivstable_pe=xform_args['enable'],
                dropout=xform_args['dropout'],
                attn_dropout=xform_args['attn_dropout'],
                layer_norm=xform_args['layer_norm'],
                batch_norm=xform_args['batch_norm'],
                bigbird_cfg=xform_args['bigbird'],
            ))
        self.layers = torch.nn.Sequential(*layers)

        GNNHead = register.head_dict[xform_args['head']]
        self.post_mp = GNNHead(dim_in=xform_args['dim_inner'], dim_out=dim_out)

    def forward(self, batch):
        for module in self.children():
            batch = module(batch)
        return batch

In [37]:
def custom_set_out_dir(cfg, cfg_fname, name_tag):
    """Set custom main output directory path to cfg.
    Include the config filename and name_tag in the new :obj:`cfg.out_dir`.

    Args:
        cfg (CfgNode): Configuration node
        cfg_fname (string): Filename for the yaml format configuration file
        name_tag (string): Additional name tag to identify this execution of the
            configuration file, specified in :obj:`cfg.name_tag`
    """
    run_name = os.path.splitext(os.path.basename(cfg_fname))[0]
    run_name += f"-{name_tag}" if name_tag else ""
    cfg.out_dir = os.path.join(cfg.out_dir, run_name)


def custom_set_run_dir(cfg, run_id):
    """Custom output directory naming for each experiment run.

    Args:
        cfg (CfgNode): Configuration node
        run_id (int): Main for-loop iter id (the random seed or dataset split)
    """
    cfg.run_dir = os.path.join(cfg.out_dir, str(run_id))
    # Make output directory
    if cfg.train.auto_resume:
        os.makedirs(cfg.run_dir, exist_ok=True)
    else:
        makedirs_rm_exist(cfg.run_dir)

In [51]:
eval(args['model'])

torch_geometric.nn.conv.gcn_conv.GCNConv

In [None]:
for index in range(args['runs']):
    start = time.time()
    torch.manual_seed(seeds[index])
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seeds[index])
    random.seed(seeds[index])
    np.random.seed(seeds[index])
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
    gnn = eval(args['model'])
    
    model = GPSModel(args,train_dataset,args['hidden'],args['hidden_mlp'],args['num_layers'],gnn).to(args['device']) ## apply GNN*
    print(model)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total number of parameters is: {total_params}")
    # model.reset_parameters()
    optimizer = Adam(model.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])
    loss, test_acc = [],[]
    best_val_acc,best_val_loss = 0.0,0.0
    for epoch in range(args['epochs']):
        loss = train(train_loader)
        val_acc = test(val_loader)
        test_acc = test(test_loader)
        # if epoch%10==0:
        print("epoch: {}, loss: {}, val_acc:{}, test_acc:{}".format(epoch, np.round(loss.item(),6), np.round(val_acc,2),np.round(test_acc,2)))
        val_acc_history.append(val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            if epoch> int(args['epochs']/2):
                torch.save(model.state_dict(), path + args['dataset']+args['model']+'task-checkpoint-best-acc.pkl')
    
    #test the model   
    model.load_state_dict(torch.load(path + args['dataset']+args['model']+'task-checkpoint-best-acc.pkl'))
    model.eval()
    test_acc = test(test_loader)
    test_loss = train(test_loader).item()
    test_acc_history.append(test_acc)
    test_loss_history.append(test_loss)
        

In [None]:
# Load cmd line args
    args = parse_args()
    # Load config file
    set_cfg(cfg)
    load_cfg(cfg, args)
    custom_set_out_dir(cfg, args.cfg_file, cfg.name_tag)
    dump_cfg(cfg)
    # Set Pytorch environment
    torch.set_num_threads(cfg.num_threads)
    # Repeat for multiple experiment runs
    for run_id, seed, split_index in zip(*run_loop_settings()):
        # Set configurations for each run
        custom_set_run_dir(cfg, run_id)
        set_printing()
        cfg.dataset.split_index = split_index
        cfg.seed = seed
        cfg.run_id = run_id
        seed_everything(cfg.seed)
        auto_select_device()
        if cfg.pretrained.dir:
            cfg = load_pretrained_model_cfg(cfg)
        logging.info(f"[*] Run ID {run_id}: seed={cfg.seed}, "
                     f"split_index={cfg.dataset.split_index}")
        logging.info(f"    Starting now: {datetime.datetime.now()}")
        # Set machine learning pipeline
        loaders = create_loader()
        loggers = create_logger()
        # custom_train expects three loggers for 'train', 'valid' and 'test'.
        # GraphGym code creates one logger/loader for each of the 'train_mask' etc.
        # attributes in the dataset. As a work around it, we create one logger for each
        # of the types.
        # loaders are a const, so it is ok to just duplicate the loader. 
        if cfg.dataset.name == 'ogbn-arxiv' or cfg.dataset.name == 'ogbn-proteins':
            loggers_2 = create_logger()
            loggers_3 = create_logger()
            loggers_2[0].name = "val"
            loggers_3[0].name = "test"
            loggers.extend(loggers_2)
            loggers.extend(loggers_3)
            loaders = loaders*3
        model = create_model()
        if cfg.pretrained.dir:
            model = init_model_from_pretrained(
                model, cfg.pretrained.dir, cfg.pretrained.freeze_main,
                cfg.pretrained.reset_prediction_head
            )
        optimizer = create_optimizer(model.parameters(),
                                     new_optimizer_config(cfg))
        scheduler = create_scheduler(optimizer, new_scheduler_config(cfg))
        # Print model info
        logging.info(model)
        logging.info(cfg)
        cfg.params = params_count(model)
        logging.info('Num parameters: %s', cfg.params)
        # Start training
        if cfg.train.mode == 'standard':
            if cfg.wandb.use:
                logging.warning("[W] WandB logging is not supported with the "
                                "default train.mode, set it to `custom`")
            train(loggers, loaders, model, optimizer, scheduler)
        else:
            train_dict[cfg.train.mode](loggers, loaders, model, optimizer,
                                       scheduler)
    # Aggregate results from different seeds
    try:
        agg_runs(cfg.out_dir, cfg.metric_best)
    except Exception as e:
        logging.info(f"Failed when trying to aggregate multiple runs: {e}")
    # When being launched in batch mode, mark a yaml as done
    if args.mark_done:
        os.rename(args.cfg_file, f'{args.cfg_file}_done')
    logging.info(f"[*] All done: {datetime.datetime.now()}")


# Other - BigBird

## Load Data and SetUp

In [5]:
import argparse
import torch
import torch.nn.functional as F
from torch.optim import Adam
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader
import os,random
import os.path as osp
import sys
import time
from utils import *

In [3]:
#!pip install torch torch_geometric==2.1.0

In [6]:
from NeuroGraph.datasets import NeuroGraphDataset
dataset = NeuroGraphDataset(root="data/", name= "HCPGender")

Processing...


AttributeError: Can't get attribute 'DataEdgeAttr' on <module 'torch_geometric.data.data' from 'C:\\Users\\jmlr9\\.conda\\envs\\exphormer2\\lib\\site-packages\\torch_geometric\\data\\data.py'>

In [5]:
print("Gender")
print(dataset.num_classes)
print(dataset.num_features)

Gender
2
1000


In [6]:
path = "base_params/"
res_path = "results/"
root = "data/"

In [7]:
if not os.path.isdir(path):
    os.mkdir(path)
if not os.path.isdir(res_path):
    os.mkdir(res_path)

In [8]:
def logger(info):
    f = open(os.path.join(res_path, 'results_new.csv'), 'a')
    print(info, file=f)

In [9]:
# fix seed
seed_fixed=123
torch.manual_seed(seed_fixed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_fixed)
random.seed(seed_fixed)
np.random.seed(seed_fixed)

In [10]:
labels = [d.y.item() for d in dataset]
train_tmp, test_indices = train_test_split(list(range(len(labels))),
                        test_size=0.2, stratify=labels,random_state=123,shuffle= True)
tmp = dataset[train_tmp]
train_labels = [d.y.item() for d in tmp]

In [11]:
train_indices, val_indices = train_test_split(list(range(len(train_labels))),
 test_size=0.125, stratify=train_labels,random_state=123,shuffle = True)
train_dataset = tmp[train_indices]
val_dataset = tmp[val_indices]
test_dataset = dataset[test_indices]

In [12]:
args = {
    'dataset': 'HCPGender',
    'runs': 1,
    'device': 'cuda',
    'seed': seed_fixed,
    'model': 'BigBird',  # Change from 'GCNConv' to 'BigBird'
       'num_layers': 3,  # Number of layers in the model
    'epochs': 15,  # Number of training epochs
    'echo_epoch': 3,  # Print epoch every 50 epochs
    'batch_size': 16,  # Batch size for training
    'early_stopping': 5,  # Patience for early stopping
    'lr': 1e-5,  # Learning rate
    'weight_decay': 0.0005,  # Weight decay
    'dropout': 0.5,  # Dropout probability
}

In [13]:
print("dataset {} loaded with train {} val {} test {} splits".format(args['dataset'],len(train_dataset), len(val_dataset), len(test_dataset)))

dataset HCPGender loaded with train 754 val 108 test 216 splits


In [14]:
train_loader = DataLoader(train_dataset, args['batch_size'], shuffle=False)
val_loader = DataLoader(val_dataset, args['batch_size'], shuffle=False)
test_loader = DataLoader(test_dataset, args['batch_size'], shuffle=False)
args['num_features'],args['num_classes']= dataset.num_features,dataset.num_classes

## BigBird Model

In [15]:
import torch
import torch_geometric.graphgym.register as register
from torch_geometric.graphgym.config import cfg
from torch_geometric.graphgym.models.gnn import FeatureEncoder, GNNPreMP
from torch_geometric.graphgym.register import register_network

from graphgps.layer.bigbird_layer import BigBirdModel as BackboneBigBird

class BigBird(torch.nn.Module):
    """BigBird without edge features.
    This model disregards edge features and runs a linear transformer over a set of node features only.
    BirBird applies random sparse attention to the input sequence - the longer the sequence the closer it is to O(N)
    https://arxiv.org/abs/2007.14062
    """
    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.encoder = FeatureEncoder(dim_in)
        dim_in = self.encoder.dim_in

        if cfg.gnn.layers_pre_mp > 0:
            self.pre_mp = GNNPreMP(
                dim_in, cfg.gnn.dim_inner, cfg.gnn.layers_pre_mp)
            dim_in = cfg.gnn.dim_inner

        assert cfg.gt.dim_hidden == cfg.gnn.dim_inner == dim_in, \
            "The inner and hidden dims must match."

        # Copy main Transformer hyperparams to the BigBird config.
        cfg.gt.bigbird.layers = cfg.gt.layers
        cfg.gt.bigbird.n_heads = cfg.gt.n_heads
        cfg.gt.bigbird.dim_hidden = cfg.gt.dim_hidden
        cfg.gt.bigbird.dropout = cfg.gt.dropout
        self.trf = BackboneBigBird(
            config=cfg.gt.bigbird,
        )

        GNNHead = register.head_dict[cfg.gnn.head]
        self.post_mp = GNNHead(dim_in=cfg.gnn.dim_inner, dim_out=dim_out)

    def forward(self, batch):
        for module in self.children():
            batch = module(batch)
        return batch



## Aplying Model

In [18]:
args_bigbird = {
    'dataset': 'HCPGender',
    'runs': 1,
    'device': 'cuda',
    'seed': seed_fixed,
    'model': 'BigBird',  # Change from 'GCNConv' to 'BigBird'
   # 'dim_in': ?,  # Define the appropriate input dimensionality of the features
   # 'dim_out': ?,  # Define the appropriate output dimensionality
    'num_layers': 3,  # Number of layers in the model
    'epochs': 15,  # Number of training epochs
    'echo_epoch': 3,  # Print epoch every 50 epochs
    'batch_size': 16,  # Batch size for training
    'early_stopping': 5,  # Patience for early stopping
    'lr': 1e-5,  # Learning rate
    'weight_decay': 0.0005,  # Weight decay
    'dropout': 0.5,  # Dropout probability
}

num_features = dataset.num_features  # Number of input features in your dataset
num_classes = dataset.num_classes  # Number of classes in your classification task
dim_in = num_features  # Number of input features
dim_out = num_classes 

args=args_bigbird

In [70]:
import argparse
from yacs.config import CfgNode
from torch_geometric.graphgym.config import load_cfg, set_cfg, cfg
import yaml 

# Define the path to your YAML file
cfg_file = 'cifar10.yaml'

# Create an empty argparse Namespace object
class ArgsObject:
    pass

# Convert the YAML file to a dictionary
with open(cfg_file, 'r') as file:
    cfg_dict = yaml.safe_load(file)

# Create a CfgNode object from the dictionary
cfg = CfgNode(cfg_dict)

# Create an ArgsObject instance with the cfg_file attribute
args = ArgsObject()
args.cfg_file = cfg_file

# Load the configurations from the YAML file
set_cfg(cfg)
load_cfg(cfg, args)
dump_cfg(cfg)

KeyError: 'Non-existent config key: train.mode'

In [71]:
cfg.train

CfgNode({'batch_size': 16, 'sampler': 'full_batch', 'sample_node': False, 'node_per_graph': 32, 'radius': 'extend', 'eval_period': 10, 'skip_train_eval': False, 'ckpt_period': 100, 'enable_ckpt': True, 'auto_resume': False, 'epoch_resume': -1, 'ckpt_clean': True, 'iter_per_epoch': 32, 'walk_length': 4, 'neighbor_sizes': [20, 15, 10, 5], 'ckpt_best': False})

In [None]:
criterion = torch.nn.CrossEntropyLoss()

def train(train_loader):
    model.train()
    total_loss = 0
    for data in train_loader:  
        data = data.to(args['device'])
        out = model(data)  # Perform a single forward pass.
        loss = criterion(out, data.y) 
        total_loss +=loss
        loss.backward()
        optimizer.step() 
        optimizer.zero_grad()
    return total_loss/len(train_loader.dataset)

def test(loader):
    model.eval()
    correct = 0
    for data in loader:  
        data = data.to(args['device'])
        out = model(data)  
        pred = out.argmax(dim=1)  # Use the class with highest probability.
        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  

In [None]:
val_acc_history, test_acc_history, test_loss_history = [],[],[]
seeds = [123,124]

In [None]:
for index in range(args['runs']):
    start = time.time()
    torch.manual_seed(seeds[index])
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seeds[index])
    random.seed(seeds[index])
    np.random.seed(seeds[index])
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    gnn = eval(args['model'])
    model = ResidualGNNs(args,train_dataset,args['hidden'],args['hidden_mlp'],args['num_layers'],gnn).to(args['device']) ## apply GNN*
    print(model)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total number of parameters is: {total_params}")
    # model.reset_parameters()
    optimizer = Adam(model.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])
    loss, test_acc = [],[]
    best_val_acc,best_val_loss = 0.0,0.0
    for epoch in range(args['epochs']):
        loss = train(train_loader)
        val_acc = test(val_loader)
        test_acc = test(test_loader)
        # if epoch%10==0:
        print("epoch: {}, loss: {}, val_acc:{}, test_acc:{}".format(epoch, np.round(loss.item(),6), np.round(val_acc,2),np.round(test_acc,2)))
        val_acc_history.append(val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            if epoch> int(args['epochs']/2):
                torch.save(model.state_dict(), path + args['dataset']+args['model']+'task-checkpoint-best-acc.pkl')
    
    #test the model   
    model.load_state_dict(torch.load(path + args['dataset']+args['model']+'task-checkpoint-best-acc.pkl'))
    model.eval()
    test_acc = test(test_loader)
    test_loss = train(test_loader).item()
    test_acc_history.append(test_acc)
    test_loss_history.append(test_loss)

In [None]:
#test the model   
model.load_state_dict(torch.load(path + args['dataset']+args['model']+'task-checkpoint-best-acc.pkl'))
model.eval()
test_acc = test(test_loader)
test_loss = train(test_loader).item()
test_acc_history.append(test_acc)
test_loss_history.append(test_loss)
print(test_acc_history, test_loss_history)