In [1]:
import torch, torch_geometric
from torch.nn import functional as F
from torch.nn import ReLU, Tanh,LeakyReLU
from torch_geometric import seed_everything
from torch_geometric.nn import Sequential
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from math import log
from torch_geometric.utils import to_networkx
import networkx as nx
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="True"
#os.environ["CUDA_VISIBLE_DEVICES"]='0'
import numpy as np
import argparse
from matplotlib import pyplot as plt
import pandas as pd
import random
from typing import Optional, Tuple

import torch
from torch import Tensor
from torch.nn import Parameter
from torch_scatter import scatter_add
from torch_sparse import SparseTensor, fill_diag, matmul, mul
from torch_sparse import sum as sparsesum

from torch_geometric.nn.conv import MessagePassing
from torch_geometric.nn.dense.linear import Linear
from torch_geometric.nn.inits import zeros
from torch_geometric.typing import Adj, OptTensor, PairTensor
from torch_geometric.utils import add_remaining_self_loops
from torch_geometric.utils.num_nodes import maybe_num_nodes
from torch_geometric.nn import inits
import time
import math
#import ogb
#from ogb.nodeproppred import PygNodePropPredDataset


In [2]:
from spoginit import SpogInit
from models.GCN_layer import MP
from models.model import MyResGCN

In [3]:
# load data
# Use the first CUDA device when one is visible and fall back to the CPU
# otherwise, so that this cell no longer crashes on a machine without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
dataset = Planetoid(root='./data/', name="Cora")
data = dataset[0].to(device)

# Boolean node masks of the dataset split, moved to the same device as `data`.
train_mask = data.train_mask.to(device)
val_mask = data.val_mask.to(device)
test_mask = data.test_mask.to(device)
train_idx = train_mask  # alias of train_mask; not used elsewhere in this cell

# Fraction of nodes that belong to the training split.
print("train set:", train_mask.sum()/data.x.shape[0])
# Norm of the raw feature matrix (the denominator of the FSP ratio printed by later cells).
print("data norm: ", torch.norm(data.x))

input_dim, output_dim = dataset.num_features, dataset.num_classes

train set: tensor(0.0517, device='cuda:0')
data norm:  tensor(221.8468, device='cuda:0')


In [4]:
# Seed every random number generator used by the notebook (PyTorch, NumPy,
# Python and, when present, CUDA), then build the message-passing operator
# that the GEV computation relies on.
import random
import numpy as np

torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
    torch.cuda.manual_seed(42)


# Module-level operator called by new_smoothness_gpu.
messagepa = MP(cached=True).to(device)
def new_smoothness_gpu(x):
    """Return the averaged, normalised smoothness of the columns of ``x``.

    For every column ``u`` of ``x`` the quotient
    ``u^T (u - messagepa(u)) / (u^T u)`` is accumulated, where ``messagepa`` is
    the module-level message-passing operator applied over ``data.edge_index``.
    Columns that are entirely zero contribute nothing. The sum is divided by
    the number of columns and multiplied by 0.5. Callers report the result as
    "GEV".

    Args:
        x: tensor indexed as ``x[:, k]``; one column is processed at a time.

    Returns:
        Tensor of shape ``(1,)`` on ``device``.
    """
    smooth = torch.zeros(1, device=device)
    for k in range(x.shape[1]):
        u = x[:, k].to(device)
        # torch.dot replaces matmul(u.T, u): `.T` on a 1-D tensor is deprecated
        # and emits a UserWarning, and the value is now computed only once.
        energy = torch.dot(u, u)
        if energy == 0:
            # An all-zero column has no defined quotient; it adds nothing.
            continue
        column = u.reshape(-1, 1)
        # NOTE(review): assumes messagepa returns a tensor shaped like `column`.
        residual = column - messagepa(column, data.edge_index)
        #smooth += torch.matmul(u.T,torch.matmul(L,u))/torch.matmul(u.T,u)
        smooth += torch.matmul(u, residual) / energy
    final = 0.5 * smooth / x.shape[1]
    return final

In [5]:
import random
import numpy as np

def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: value handed to every generator. Defaults to 42, the value that
            was previously hard-coded, so ``set_seed()`` behaves as before.

    Note:
        cuDNN determinism and PyTorch's deterministic-algorithm mode are
        explicitly switched off here, so seeding alone does not force
        deterministic kernels.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = False
    torch.use_deterministic_algorithms(False)

# function to read the gradients in models
def compute_weight_grad(model):
    """Collect the gradient norm of every parameter whose name contains "weight".

    Args:
        model: object exposing ``named_parameters()``; ``backward()`` is
            expected to have been called so that ``.grad`` is populated.

    Returns:
        1-D NumPy array with one norm per matching parameter, in the order
        yielded by ``named_parameters()``. A matching parameter whose ``.grad``
        is ``None`` is reported as 0 instead of raising, which keeps the
        positions aligned with the parameter order. The array is empty when no
        parameter name matches.
    """
    norms = []
    for name, para in model.named_parameters():
        if "weight" not in name:
            continue
        if para.grad is None:
            # No gradient reached this parameter; record a zero norm.
            norms.append(torch.zeros((), dtype=para.dtype))
        else:
            norms.append(torch.norm(para.grad).detach().cpu())
    if not norms:
        # torch.stack rejects an empty list; return an empty array instead.
        return torch.tensor([]).numpy()
    return torch.stack(norms).numpy()

#  Conventional Initialization

In [6]:
layer_list = [4,8,16,32,64,128] # we use this to control the num of layer in GCNs

for layer in layer_list:
    hidden=64
    set_seed()
    # initialization="conventional" requests the conventional initialization.
    # The model is placed on `device` instead of the implicit default GPU.
    model = MyResGCN(input_dim, hidden, output_dim, layer,initialization="conventional", activation = "ReLU",dropout=0).to(device)

    # print_all_x aims to return the output of the hidden layers
    out = model.print_all_x(data.x, data.edge_index)
    # out_final aims to produce the output logits
    out_final = model(data.x, data.edge_index)
    loss = F.nll_loss(out_final[data.train_mask], data.y[data.train_mask])
    loss.backward()

    # Run print_x once and reuse the result for both FSP and GEV. No autograd
    # graph is needed for these measurements because the weight gradients were
    # already produced by loss.backward() above.
    # NOTE(review): assumes print_x is a repeatable forward pass (dropout is 0) - confirm.
    with torch.no_grad():
        hidden_out = model.print_x(data.x, data.edge_index)
        # .item() prints a plain number instead of a tensor repr.
        fsp = (torch.norm(hidden_out) / torch.norm(data.x)).item()
        diri = new_smoothness_gpu(hidden_out)
    bsp = compute_weight_grad(model)[0]

    print(f"for {layer} layers ResGCN Conventional initializations")
    print(f"FSP (output norm/ input norm): {fsp}")
    print(f"BSP (gradient norm of the first layer): {bsp}")
    print(f"GEV: {diri.item()}")
    print(f"\n\n")

for 4 layers ResGCN Conventional initializations
FSP (output norm/ input norm): 0.09187818318605423
BSP (gradient norm of the first layer): 0.2862228453159332
GEV: 0.06229575350880623



for 8 layers ResGCN Conventional initializations
FSP (output norm/ input norm): 0.13157212734222412
BSP (gradient norm of the first layer): 0.3554740250110626
GEV: 0.07253705710172653



for 16 layers ResGCN Conventional initializations
FSP (output norm/ input norm): 0.6739550232887268
BSP (gradient norm of the first layer): 1.5336799621582031
GEV: 0.009775308892130852



for 32 layers ResGCN Conventional initializations
FSP (output norm/ input norm): 42.40602493286133
BSP (gradient norm of the first layer): 120.29623413085938
GEV: 0.00023454584879800677





  if torch.matmul(u.T,u) == 0:


for 64 layers ResGCN Conventional initializations
FSP (output norm/ input norm): 51700.40625
BSP (gradient norm of the first layer): 218216.34375
GEV: 5.516078454093076e-05



for 128 layers ResGCN Conventional initializations
FSP (output norm/ input norm): 176099164160.0
BSP (gradient norm of the first layer): 516017487872.0
GEV: 1.3020836377108935e-05





# Xavier Initialization

In [7]:
layer_list = [4,8,16,32,64,128]

for layer in layer_list:
    hidden=64
    # NOTE(review): hidden_dim_list is not read in this cell; kept in case a
    # later cell relies on it.
    hidden_dim_list = [hidden]*(layer-1)
    set_seed()
    # initialization="glorot" requests the Xavier initialization.
    # The model is placed on `device` instead of the implicit default GPU.
    model = MyResGCN(input_dim, hidden, output_dim, layer,initialization="glorot", activation = "ReLU",dropout=0).to(device)

    # print_all_x aims to return the output of the hidden layers
    out = model.print_all_x(data.x, data.edge_index)
    # out_final aims to produce the output logits
    out_final = model(data.x, data.edge_index)
    loss = F.nll_loss(out_final[data.train_mask], data.y[data.train_mask])
    loss.backward()

    # Run print_x once and reuse the result for both FSP and GEV. No autograd
    # graph is needed for these measurements because the weight gradients were
    # already produced by loss.backward() above.
    # NOTE(review): assumes print_x is a repeatable forward pass (dropout is 0) - confirm.
    with torch.no_grad():
        hidden_out = model.print_x(data.x, data.edge_index)
        # .item() prints a plain number instead of a tensor repr.
        fsp = (torch.norm(hidden_out) / torch.norm(data.x)).item()
        diri = new_smoothness_gpu(hidden_out)
    bsp = compute_weight_grad(model)[0]

    print(f"for {layer} layers ResGCN using Xavier initializations")
    print(f"FSP (output norm/ input norm): {fsp}")
    print(f"BSP (gradient norm of the first layer): {bsp}")
    print(f"GEV: {diri.item()}")
    print(f"\n\n")

for 4 layers ResGCN using Xavier initializations
FSP (output norm/ input norm): 0.07252059131860733
BSP (gradient norm of the first layer): 0.40580928325653076
GEV: 0.0973934456706047



for 8 layers ResGCN using Xavier initializations
FSP (output norm/ input norm): 0.20563116669654846
BSP (gradient norm of the first layer): 0.9548661112785339
GEV: 0.03791920095682144



for 16 layers ResGCN using Xavier initializations
FSP (output norm/ input norm): 7.419041633605957
BSP (gradient norm of the first layer): 116.94780731201172
GEV: 0.0017972534988075495



for 32 layers ResGCN using Xavier initializations
FSP (output norm/ input norm): 879.1549072265625
BSP (gradient norm of the first layer): 10130.623046875
GEV: 0.0002476221416145563



for 64 layers ResGCN using Xavier initializations
FSP (output norm/ input norm): 214558064.0
BSP (gradient norm of the first layer): 3455194368.0
GEV: 5.290103581501171e-05



for 128 layers ResGCN using Xavier initializations
FSP (output norm/ input no

# SPoGInit

In [17]:
layer_list = [4,8,16,32,64,128]
# One entry per depth in layer_list, stored as plain numbers so that no
# autograd graph or GPU tensor is kept alive between iterations.
FSP_list = []
BSP_list = []
diri_list = []
for layer in layer_list:
    hidden=64
    # NOTE(review): hidden_dim_list is not read in this cell; kept in case a
    # later cell relies on it.
    hidden_dim_list = [hidden]*(layer-1)
    set_seed()
    # Start from the conventional initialization; the model is placed on
    # `device` instead of the implicit default GPU.
    model = MyResGCN(input_dim, hidden, output_dim, layer,initialization="conventional", activation = "ReLU",dropout=0).to(device)
    # Here we define the SPoGInit class; the last argument refers to the
    # different ways to compute \hat{BSP} and \hat{FSP}.
    Spog = SpogInit(model,data.edge_index,data.train_mask,data.x.shape[0],data.x.shape[1],dataset.num_classes, device, "divide_stable")
    model.eval()
    # Then we use SpogInit to search for a better initialization.
    # lr is the learning rate of the initialization optimization.
    # The original notes also describe options that this call does not pass
    # (presumably left at SpogInit's defaults - confirm):
    #   w1, w2, w3    - weights for FSP, BSP and GEV
    #   max_pati      - patience: the search stops once the metric has failed
    #                   to decrease for more than max_pati steps
    #   steps         - maximum number of optimization steps
    #   generate_data - whether to use random x and y so that the
    #                   initialization found is robust for the graph
    Spog.zerosingle_initialization(data.x, data.y,lr=0.1,decay=1)

    out = model.print_all_x(data.x, data.edge_index)
    out_final = model(data.x, data.edge_index)
    loss = F.nll_loss(out_final[data.train_mask], data.y[data.train_mask])
    loss.backward()

    # Run print_x once and reuse the result for both FSP and GEV. No autograd
    # graph is needed for these measurements because the weight gradients were
    # already produced by loss.backward() above.
    # NOTE(review): assumes print_x is a repeatable forward pass (dropout is 0) - confirm.
    with torch.no_grad():
        hidden_out = model.print_x(data.x, data.edge_index)
        FSP_list.append((torch.norm(hidden_out) / torch.norm(data.x)).item())
        diri = new_smoothness_gpu(hidden_out)
    BSP_list.append(compute_weight_grad(model)[0])
    diri_list.append(diri.item())


0-the iteration old_metric tensor([2.2306], device='cuda:0') metrics: tensor([0.9000, 0.9000, 0.9000, 0.9000])
1-the iteration old_metric tensor([2.0384], device='cuda:0') metrics: tensor([0.8000, 0.8000, 0.8000, 0.8000])
2-the iteration old_metric tensor([1.9284], device='cuda:0') metrics: tensor([0.7000, 0.7000, 0.7000, 0.7000])
3-the iteration old_metric tensor([1.7746], device='cuda:0') metrics: tensor([0.6000, 0.6000, 0.6000, 0.6000])
4-the iteration old_metric tensor([1.6345], device='cuda:0') metrics: tensor([0.5000, 0.5000, 0.5000, 0.5000])
5-the iteration old_metric tensor([1.5258], device='cuda:0') metrics: tensor([0.4000, 0.4000, 0.4000, 0.4000])
6-the iteration old_metric tensor([1.5172], device='cuda:0') metrics: tensor([0.3000, 0.3000, 0.3000, 0.3000])
7-the iteration old_metric tensor([1.4465], device='cuda:0') metrics: tensor([0.2000, 0.2000, 0.2000, 0.2000])
8-the iteration old_metric tensor([1.3792], device='cuda:0') metrics: tensor([0.1000, 0.1000, 0.1000, 0.1000])
9

In [21]:
# Print the SPoGInit metrics collected for every depth in layer_list.
for i in range(len(layer_list)):
    print(f"for {layer_list[i]} layers ResGCN using SpogInit initializations")
    # float() accepts both a 0-dim tensor and a plain number, so the value is
    # always printed as a bare number rather than as a tensor repr.
    print(f"FSP (output norm/ input norm): {float(FSP_list[i])}")
    print(f"BSP (gradient norm of the first layer): {BSP_list[i]}")
    print(f"GEV: {diri_list[i]}")
    print(f"\n\n")

for 4 layers ResGCN using SpogInit initializations
FSP (output norm/ input norm): 0.07045029103755951
BSP (gradient norm of the first layer): 0.25346508622169495
GEV: 0.08497168123722076



for 8 layers ResGCN using SpogInit initializations
FSP (output norm/ input norm): 0.11003374308347702
BSP (gradient norm of the first layer): 0.2573537826538086
GEV: 0.1228405311703682



for 16 layers ResGCN using SpogInit initializations
FSP (output norm/ input norm): 0.12955132126808167
BSP (gradient norm of the first layer): 0.24369166791439056
GEV: 0.10277865082025528



for 32 layers ResGCN using SpogInit initializations
FSP (output norm/ input norm): 0.5736527442932129
BSP (gradient norm of the first layer): 0.313423752784729
GEV: 0.05945737287402153



for 64 layers ResGCN using SpogInit initializations
FSP (output norm/ input norm): 0.6462151408195496
BSP (gradient norm of the first layer): 0.3561851382255554
GEV: 0.03891698643565178



for 128 layers ResGCN using SpogInit initializations
F