In [1]:
import torch, torch_geometric
from torch.nn import functional as F
from torch.nn import ReLU, Tanh,LeakyReLU
from torch_geometric import seed_everything
from torch_geometric.nn import Sequential
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from math import log
from torch_geometric.utils import to_networkx
import networkx as nx
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="True"
#os.environ["CUDA_VISIBLE_DEVICES"]='0'
import numpy as np
import argparse
from matplotlib import pyplot as plt
import pandas as pd
import random
from typing import Optional, Tuple

import torch
from torch import Tensor
from torch.nn import Parameter
from torch_scatter import scatter_add
from torch_sparse import SparseTensor, fill_diag, matmul, mul
from torch_sparse import sum as sparsesum

from torch_geometric.nn.conv import MessagePassing
from torch_geometric.nn.dense.linear import Linear
from torch_geometric.nn.inits import zeros
from torch_geometric.typing import Adj, OptTensor, PairTensor
from torch_geometric.utils import add_remaining_self_loops
from torch_geometric.utils.num_nodes import maybe_num_nodes
from torch_geometric.nn import inits
import time
import math
#import ogb
#from ogb.nodeproppred import PygNodePropPredDataset


In [2]:
import sys
from spoginit import SpogInit
from models.GCN_layer import MP

In [3]:
## define GCNConv with custom initializations


# Overload stub for the edge-index form of `gcn_norm`: per the signature
# comment in its body, `edge_index` is a Tensor and the result a PairTensor.
# NOTE(review): unlike the second stub, this one carries no
# `@torch.jit._overload` decorator, so it is an ordinary function that the
# later `gcn_norm` definitions simply replace.
def gcn_norm(edge_index, edge_weight=None, num_nodes=None, improved=False,
             add_self_loops=True, flow="source_to_target", dtype=None):
    # type: (Tensor, OptTensor, Optional[int], bool, bool, str, Optional[int]) -> PairTensor  # noqa
    pass


# Overload stub for the sparse form of `gcn_norm`: per the signature comment in
# its body, `edge_index` is a SparseTensor and so is the result.
@torch.jit._overload
def gcn_norm(edge_index, edge_weight=None, num_nodes=None, improved=False,
             add_self_loops=True, flow="source_to_target", dtype=None):
    # type: (SparseTensor, OptTensor, Optional[int], bool, bool, str, Optional[int]) -> SparseTensor  # noqa
    pass


def gcn_norm(edge_index, edge_weight=None, num_nodes=None, improved=False,
             add_self_loops=True, flow="source_to_target", dtype=None):
    """Apply the symmetric GCN normalization D^{-1/2} A D^{-1/2} to a graph.

    Args:
        edge_index: Either a ``SparseTensor`` adjacency or an edge-index
            tensor whose rows 0 and 1 are read as the two edge endpoints.
        edge_weight: Optional per-edge weights (edge-index form only);
            defaults to all ones.
        num_nodes: Optional node count, resolved through ``maybe_num_nodes``.
        improved: If true, self-loops get weight 2 instead of 1.
        add_self_loops: Whether self-loops are added before normalizing.
        flow: ``"source_to_target"`` or ``"target_to_source"``; the sparse
            form only accepts ``"source_to_target"``.
        dtype: dtype used when default edge weights have to be created.

    Returns:
        The normalized ``SparseTensor`` for sparse input, otherwise the pair
        ``(edge_index, normalized_edge_weight)``.
    """
    self_loop_weight = 2. if improved else 1.

    if not isinstance(edge_index, SparseTensor):
        # Edge-index form: work on explicit (row, col) pairs and edge weights.
        assert flow in ("source_to_target", "target_to_source")
        num_nodes = maybe_num_nodes(edge_index, num_nodes)

        if edge_weight is None:
            edge_weight = torch.ones((edge_index.size(1), ), dtype=dtype,
                                     device=edge_index.device)

        if add_self_loops:
            edge_index, looped_weight = add_remaining_self_loops(
                edge_index, edge_weight, self_loop_weight, num_nodes)
            assert looped_weight is not None
            edge_weight = looped_weight

        src, dst = edge_index[0], edge_index[1]
        # The degree is accumulated on the receiving end of each edge.
        if flow == "source_to_target":
            receiver = dst
        else:
            receiver = src
        degree = scatter_add(edge_weight, receiver, dim=0, dim_size=num_nodes)
        inv_sqrt_degree = degree.pow_(-0.5)
        # Nodes with zero degree produce inf; zero them out.
        inv_sqrt_degree.masked_fill_(inv_sqrt_degree == float('inf'), 0)
        normalized = inv_sqrt_degree[src] * edge_weight * inv_sqrt_degree[dst]
        return edge_index, normalized

    # Sparse form: scale the rows and the columns of the adjacency.
    assert flow == "source_to_target"
    adjacency = edge_index
    if not adjacency.has_value():
        adjacency = adjacency.fill_value(1., dtype=dtype)
    if add_self_loops:
        adjacency = fill_diag(adjacency, self_loop_weight)
    degree = sparsesum(adjacency, dim=1)
    inv_sqrt_degree = degree.pow_(-0.5)
    inv_sqrt_degree.masked_fill_(inv_sqrt_degree == float('inf'), 0.)
    adjacency = mul(adjacency, inv_sqrt_degree.view(-1, 1))
    adjacency = mul(adjacency, inv_sqrt_degree.view(1, -1))
    return adjacency
    
    
class GCNConv(MessagePassing):
    r"""The graph convolutional operator from the `"Semi-supervised
    Classification with Graph Convolutional Networks"
    <https://arxiv.org/abs/1609.02907>`_ paper

    .. math::
        \mathbf{X}^{\prime} = \mathbf{\hat{D}}^{-1/2} \mathbf{\hat{A}}
        \mathbf{\hat{D}}^{-1/2} \mathbf{X} \mathbf{\Theta},

    where :math:`\mathbf{\hat{A}} = \mathbf{A} + \mathbf{I}` denotes the
    adjacency matrix with inserted self-loops and
    :math:`\hat{D}_{ii} = \sum_{j=0} \hat{A}_{ij}` its diagonal degree matrix.
    The adjacency matrix can include other values than :obj:`1` representing
    edge weights via the optional :obj:`edge_weight` tensor.

    Its node-wise formulation is given by:

    .. math::
        \mathbf{x}^{\prime}_i = \mathbf{\Theta}^{\top} \sum_{j \in
        \mathcal{N}(v) \cup \{ i \}} \frac{e_{j,i}}{\sqrt{\hat{d}_j
        \hat{d}_i}} \mathbf{x}_j

    with :math:`\hat{d}_i = 1 + \sum_{j \in \mathcal{N}(i)} e_{j,i}`, where
    :math:`e_{j,i}` denotes the edge weight from source node :obj:`j` to target
    node :obj:`i` (default: :obj:`1.0`)

    This variant differs from the stock layer only in how the weights are
    initialized (see :obj:`initialization`).

    Args:
        in_channels (int): Size of each input sample, or :obj:`-1` to derive
            the size from the first input(s) to the forward method.
            NOTE(review): :meth:`reset_parameters` reads
            ``self.lin.weight`` at construction time, so lazy sizes may not
            work with the custom initializations -- confirm before relying
            on :obj:`-1`.
        out_channels (int): Size of each output sample.
        improved (bool, optional): If set to :obj:`True`, the layer computes
            :math:`\mathbf{\hat{A}}` as :math:`\mathbf{A} + 2\mathbf{I}`.
            (default: :obj:`False`)
        cached (bool, optional): If set to :obj:`True`, the layer will cache
            the computation of :math:`\mathbf{\hat{D}}^{-1/2} \mathbf{\hat{A}}
            \mathbf{\hat{D}}^{-1/2}` on first execution, and will use the
            cached version for further executions.
            This parameter should only be set to :obj:`True` in transductive
            learning scenarios. (default: :obj:`False`)
        add_self_loops (bool, optional): If set to :obj:`False`, will not add
            self-loops to the input graph. (default: :obj:`True`)
        normalize (bool, optional): Whether to add self-loops and compute
            symmetric normalization coefficients on the fly.
            (default: :obj:`True`)
        bias (bool, optional): If set to :obj:`False`, the layer will not learn
            an additive bias. (default: :obj:`True`)
        initialization (str, optional): Weight initialization scheme.
            :obj:`"conventional"` draws the weights uniformly from
            :math:`[-1/\sqrt{n}, 1/\sqrt{n}]` with :math:`n` the second
            dimension of the weight matrix; :obj:`"glorot"` applies
            :func:`torch.nn.init.xavier_uniform_` with gain 1. Any other
            value leaves the weights as created by the inner linear layer.
            The bias, when present, is always set to zero.
            (default: :obj:`"glorot"`)
        **kwargs (optional): Additional arguments of
            :class:`torch_geometric.nn.conv.MessagePassing`.

    Shapes:
        - **input:**
          node features :math:`(|\mathcal{V}|, F_{in})`,
          edge indices :math:`(2, |\mathcal{E}|)`,
          edge weights :math:`(|\mathcal{E}|)` *(optional)*
        - **output:** node features :math:`(|\mathcal{V}|, F_{out})`
    """

    _cached_edge_index: Optional[Tuple[Tensor, Tensor]]
    _cached_adj_t: Optional[SparseTensor]

    def __init__(self, in_channels: int, out_channels: int,
                 improved: bool = False, cached: bool = False,
                 add_self_loops: bool = True, normalize: bool = True,
                 bias: bool = True, initialization: str = "glorot", **kwargs):

        kwargs.setdefault('aggr', 'add')
        super().__init__(**kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.improved = improved
        self.cached = cached
        self.add_self_loops = add_self_loops
        self.normalize = normalize
        self._cached_edge_index = None
        self._cached_adj_t = None
        self.initialization = initialization
        self.lin = Linear(in_channels, out_channels, bias=False)

        if bias:
            # Allocated uninitialized; reset_parameters() always zeroes it.
            self.bias = Parameter(torch.empty(out_channels))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize the weights per ``self.initialization``, zero the
        bias and drop any cached normalized graph."""
        if self.initialization == "conventional":
            bound = 1. / np.sqrt(self.lin.weight.size(1))
            self.lin.weight.data.uniform_(-bound, bound)
        elif self.initialization == "glorot":
            torch.nn.init.xavier_uniform_(self.lin.weight, gain=1.0)
        # Any other scheme keeps the weights produced by `Linear` itself.

        # Zero the bias for every scheme: it is allocated without
        # initialization, so skipping this would leave arbitrary memory in it.
        if self.bias is not None:
            zeros(self.bias)

        self._cached_edge_index = None
        self._cached_adj_t = None

    def forward(self, x: Tensor, edge_index: Adj,
                edge_weight: OptTensor = None) -> Tensor:
        """Normalize the graph (optionally from cache), apply the linear map,
        propagate over the graph and add the bias."""

        if self.normalize:
            if isinstance(edge_index, Tensor):
                cache = self._cached_edge_index
                if cache is None:
                    edge_index, edge_weight = gcn_norm(  # yapf: disable
                        edge_index, edge_weight, x.size(self.node_dim),
                        self.improved, self.add_self_loops, self.flow, x.dtype)
                    if self.cached:
                        self._cached_edge_index = (edge_index, edge_weight)
                else:
                    edge_index, edge_weight = cache[0], cache[1]

            elif isinstance(edge_index, SparseTensor):
                cache = self._cached_adj_t
                if cache is None:
                    edge_index = gcn_norm(  # yapf: disable
                        edge_index, edge_weight, x.size(self.node_dim),
                        self.improved, self.add_self_loops, self.flow, x.dtype)
                    if self.cached:
                        self._cached_adj_t = edge_index
                else:
                    edge_index = cache

        x = self.lin(x)

        # propagate_type: (x: Tensor, edge_weight: OptTensor)
        out = self.propagate(edge_index, x=x, edge_weight=edge_weight,
                             size=None)

        if self.bias is not None:
            out = out + self.bias

        return out

    def message(self, x_j: Tensor, edge_weight: OptTensor) -> Tensor:
        """Scale each neighbour feature row by its edge weight, if any."""
        return x_j if edge_weight is None else edge_weight.view(-1, 1) * x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        """Fused message + aggregation for sparse adjacencies."""
        return matmul(adj_t, x, reduce=self.aggr)

##   https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/conv/gcn2_conv.html   

class MP(MessagePassing):
    """Parameter-free message-passing operator.

    ``forward`` optionally applies ``gcn_norm`` to the graph (reusing a cached
    result when ``cached`` is set) and then calls ``propagate`` ``times``
    times in a row. The ``bias`` argument is accepted but not used.
    """

    _cached_edge_index: Optional[Tuple[Tensor, Tensor]]
    _cached_adj_t: Optional[SparseTensor]

    def __init__(self,
                 improved: bool = False, cached: bool = False,
                 add_self_loops: bool = True, normalize: bool = True,
                 bias: bool = True,times = 1, **kwargs):
        kwargs.setdefault('aggr', 'add')
        super().__init__(**kwargs)

        # Nothing is cached until the first normalized forward pass.
        self._cached_edge_index = None
        self._cached_adj_t = None

        self.times = times
        self.normalize = normalize
        self.add_self_loops = add_self_loops
        self.cached = cached
        self.improved = improved

    def forward(self, x: Tensor, edge_index: Adj,
                edge_weight: OptTensor = None) -> Tensor:
        """Propagate ``x`` over the (optionally normalized) graph
        ``self.times`` times and return the result."""
        if self.normalize and isinstance(edge_index, Tensor):
            stored = self._cached_edge_index
            if stored is not None:
                edge_index, edge_weight = stored[0], stored[1]
            else:
                edge_index, edge_weight = gcn_norm(  # yapf: disable
                    edge_index, edge_weight, x.size(self.node_dim),
                    self.improved, self.add_self_loops, self.flow, x.dtype)
                if self.cached:
                    self._cached_edge_index = (edge_index, edge_weight)

        elif self.normalize and isinstance(edge_index, SparseTensor):
            stored = self._cached_adj_t
            if stored is not None:
                edge_index = stored
            else:
                edge_index = gcn_norm(  # yapf: disable
                    edge_index, edge_weight, x.size(self.node_dim),
                    self.improved, self.add_self_loops, self.flow, x.dtype)
                if self.cached:
                    self._cached_adj_t = edge_index

        out = x
        for _ in range(self.times):
            out = self.propagate(edge_index, x=out, edge_weight=edge_weight,
                                 size=None)
        return out

    def message(self, x_j: Tensor, edge_weight: OptTensor) -> Tensor:
        """Scale each neighbour feature row by its edge weight, if any."""
        if edge_weight is None:
            return x_j
        return edge_weight.view(-1, 1) * x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        """Fused message + aggregation for sparse adjacencies."""
        return matmul(adj_t, x, reduce=self.aggr)

In [4]:
# load data
# Use the first GPU when one is available; otherwise fall back to the CPU so
# this cell does not crash on CPU-only machines.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
dataset = Planetoid(root='./data', name="Cora")
data = dataset[0].to(device)

# Boolean node masks of the standard split, kept on the same device as `data`.
train_mask = data.train_mask.to(device)
val_mask = data.val_mask.to(device)
test_mask = data.test_mask.to(device)
train_idx= train_mask

# Fraction of nodes used for training and overall scale of the input features.
print("train set:", train_mask.sum()/data.x.shape[0])
print("data norm: ", torch.norm(data.x))

input_dim, output_dim = dataset.num_features, dataset.num_classes

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


train set: tensor(0.0517, device='cuda:0')
data norm:  tensor(221.8468, device='cuda:0')


In [5]:
# set seed functions and prepare model for computing the GEV
import random
import numpy as np

# Seed every random number generator this notebook touches with 42.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
    torch.cuda.manual_seed(42)

# Propagation operator used by the GEV computation; with cached=True the
# normalized graph from its first call is reused on every later call.
messagepa = MP(cached=True).to(device)
def new_smoothness_gpu(x, edge_index=None, propagate=None):
    """Return the GEV (smoothness) of the columns of ``x`` on a graph.

    For every non-zero column ``u`` the ratio ``u^T (u - P u) / (u^T u)`` is
    accumulated, where ``P u`` is ``propagate(u, edge_index)``. All-zero
    columns contribute 0. The result is ``0.5 * sum / x.shape[1]``.

    Args:
        x: 2-D tensor whose columns are evaluated one by one.
        edge_index: Graph passed to ``propagate``. Defaults to the notebook
            global ``data.edge_index``.
        propagate: Callable ``(column, edge_index) -> propagated column``.
            Defaults to the notebook global ``messagepa``. That operator is
            built with ``cached=True``, so it keeps using the graph from its
            first call; pass a fresh operator when changing ``edge_index``.

    Returns:
        Tensor of shape ``(1,)`` on the device of ``edge_index``.
    """
    if edge_index is None:
        edge_index = data.edge_index
    if propagate is None:
        propagate = messagepa

    # Work on the device that holds the graph.
    target = edge_index.device
    smooth = torch.zeros(1, device=target)
    for k in range(x.shape[1]):
        u = x[:, k].to(target)
        # torch.dot avoids `u.T` on a 1-D tensor, which is deprecated and
        # emits a UserWarning; the value is also reused as the denominator.
        energy = torch.dot(u, u)
        if energy == 0:
            continue
        column = u.reshape(-1, 1)
        smooth += torch.matmul(u, column - propagate(column, edge_index)) / energy
    return 0.5 * smooth / x.shape[1]

In [6]:
class MyGCN(torch.nn.Module):
    """Stack of ``GCNConv`` layers with a shared activation and dropout.

    Args:
        input_dim: Size of the input node features.
        output_dim: Size of the output (number of classes).
        hidden_dim_list: Sizes of the hidden layers; the network has
            ``len(hidden_dim_list) + 1`` convolutions.
        initialization: Forwarded to every ``GCNConv`` as ``initialization``.
        activation: ``"ReLU"``, ``"Tanh"`` or ``"LeakyReLU"``.
        withbias: Forwarded to every ``GCNConv`` as ``bias``.
        dropout: Dropout probability applied after each hidden activation.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(self, input_dim, output_dim, hidden_dim_list, initialization,activation,withbias,dropout):
        super(MyGCN,self).__init__()

        # Fail early with a clear message instead of an AttributeError on the
        # first forward pass when the activation name is not recognized.
        activations = {"ReLU": ReLU, "Tanh": Tanh, "LeakyReLU": LeakyReLU}
        if activation not in activations:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {sorted(activations)}")

        self.dropout = dropout
        self.activation = activations[activation]()

        # One convolution per consecutive pair of layer sizes.
        dim_list = [input_dim] + list(hidden_dim_list) + [output_dim]
        layers = [
            (GCNConv(dim_in, dim_out, cached=True, bias=withbias,
                     initialization=initialization), 'x, edge_index -> x')
            for dim_in, dim_out in zip(dim_list[:-1], dim_list[1:])
        ]
        self.convs = torch_geometric.nn.Sequential('x, edge_index', layers)

    def print_all_x(self, x, edge_index):
        """Return a list with the activated output of every hidden layer
        (taken before dropout) followed by the raw output of the last layer."""
        z = []
        for i in range(len(self.convs) - 1):
            x = self.activation(self.convs[i](x, edge_index))
            z.append(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        z.append(self.convs[-1](x, edge_index))
        return z

    def print_x(self, x, edge_index):
        """Return the raw output of the last layer (before log-softmax)."""
        return self.print_all_x(x, edge_index)[-1]

    def forward(self, x, edge_index):
        """Return the log-softmax over dim 1 of the last layer's output."""
        y = self.print_x(x, edge_index)
        return F.log_softmax(y, dim=1)

In [7]:
import random
import numpy as np

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        seed: Seed value passed to every generator (default: 42).

    Besides seeding, this turns off cuDNN's deterministic mode and PyTorch's
    deterministic-algorithm enforcement.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = False
    torch.use_deterministic_algorithms(False)

# function to read the gradients in models
def compute_weight_grad(model):
    """Return the gradient norm of every parameter whose name contains "weight".

    Args:
        model: Module exposing ``named_parameters()``.

    Returns:
        1-D NumPy array with one norm per matching parameter, in
        ``named_parameters()`` order. A parameter without a gradient
        (``grad is None``) is reported as 0 so that positions stay aligned
        with the layers. The array is empty when nothing matches.
    """
    norms = []
    for name, param in model.named_parameters():
        if "weight" not in name:
            continue
        if param.grad is None:
            # No backward pass reached this parameter: report a zero norm
            # instead of failing on a missing gradient.
            norms.append(torch.zeros((), dtype=param.dtype, device=param.device))
        else:
            norms.append(param.grad.detach().norm())
    if not norms:
        return torch.empty(0).numpy()
    # Stack on the parameters' device and transfer to the host once.
    return torch.stack(norms).cpu().numpy()

#  Conventional Initialization

In [8]:
layer_list = [4,8,16,32,64,128,256] # controls the number of layers of the GCNs

for layer in layer_list:
    hidden=64
    hidden_dim_list = [hidden]*(layer-1)
    set_seed()
    # "conventional" selects the conventional (uniform) initialization.
    # The model goes to the configured `device` rather than a hard-coded GPU.
    model = MyGCN(input_dim, output_dim, hidden_dim_list,initialization="conventional", activation = "Tanh",withbias=True,dropout = 0).to(device)

    # One forward pass yields the outputs of all layers; its last entry is
    # the raw network output, from which the log-probabilities follow.
    out = model.print_all_x(data.x, data.edge_index)
    out_final = F.log_softmax(out[-1], dim=1)
    loss = F.nll_loss(out_final[data.train_mask], data.y[data.train_mask])
    loss.backward()

    # Reuse the detached network output for the metrics instead of running
    # the network again for each of them.
    logits = out[-1].detach()
    fsp = torch.norm(logits)/torch.norm(data.x)
    diri = new_smoothness_gpu(logits)

    print(f"for {layer} layers GCN Conventional initializations")
    print(f"FSP (output norm/ input norm): {fsp}")
    print(f"BSP (gradient norm of the first layer): {compute_weight_grad(model)[0]}")
    print(f"GEV: {diri.item()}")
    print("\n\n")

for 4 layers GCN Conventional initializations
FSP (output norm/ input norm): 0.0030888437759131193
BSP (gradient norm of the first layer): 0.05142880603671074
GEV: 0.02799740433692932



for 8 layers GCN Conventional initializations
FSP (output norm/ input norm): 0.0002364598331041634
BSP (gradient norm of the first layer): 0.00422302633523941
GEV: 0.0138296689838171



for 16 layers GCN Conventional initializations
FSP (output norm/ input norm): 2.5642777927714633e-06
BSP (gradient norm of the first layer): 4.4395026634447277e-05
GEV: 0.005229322239756584



for 32 layers GCN Conventional initializations
FSP (output norm/ input norm): 3.759946798354008e-10
BSP (gradient norm of the first layer): 5.23944887476091e-09
GEV: 0.0009726702701300383





  if torch.matmul(u.T,u) == 0:


for 64 layers GCN Conventional initializations
FSP (output norm/ input norm): 7.367102973149392e-18
BSP (gradient norm of the first layer): 9.853266268890809e-17
GEV: 0.0005189272342249751



for 128 layers GCN Conventional initializations
FSP (output norm/ input norm): 0.0
BSP (gradient norm of the first layer): 0.0
GEV: 0.0



for 256 layers GCN Conventional initializations
FSP (output norm/ input norm): 0.0
BSP (gradient norm of the first layer): 0.0
GEV: 0.0





# Xavier Initialization

In [9]:
layer_list = [4,8,16,32,64,128,256]

for layer in layer_list:
    hidden=64
    hidden_dim_list = [hidden]*(layer-1)
    set_seed()
    # "glorot" selects the Xavier initialization.
    # The model goes to the configured `device` rather than a hard-coded GPU.
    model = MyGCN(input_dim, output_dim, hidden_dim_list,initialization="glorot", activation = "Tanh",withbias=True,dropout = 0).to(device)

    # One forward pass yields the outputs of all layers; its last entry is
    # the raw network output, from which the log-probabilities follow.
    out = model.print_all_x(data.x, data.edge_index)
    out_final = F.log_softmax(out[-1], dim=1)
    loss = F.nll_loss(out_final[data.train_mask], data.y[data.train_mask])
    loss.backward()

    # Reuse the detached network output for the metrics instead of running
    # the network again for each of them.
    logits = out[-1].detach()
    fsp = torch.norm(logits)/torch.norm(data.x)
    diri = new_smoothness_gpu(logits)

    print(f"for {layer} layers GCN using Xavier initializations")
    print(f"FSP (output norm/ input norm): {fsp}")
    print(f"BSP (gradient norm of the first layer): {compute_weight_grad(model)[0]}")
    print(f"GEV: {diri.item()}")
    print("\n\n")

for 4 layers GCN using Xavier initializations
FSP (output norm/ input norm): 0.05046107992529869
BSP (gradient norm of the first layer): 0.3500046730041504
GEV: 0.028234850615262985



for 8 layers GCN using Xavier initializations
FSP (output norm/ input norm): 0.0336218997836113
BSP (gradient norm of the first layer): 0.25388163328170776
GEV: 0.014304648153483868



for 16 layers GCN using Xavier initializations
FSP (output norm/ input norm): 0.027770785614848137
BSP (gradient norm of the first layer): 0.2107459008693695
GEV: 0.005845851264894009



for 32 layers GCN using Xavier initializations
FSP (output norm/ input norm): 0.0256253182888031
BSP (gradient norm of the first layer): 0.15439654886722565
GEV: 0.0010823275661095977



for 64 layers GCN using Xavier initializations
FSP (output norm/ input norm): 0.01974853128194809
BSP (gradient norm of the first layer): 0.1169433444738388
GEV: 0.0005557640106417239



for 128 layers GCN using Xavier initializations
FSP (output norm/ inp

# SPoGInit

In [10]:
layer_list = [4,8,16,32,64,128,256]
FSP_list = []
BSP_list = []
diri_list = []
for layer in layer_list:
    hidden=64
    hidden_dim_list = [hidden]*(layer-1)
    set_seed()
    # Start from a Xavier-initialized model on the configured `device`.
    model = MyGCN(input_dim, output_dim, hidden_dim_list,initialization="glorot", activation = "Tanh",withbias=True,dropout = 0).to(device)
    # Build the SPoGInit helper; the last argument selects how the estimated
    # BSP and FSP are computed.
    Spog = SpogInit(model,data.edge_index,data.train_mask,data.x.shape[0],data.x.shape[1],dataset.num_classes, device, "divide_old")
    model.eval()
    # Search for a better initialization with SPoGInit.
    #   lr            learning rate of the initialization optimization
    #   w1, w2, w3    weights of FSP, BSP and GEV
    #   max_pati      patience: stop once the metric has not decreased for
    #                 more than this many steps
    #   steps         maximum number of optimization steps
    #   generate_data whether to use random x and y so that the found
    #                 initialization is robust for the graph
    Spog.zeroincrease_initialization(x = data.x, y = data.y,lr = 0.05,w2=10,max_pati=10,steps=100, generate_data = False)

    # One forward pass yields the outputs of all layers; its last entry is
    # the raw network output, from which the log-probabilities follow.
    out = model.print_all_x(data.x, data.edge_index)
    out_final = F.log_softmax(out[-1], dim=1)
    loss = F.nll_loss(out_final[data.train_mask], data.y[data.train_mask])
    # NOTE(review): if SpogInit leaves gradients on the parameters, they are
    # accumulated into this backward pass -- confirm whether a zero_grad()
    # is needed here.
    loss.backward()

    # Store plain Python numbers: appending the tensors themselves would keep
    # every model's autograd graph alive for the rest of the notebook.
    logits = out[-1].detach()
    FSP_list.append((torch.norm(logits)/torch.norm(data.x)).item())
    BSP_list.append(compute_weight_grad(model)[0])
    diri = new_smoothness_gpu(logits)
    diri_list.append(diri.item())


0-the iteration total metric tensor([5.7977], device='cuda:0') sigmas: tensor([1.0500, 1.0500, 1.0500, 1.0500])
1-the iteration total metric tensor([3.8591], device='cuda:0') sigmas: tensor([1.0850, 1.0150, 1.0150, 1.0850])
2-the iteration total metric tensor([4.2671], device='cuda:0') sigmas: tensor([1.1200, 1.0500, 1.0500, 1.1200])
3-the iteration total metric tensor([3.2076], device='cuda:0') sigmas: tensor([1.1550, 1.0850, 1.0850, 1.1550])
4-the iteration total metric tensor([2.4075], device='cuda:0') sigmas: tensor([1.1200, 1.1200, 1.1200, 1.1900])
5-the iteration total metric tensor([1.7844], device='cuda:0') sigmas: tensor([1.0850, 1.1550, 1.0850, 1.2250])
6-the iteration total metric tensor([1.5899], device='cuda:0') sigmas: tensor([1.0500, 1.1200, 1.1200, 1.2600])
7-the iteration total metric tensor([1.4572], device='cuda:0') sigmas: tensor([1.0850, 1.1550, 1.1550, 1.2950])
8-the iteration total metric tensor([1.0797], device='cuda:0') sigmas: tensor([1.0500, 1.1900, 1.1200, 1

In [11]:
# Print the metrics collected for each network depth by the SPoGInit run.
for idx, depth in enumerate(layer_list):
    print(f"for {depth} layers GCN using SpogInit initializations")
    print(f"FSP (output norm/ input norm): {FSP_list[idx]}")
    print(f"BSP (gradient norm of the first layer): {BSP_list[idx]}")
    print(f"GEV: {diri_list[idx]}")
    print("\n\n")

for 4 layers GCN using SpogInit initializations
FSP (output norm/ input norm): 0.1502782702445984
BSP (gradient norm of the first layer): 1.1928271055221558
GEV: 0.028637219220399857



for 8 layers GCN using SpogInit initializations
FSP (output norm/ input norm): 0.1554192304611206
BSP (gradient norm of the first layer): 1.3261399269104004
GEV: 0.01603308506309986



for 16 layers GCN using SpogInit initializations
FSP (output norm/ input norm): 0.13850003480911255
BSP (gradient norm of the first layer): 1.5851898193359375
GEV: 0.00859205424785614



for 32 layers GCN using SpogInit initializations
FSP (output norm/ input norm): 0.1921989470720291
BSP (gradient norm of the first layer): 1.401915431022644
GEV: 0.0019310959614813328



for 64 layers GCN using SpogInit initializations
FSP (output norm/ input norm): 0.08005011081695557
BSP (gradient norm of the first layer): 0.5295993089675903
GEV: 0.0008343437220901251



for 128 layers GCN using SpogInit initializations
FSP (output norm