In [None]:
import dgl
import dgl.data
from ConnectingEnvelopes import SimplicialFinder
import json
from tqdm.auto import tqdm

# Load the Cora dataset through DGL. `dataset` is kept around because a later
# cell reads `dataset.num_classes` from it.
dataset = dgl.data.CoraGraphDataset()
# The first item of the dataset is the graph used throughout this notebook.
cora_graph = dataset[0]

In [None]:
# Wrap the Cora graph in a SimplicialFinder.
heavy_duty = SimplicialFinder(cora_graph)

# Run six connectivity updates, showing a progress bar while they execute.
print("Finding simplices..")
for _update_round in tqdm(range(6)):
    heavy_duty.connectivity_update()

print("Saving data to json file")
# Keys and values are passed through str() so that the result can be dumped as JSON.
zero_skeleton_as_text = {
    str(key): str(value)
    for key, value in heavy_duty.zero_skeleton_dict.items()
}
standard_skeleton_as_text = {
    str(key): str(value)
    for key, value in heavy_duty.standard_0_skeleton_dict.items()
}
data_to_save = {
    "zero_skel_dict": zero_skeleton_as_text,
    "standard_0_skeleton_dict": standard_skeleton_as_text,
}

with open("./cora_simplices.json", "w") as file:
    json.dump(data_to_save, file)

In [None]:
# Sanity check on the highest simplex dimension that was found: the expectation
# is that nothing above dimension 4 exists (the model below is built with max_dim=4).
heavy_duty.highest_dimension()

In [None]:
import ast

# Read the JSON file back to confirm it was written correctly.
with open("./cora_simplices.json", 'r') as file:
    data = json.load(file)

# Keys and values were stored as strings; literal_eval turns each of them back
# into the Python literal it was created from.
cora_simplices = dict(
    (ast.literal_eval(key), ast.literal_eval(value))
    for key, value in data['zero_skel_dict'].items()
)

In [None]:
# Standardise the dictionary of simplices for AdjFunctor by grouping the values
# of `cora_simplices` under the first element of their key.
# The first element of each key is treated as the dimension (per the original
# variable naming — verify against SimplicialFinder).
cora_simpl_std = dict()
for key, value in cora_simplices.items():
    # setdefault creates the bucket the first time a dimension is seen, so the
    # dimensions keep their first-appearance order, and each append is O(1)
    # instead of copying the whole list on every iteration.
    cora_simpl_std.setdefault(key[0], []).append(value)

# Distinct dimensions, in order of first appearance.
dim_std = list(cora_simpl_std)

In [None]:
from AdjFunctor import AdjFunctor

# Build the AdjFunctor from the per-dimension simplex lists. Later cells read
# its `adj_graph` and `adj_graph_node_dict` attributes.
cora_adj = AdjFunctor(cora_simpl_std)
# NOTE(review): fill_edges() is left disabled — confirm whether it is still needed.
#cora_adj.fill_edges()

In [None]:
# Save the adjacency graph held by `cora_adj` to disk so it can be reloaded
# without rebuilding it. The file name suggests this is the graph before any
# features are attached.
from dgl.data.utils import save_graphs
name = 'cora_adj_no_feats'
save_graphs(name, cora_adj.adj_graph)

In [None]:
# Load the saved adjacency graph back from disk, if necessary.
from dgl.data.utils import load_graphs
name = 'cora_adj_no_feats'
# Per DGL's API, load_graphs returns a (graph_list, labels) pair; [0][0] picks
# the first graph from the list.
cora_adj_graph = load_graphs(name)[0][0]

In [None]:
#add feats to adj_graph without leaks
#warning! Use either leaky or non-leaky! 
from HigherFeats import CreateFeatures

# NOTE(review): this path ends in .json but is read back with dgl's load_graphs
# in the next cell — confirm which format CreateFeatures writes to `tosave`.
path = "./cora_adj_graph_feats_n_masks_nonleaky.json"
# Inputs: the reloaded adjacency graph, the node mapping kept by `cora_adj`, and
# the original Cora graph. data_leak=False selects the non-leaky variant.
feats_adj_graph = CreateFeatures(adj_graph=cora_adj_graph, 
                                  mapping=cora_adj.adj_graph_node_dict, original_graph=cora_graph, 
                                  tosave=path, data_leak = False)

In [None]:
# Verify that the adjacency graph with features was saved by loading it back.
# `cora_adj_noleaks` is the graph used for model construction and training below.
from dgl.data.utils import load_graphs

# Same path that was passed to CreateFeatures as `tosave`.
path = "./cora_adj_graph_feats_n_masks_nonleaky.json"
cora_adj_noleaks = load_graphs(path)[0][0]
#print(cora_adj_noleaks)

In [None]:
# Imports for the neural-network part of the notebook.
import torch
import os
# NOTE(review): wildcard import — it is unclear which torch.sparse names, if
# any, the notebook relies on; consider importing them explicitly.
from torch.sparse import *
# NOTE(review): dgl is already imported in the first cell, so this assignment
# runs after that import. If DGL selects its backend at import time this line
# has no effect — confirm, and move it above `import dgl` if so.
os.environ['DGLBACKEND'] = 'pytorch'
from pathlib import Path
from GSN import CinchNETConv
import torch.nn as nn
import torch.nn.functional as F
from utils import plot_losses, train, train_step, get_experiment_stats
from utils import test, characterize_performance, norm_plot
from scipy import stats

In [None]:
#define the model
class CinchNET(nn.Module):
    """Stack of ``CinchNETConv`` layers ending in a log-softmax.

    Parameters
    ----------
    graph
        Stored on the instance as ``self.graph``. ``forward`` works on the
        graph passed to it, not on this attribute.
    input_layer : int
        Input feature size of the first convolution.
    hidden_layers : int
        Output size of every non-final convolution, and the size of each
        batch-norm module.
    output_layer : int
        Output size of the final convolution.
    num_layers : int
        Requested depth. ``num_layers - 2`` hidden-to-hidden convolutions are
        placed between the first and the last one, so two convolutions are
        always built even when ``num_layers`` is smaller than 2.
    dropout
        Dropout probability applied after each non-final convolution.
    max_dim
        Forwarded to every ``CinchNETConv`` as ``maximum_dim``.
    """

    def __init__(self, graph, input_layer:int, hidden_layers:int, output_layer:int, num_layers:int, dropout, max_dim):
        # nn.Module has to be initialised before any attribute is assigned;
        # otherwise assigning a Module/Parameter-valued attribute raises.
        super().__init__()
        self.input_layer   = input_layer
        self.hidden_layers = hidden_layers
        self.output_layer  = output_layer
        self.num_layers    = num_layers
        self.dropout       = dropout
        self.graph         = graph
        self.maximum_dim   = max_dim

        # `convs` is registered before `bns` so the state_dict key order is
        # the same as before.
        self.convs = nn.ModuleList()
        self.bns   = nn.ModuleList()

        # First layer: input -> hidden.
        self.convs.append(CinchNETConv(in_feats=input_layer, out_feats=hidden_layers, maximum_dim=max_dim))
        self.bns.append(nn.BatchNorm1d(hidden_layers))

        # Intermediate layers: hidden -> hidden.
        for _ in range(num_layers - 2):
            self.convs.append(CinchNETConv(in_feats=hidden_layers, out_feats=hidden_layers, maximum_dim=max_dim))
            self.bns.append(nn.BatchNorm1d(hidden_layers))

        # Last layer: hidden -> output.
        self.convs.append(CinchNETConv(in_feats=hidden_layers, out_feats=output_layer, maximum_dim=max_dim))

    def forward(self, graph, input_features):
        """Run all convolutions on ``graph`` and return log-probabilities.

        Every convolution except the last is followed by ReLU and dropout
        (dropout is only active in training mode). The output of the last
        convolution goes through ``log_softmax`` over its last dimension.
        """
        # NOTE(review): self.bns is built and reset but never applied here —
        # confirm whether batch norm was meant to follow each hidden convolution.
        for conv in self.convs[:-1]:
            input_features = conv(graph, input_features)
            input_features = F.relu(input_features)
            input_features = F.dropout(input_features, p=self.dropout, training=self.training)
        input_features = self.convs[-1](graph, input_features)
        #change non-linearity here
        return input_features.log_softmax(dim=-1)

    def reset_parameters(self):
        """Re-initialise the parameters of every convolution and batch norm."""
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

In [None]:
num_class = dataset.num_classes
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model hyper-parameters; the hidden width is set equal to the input width (both 7165).
model_kwargs = {
    'graph': cora_adj_noleaks,
    'input_layer': 7165,
    'hidden_layers': 7165,
    'output_layer': num_class,
    'num_layers': 2,
    'dropout': 0.5,
    'max_dim': 4,
}
model = CinchNET(**model_kwargs).to(device)

# Directory and file where the best model is saved.
model_path = 'models'
Path(model_path).mkdir(parents=True, exist_ok=True)
cincnet_path = f"{model_path}/CinchNETe450l2noleaks.model"

# Arguments shared by the single training run here and the repeated experiments later.
train_args = {
    'graph': cora_adj_noleaks,
    'epochs': 200,
    'device': device,
    'save_path': cincnet_path,
    'lr': 5e-3,
    'es_criteria': 50,
    'labels': cora_adj_noleaks.ndata['label'],
}

train_losses, val_losses = train(model=model, verbose=True, **train_args)
plot_losses(train_losses, val_losses, log=True, modelname='CinchNETe450l2noleaks')

_ = characterize_performance(model, cora_adj_noleaks, cincnet_path, verbose=True)

In [None]:
# Repeat the training experiment several times and collect summary statistics.
n_experiments = 10
df_gcn = get_experiment_stats(
    model_cls=CinchNET, model_args=model_kwargs,
    train_args=train_args, n_experiments=n_experiments
)
# Mean and standard deviation of the test accuracy over the repeated runs.
test_acc_mean = df_gcn.loc['mean', 'test_acc']
test_acc_std = df_gcn.loc['std', 'test_acc']

#comparison with Topology Adaptive Graph Convolutional Network
test_acc_prev = 82.5
# NOTE(review): named "var", but it is passed below in the positions where
# norm_plot / ttest_ind_from_stats receive our standard deviation — confirm
# that 0.7 is a standard deviation and not a variance.
test_acc_var_prev = 0.7
print("Creating norm plots for comparison")
norm_plot(
    [
        (test_acc_prev, test_acc_var_prev, 'TAGCN'),
        (test_acc_mean, test_acc_std, 'CinchNET'),
    ],
    'Test Performance'
)

# Conduct a Welch's t-test to determine if the means are different with statistical significance
# NOTE(review): the reference result is assumed to come from 10 runs — confirm.
_, p = stats.ttest_ind_from_stats(
    test_acc_prev, test_acc_var_prev, 10,
    test_acc_mean, test_acc_std, n_experiments,
    equal_var=False,
)
# The improvement is measured against the reference accuracy `test_acc_prev`
# (the previously used name `test_acc_lb` was never defined).
print(f"Mean Test Accuracy Improvement: {(test_acc_mean - test_acc_prev):.4f}")
print(f"Probability that these are from the same performance distribution = {p*100:.0f}%")