#Importing libraries and GitHub repo cloning

In [None]:
!pip install torch_geometric

In [None]:
import pickle

# Visualization related imports
import matplotlib.pyplot as plt
import networkx as nx

# Main computation libraries
import scipy.sparse as sp
import numpy as np

# Deep learning related imports
import torch

import collections
import collections.abc

import os
import typing
import torch.nn.functional as F
import torch_geometric.datasets as datasets

from torch_geometric.nn import GCNConv

In [None]:
!pip install munch

In [None]:
!pip install ruamel.yaml

In [None]:
!pip install tap

In [None]:
# Re-expose the abstract base classes under their historical top-level
# ``collections`` names (they live in ``collections.abc``).
# NOTE(review): presumably required by one of the third-party packages
# installed above, which still looks these names up on ``collections``.
for _abc_name in ("Mapping", "MutableMapping", "Callable"):
    setattr(collections, _abc_name, getattr(collections.abc, _abc_name))
del _abc_name

#Loading models

In [None]:
from gcn import GCN
from gat import GAT

#Loading GOOD datasets

In [None]:
from digcopy.dig.oodgraph import GOODMotif,GOODHIV,GOODCBAS,GOODCora,GOODArxiv

In [None]:
# GOODCora, 'degree' domain, covariate shift. `load` returns a (dataset, meta) pair;
# the first element of the dataset is what the rest of the notebook works with.
# NOTE(review): 'datasets' is presumably the dataset root folder (cf. the
# `dataset_root=` keyword used for GOODCBAS below) — confirm.
data_cora_degree, meta_cora_degree = GOODCora.load('datasets', 'degree', shift='covariate')
good_cora_degree=data_cora_degree[0]

In [None]:
# GOODCora, 'word' domain, covariate shift; keeps the first element of the
# returned dataset together with its metadata.
data_cora_word, meta_cora_word = GOODCora.load('datasets', 'word', shift='covariate')
good_cora_word=data_cora_word[0]

In [None]:
# GOODArxiv, 'degree' domain, covariate shift; keeps the first element of the
# returned dataset together with its metadata. This is the dataset selected
# in the experiment configuration cell further down.
data_arxiv_degree, meta_arxiv_degree = GOODArxiv.load('datasets', 'degree', shift='covariate')
good_arxiv_degree=data_arxiv_degree[0]

In [None]:
# GOODArxiv, 'time' domain, covariate shift; keeps the first element of the
# returned dataset together with its metadata.
data_arxiv_time, meta_arxiv_time = GOODArxiv.load('datasets', 'time', shift='covariate')
good_arxiv_time=data_arxiv_time[0]

In [None]:
# GOODCBAS, 'color' domain, covariate shift, loaded from the 'datasets' root;
# keeps the first element of the returned dataset together with its metadata.
data_cbas, meta_cbas = GOODCBAS.load(dataset_root='datasets', domain='color', shift='covariate')
good_cbas=data_cbas[0]

#Training functions

In [None]:
import copy

def train(
    params: typing.Dict,good,meta,device,model=None,rate_train_mask=1,earlystop=True,nepochs=None,train_mask=None,optimizer=None,
) -> typing.Tuple[torch.nn.Module, typing.List[float], typing.List[float], typing.List[float], torch.Tensor]:
  """
    Train a node classification model full-batch and return it with its accuracy histories.

    Every epoch performs one optimisation step on the nodes selected by ``train_mask`` and
    then evaluates the model on ``data.test_mask`` and ``data.val_maskid``.

    Args :
        params (dict): Training hyper-parameters. Reads "lr", "weight_decay", "epochs",
            "max_patience" and "print_time"; "n_classes" and "input_dim" are written into it.
        good (Dataset): Graph data object exposing x, y, edge_index, train_mask, test_mask and val_maskid
        meta (dict): meta-data, accessed as ``meta.num_classes`` and ``meta.dim_node``
        device (PyTorch device): device the data is moved to
        model (torch.nn.Module): model to train (required; may be a pre-trained model for re-training)
        rate_train_mask (float in [0,1]) : fraction of the initial training set used as training data;
            only used when ``train_mask`` is None
        earlystop (bool) : determines if early stopping will be used to regularize
        nepochs (int) : number of epochs; defaults to params["epochs"] when None
        train_mask (torch.Tensor(bool)) : explicit training mask, to keep the training subset
            consistent across different models
        optimizer (PyTorch optimizer) : optimizer to use; an Adam optimizer is built from ``params`` when None
    Returns :
        model (PyTorch model) : trained model
        TrainACC (list of float) : history of train accuracies over the epochs
        TestACC (list of float) : history of test OOD accuracies
        ValIDACC (list of float) : history of validation ID accuracies
        train_mask (torch.Tensor(bool)) : training mask that was actually used
    Raises :
        ValueError : if ``model`` is None
  """

  if model is None:
    raise ValueError("train() requires an instantiated model, got model=None")

  # Set Device
  data = good.to(device)

  # Reduces the training set if needed by only selecting a fraction
  if train_mask is None:
    if rate_train_mask<1:
      # One uniform draw per training node (as in a per-node loop), done in a single vectorised call
      candidates=torch.nonzero(data.train_mask.cpu(),as_tuple=False).view(-1)
      draws=np.random.uniform(size=candidates.shape[0])
      kept=candidates[torch.from_numpy(draws<=rate_train_mask)]
      train_mask=torch.zeros(data.train_mask.shape[0],dtype=torch.bool)
      train_mask[kept]=True
    else:
      train_mask=data.train_mask.clone()
  # Keep the mask on the same device as the tensors it indexes
  train_mask=train_mask.to(data.y.device)

  # Training labels
  datatrainy=data.y[train_mask]

  # Update parameters
  params["n_classes"] = meta.num_classes # number of target classes
  print(f"num classes: {meta.num_classes}")
  params["input_dim"] = meta.dim_node # size of input features

  # Set Adam optimizer
  if optimizer is None:
    optimizer=torch.optim.Adam(model.parameters(),lr=params["lr"],weight_decay=params["weight_decay"])

  # Set loss
  criterion=torch.nn.CrossEntropyLoss()

  best_accuracy=0
  decreasing_accuracy_count=0
  losses=[]
  TESTACC=[]
  VALIDACC=[]
  TRAINACC=[]

  # nepochs allows to respecify the number of epochs in case it is not in the provided training_params
  if nepochs is None:
    nepochs=params["epochs"]

  for i in range(nepochs):
    # evaluate() leaves the model in eval mode, so training mode must be restored every epoch
    model.train()
    optimizer.zero_grad()

    # Forward and backward passes
    logits=model(data.x,data.edge_index)
    train_logits=logits[train_mask]
    loss=criterion(train_logits,datatrainy)
    loss.backward()
    preds=torch.argmax(train_logits,dim=1)
    accuracy=torch.sum(preds==datatrainy)/preds.shape[0]
    optimizer.step()

    # ID Validation and OOD Test performances
    testacc=evaluate(model,data,data.test_mask,printb=False)
    validacc=evaluate(model,data,data.val_maskid,printb=False)
    TESTACC.append(testacc.item())
    VALIDACC.append(validacc.item())
    TRAINACC.append(accuracy.item())
    losses.append(loss.item())

    # Early stopping
    # NOTE(review): patience is tracked on the OOD *test* accuracy, not on the ID validation
    # accuracy — confirm this is intended, since it uses test data for model selection.
    if testacc>best_accuracy:
      decreasing_accuracy_count=0
    else:
      decreasing_accuracy_count+=1
    best_accuracy=max(best_accuracy,testacc)
    if decreasing_accuracy_count>=params["max_patience"] and earlystop:
      print(f"Early stopping at epoch {i}")
      break

    #Displaying performance
    if i%params["print_time"]==0:
      print(f"Epoch {i}, Loss : {loss.item()}, Accuracy : {accuracy}")
      evaluate(model,data,data.test_mask,printb=True)

  return(model,TRAINACC,TESTACC,VALIDACC,train_mask)

  #apply train mask to dataset
  #define optimizer

In [None]:
def evaluate(
    model,
    data,
    mask,
    printb=True
):
    """
    Evaluate the accuracy of the model on the nodes selected by ``mask``.

    The model is switched to eval mode and is left in eval mode on return; callers
    that keep training afterwards must call ``model.train()`` again.

    Args :
        model (PyTorch model) : model to be evaluated, called as ``model(data.x, data.edge_index)``
        data (Dataset) : data on which evaluation is done (reads x, y and edge_index)
        mask (torch.Tensor(bool)) : mask determining the data subset on which evaluation is done
        printb (bool) : determines if the evaluated loss and accuracy are printed for the user
    Returns :
        accuracy (0-dim torch.Tensor) : fraction of correctly classified nodes under ``mask``
    """

    model.eval()
    # No gradients are needed for evaluation
    with torch.no_grad():
      masked_logits=model(data.x,data.edge_index)[mask]
      targets=data.y[mask]
      preds=torch.argmax(masked_logits,dim=1)
      accuracy=torch.sum(preds==targets)/preds.shape[0]
      if printb:
        # The loss is only reported, never returned, so compute it only when printing
        loss=F.cross_entropy(masked_logits,targets)
        print(f"Eval Loss : {loss.item()}, Eval Accuracy :{accuracy}")
    return accuracy

#Data preprocessing

In [None]:
def add_validation_maskid(data,rate=0.2):

  """
  Carve an In-Distribution validation set out of the training set, in place.

  Each node currently flagged in ``data.train_mask`` is moved to the validation set
  with probability ``rate`` (one ``np.random.uniform`` draw per training node, so the
  split is reproducible via ``np.random.seed``). Selected nodes are cleared from
  ``data.train_mask`` and flagged in the newly created ``data.val_maskid``.

  Args :
      data (Dataset) : object exposing a 1-D boolean ``train_mask``; modified in place
      rate (float) : probability for a training node to be moved to validation (default 0.2)
  """

  train_mask=data.train_mask
  # Indices of the current training nodes, and one uniform draw for each of them
  train_idx=torch.nonzero(train_mask,as_tuple=False).view(-1)
  draws=np.random.uniform(size=train_idx.shape[0])
  picked=train_idx[torch.from_numpy(draws<=rate).to(train_idx.device)]

  # Build the validation mask on the same device as the training mask
  val_maskid=torch.zeros_like(train_mask,dtype=torch.bool)
  val_maskid[picked]=True
  train_mask[picked]=False
  data.val_maskid=val_maskid

# Create the ID validation split for the datasets used in the experiments below;
# the calls for the other loaded datasets are kept commented out.
# add_validation_maskid mutates train_mask in place and draws at random, so
# re-running this cell removes further nodes from the training set and
# replaces the previous val_maskid.
#add_validation_maskid(good_cora_degree)
#add_validation_maskid(good_cora_word)
add_validation_maskid(good_arxiv_degree)
add_validation_maskid(good_arxiv_time)
#add_validation_maskid(good_cbas)

In [1]:
import copy

def retrainings(n,data,meta,model,train_mask,optimizer,device):
  """
  Re-training procedure at the heart of our experimental setup. This function takes a pre-trained model as input, performs n independent re-trainings (each starting from a fresh copy of the model and of the optimizer state), gathers the corresponding accuracies and returns them concatenated run after run.

  The hyper-parameters (including the number of epochs per re-training) are read from the
  module-level ``training_params`` dictionary, which must be defined before calling.

  Args :
      n (int) : number of re-trainings
      data (Dataset) : dataset used
      meta (dict) : metadata of the dataset used
      model (PyTorch model) : pre-trained model; it is deep-copied and never modified
      train_mask (torch.Tensor(bool)) : mask to specify the training set so that it is consistent with the pre-training
      optimizer (PyTorch optimizer) : pre-training optimizer, whose state is copied here to guarantee stable first re-training epochs
      device (PyTorch device)

  Returns :
      AllTrainAcc (np.ndarray) : Concatenated training accuracies
      AllTestOODAcc (np.ndarray) : Concatenated test OOD accuracies
      AllValIDAcc (np.ndarray) : Concatenated validation ID accuracies
  """
  LTrainAcc,LTestOODAcc,LValIDAcc=[],[],[]

  for _ in range(n):
    #Copy pretrained model
    model2=copy.deepcopy(model)

    #Copy pre-training optimizer: same class and settings, bound to the copied
    #model's parameters, then loaded with the pre-training optimizer state
    copied_optimizer=type(optimizer)(model2.parameters(),**optimizer.defaults)
    copied_optimizer.load_state_dict(optimizer.state_dict())

    #Re-training of the model
    _,TrainAcc,TestOODAcc,ValIDAcc,_=train(training_params,data,meta,model=model2,earlystop=False,train_mask=train_mask,optimizer=copied_optimizer,device=device)
    LTrainAcc.append(np.array(TrainAcc))
    LTestOODAcc.append(np.array(TestOODAcc))
    LValIDAcc.append(np.array(ValIDAcc))

  #Concatenation of obtained accuracies (all three series end up with the same length)
  AllTrainAcc,AllTestOODAcc,AllValIDAcc=(
      np.concatenate(runs) if runs else np.array([])
      for runs in (LTrainAcc,LTestOODAcc,LValIDAcc)
  )

  return(AllTrainAcc,AllTestOODAcc,AllValIDAcc)

#Experiments

##Training procedures

In [None]:
# Hyper-parameters shared by the pre-training and re-training runs
training_params = dict(
    lr=0.004,  # learning rate
    weight_decay=0.0005,  # weight_decay
    epochs=200,  # number of total training epochs
    max_patience=10,  # number of k for early stopping
    hid_dim=128,  # size of hidden features
    n_layers=2,  # number of layers
    n_heads=1,
    print_time=20,
    model_name="GCN",
    n_layers_mlp=2,
)

# Use the GPU when one is available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Dataset (and matching metadata) used for the experiments
data = good_arxiv_degree
meta = meta_arxiv_degree

# Build the architecture selected by training_params["model_name"]
_model_name = training_params["model_name"]
if _model_name in ("GCN", "GAT"):
    _model_cls = GCN if _model_name == "GCN" else GAT
    _net = _model_cls(
        meta["dim_node"],
        training_params["hid_dim"],
        training_params["n_heads"],
        meta["num_classes"],
        training_params["n_layers"],
        n_layers_mlp=training_params["n_layers_mlp"],
    )
elif _model_name == "GATv2":
    # NOTE(review): GATv2 is not imported in the cells shown here — selecting
    # it requires importing the class first.
    _net = GATv2(
        meta["dim_node"],
        training_params["hid_dim"],
        training_params["n_heads"],
        meta["num_classes"],
        training_params["n_layers"],
    )
else:
    raise NotImplementedError
model = _net.to(device)

# Adam optimizer used for the pre-training
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=training_params["lr"],
    weight_decay=training_params["weight_decay"],
)

In [None]:
#PRETRAINING
# Trains `model` on the full training split (rate_train_mask=1) for 500 epochs
# with early stopping disabled. The returned train_mask and the `optimizer`
# passed here are handed to retrainings() in the next cell.
model,TrainAcc,TestOODAcc,ValIDAcc,train_mask=train(training_params,data,meta,device=device,nepochs=500,rate_train_mask=1,optimizer=optimizer,model=model,earlystop=False)

In [None]:
# Number of independent re-trainings started from the pre-trained model
n_runs=5
# retrainings() reads the module-level training_params, so the number of
# epochs per re-training is set by mutating the dictionary here.
training_params["epochs"]=16
# Overwrites the pre-training histories with the re-training ones
TrainAcc,TestOODAcc,ValIDAcc=retrainings(n_runs,data, meta,model=model,train_mask=train_mask,optimizer=optimizer,device=device)

##Results Plots

In [None]:
#Training curves

# One x position per recorded accuracy value
X=list(range(len(TrainAcc)))
plt.plot(X,TrainAcc,label="Training accuracy")
plt.plot(X,TestOODAcc,label="Test OOD accuracy")
plt.plot(X,ValIDAcc,label="Val ID accuracy")
# Dotted horizontal guides at the best validation-ID and test-OOD accuracies
plt.plot(X,[max(ValIDAcc)]*len(X),color="green",linestyle="dotted")
plt.plot(X,[max(TestOODAcc)]*len(X),color="orange",linestyle="dotted")
plt.legend()
# The plotted series are accuracies, so the title says so.
# NOTE(review): the dataset name and the "averaged runs" wording are hard-coded;
# confirm they match the dataset and aggregation selected in the cells above.
plt.title(f"Evolution of accuracies for {n_runs} averaged runs : CORA Dataset")
# Span exactly the recorded values instead of a fixed 500-epoch window
plt.xlim(0,len(X))
plt.xlabel("Epochs")
plt.ylabel("Accuracy [%]")
plt.tight_layout()
plt.grid()
plt.show()

In [None]:
#Correlation plots

# Scatter of validation-ID accuracy against test-OOD accuracy for the first 100
# recorded values. Only the marker goes in the format string: giving a colour
# there as well as through `color=` is redundant and makes matplotlib warn.
plt.plot(TestOODAcc[0:100],ValIDAcc[0:100],"o",color="grey")
plt.ylabel("Accuracy (%, val-id)")
plt.xlabel("Accuracy (%, test-ood)")
plt.grid()
plt.show()

In [None]:
# Best test-OOD accuracy, then best validation-ID accuracy, one per line
print(max(TestOODAcc), max(ValIDAcc), sep="\n")

##Indicative - Graph visualization functions

In [None]:
import torch
import matplotlib.pyplot as plt
import networkx as nx
from torch_geometric.utils import to_networkx

def plot_graph(edge_index, node_features=None):

    """
    Draw the graph described by ``edge_index`` with a spring layout.

    Only nodes that appear in at least one edge are drawn. When ``node_features``
    is given, the entry ``node_features[node_id]`` is written just above each node.

    Args :
        edge_index : edges of the graph; ``edge_index.t().tolist()`` must yield (source, target) pairs
        node_features torch.Tensor (float) : node features of the graph, indexed by node id
            (assumes the ids used in ``edge_index`` are valid indices into it)
    """
    G = nx.Graph()
    G.add_edges_from(edge_index.t().tolist())

    # Fixed seed so the layout is reproducible between calls
    pos = nx.spring_layout(G, seed=42)
    plt.figure(figsize=(8, 6))

    nx.draw(
        G,
        pos,
        with_labels=True,
        node_color='lightblue',
        edge_color='gray',
        node_size=500,
        font_size=10
    )

    if node_features is not None:
        # Look features up by node id: the iteration order of `pos` follows the
        # order in which nodes first appear in the edges, not the node ids.
        for node, (x, y) in pos.items():
            plt.text(
                x, y + 0.05,
                s=f"{node_features[node]}",
                horizontalalignment='center',
                fontsize=8,
                bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5)
            )

    plt.title("Graph Visualization")
    plt.show()


# NOTE(review): `good_cora` is not defined in any cell shown in this notebook
# (the Cora objects loaded above are `good_cora_degree` and `good_cora_word`) —
# confirm which graph is meant before running this cell.
plot_graph(good_cora[0].edge_index)



##Indicative - influence of number of parameters (not included in the mini-project report)

In [None]:
import copy

def average_runs(n,data,meta,model,train_mask,optimizer,device):
  """
  Run n independent re-trainings of a pre-trained model and average their accuracy curves.

  Each run starts from a deep copy of ``model`` and from a new optimizer of the same
  class and settings, loaded with the state of ``optimizer``. Training is delegated to
  ``train`` with early stopping disabled, using the module-level ``training_params``.

  Args :
      n (int) : number of re-trainings
      data (Dataset) : dataset used
      meta (dict) : metadata of the dataset used
      model (PyTorch model) : pre-trained model (copied for every run)
      train_mask (torch.Tensor(bool)) : training mask, kept identical to the pre-training one
      optimizer (PyTorch optimizer) : pre-training optimizer whose settings and state are copied
      device (PyTorch device)

  Returns :
      AvgTrainAcc (np.ndarray) : per-epoch training accuracies averaged over the runs
      AvgTestOODAcc (np.ndarray) : per-epoch test OOD accuracies averaged over the runs
      AvgValIDAcc (np.ndarray) : per-epoch validation ID accuracies averaged over the runs
  """

  train_runs,ood_runs,id_runs=[],[],[]
  for _ in range(n):
    # Fresh copies so that the runs are independent of each other
    run_model=copy.deepcopy(model)
    run_optimizer=type(optimizer)(run_model.parameters(),**optimizer.defaults)
    run_optimizer.load_state_dict(optimizer.state_dict())

    outcome=train(training_params,data,meta,model=run_model,earlystop=False,train_mask=train_mask,optimizer=run_optimizer,device=device)
    train_runs.append(np.array(outcome[1]))
    ood_runs.append(np.array(outcome[2]))
    id_runs.append(np.array(outcome[3]))

  # Average over the run axis, keeping one value per epoch
  return tuple(
      np.mean(np.array(runs),axis=0)
      for runs in (train_runs,ood_runs,id_runs)
  )

In [None]:
# Sweep over hidden sizes and record, per epoch, the gap between training
# accuracy and the ID / OOD accuracies returned by average_runs.
# NOTE(review): this cell does not match the rest of the notebook as shown:
#   - average_runs is defined with seven required parameters
#     (n, data, meta, model, train_mask, optimizer, device) but receives three;
#   - good_cora_time / meta_cora_time are not defined in the visible cells;
#   - changing training_params["hid_dim"] does not rebuild `model`.
# Confirm the intended call before running.
n_runs=1
hid_dim=[16,32,64,128,256,512]
IDTrainDiff=[]
OODTrainDiff=[]
for h in hid_dim:
  training_params["hid_dim"]=h
  AvgTrainAcc,AvgTestOODAcc,AvgTestIDAcc=average_runs(n_runs,good_cora_time, meta_cora_time)
  IDTrainDiff.append(np.array(AvgTrainAcc)-np.array(AvgTestIDAcc))
  OODTrainDiff.append(np.array(AvgTrainAcc)-np.array(AvgTestOODAcc))

In [None]:
# Global matplotlib defaults for the figures below: serif fonts, larger labels,
# high-resolution output and slightly thicker lines.
_PLOT_STYLE = {
    "font.family": "serif",
    "font.size": 12,
    "axes.labelsize": 14,
    "axes.titlesize": 14,
    "legend.fontsize": 12,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "figure.dpi": 300,
    "lines.linewidth": 1.5,
}
plt.rcParams.update(_PLOT_STYLE)

In [None]:
# Train-minus-ID accuracy gap per epoch, one curve per hidden size
X=list(range(len(IDTrainDiff[0])))
for idx,gap_curve in enumerate(IDTrainDiff):
  plt.plot(X,gap_curve,label=f"hid_dim : {hid_dim[idx]}")
plt.xlim(0,175)
plt.ylim(-0.05,0.29)
plt.xlabel("Epochs")
plt.ylabel("Train-Test accuracy [%]")
plt.title("CORA Dataset")
plt.legend()
plt.tight_layout()
plt.grid()
plt.show()

In [None]:
# Train-minus-OOD accuracy gap per epoch, one curve per hidden size
X=list(range(len(OODTrainDiff[0])))
for idx,gap_curve in enumerate(OODTrainDiff):
  plt.plot(X,gap_curve,label=f"hid_dim : {hid_dim[idx]}")
plt.xlim(0,175)
plt.xlabel("Epochs")
plt.ylabel("Train-Test Accuracy [%]")
plt.title("CORA Dataset")
plt.legend()
plt.tight_layout()
plt.grid()
plt.show()