### Node Level Comparative Study

Experiments

1. Effect of Increasing Depth

2. Effect of attention:
2A. Node Classification
2B Graph Glassification -> Notebook: Graph Classification Comparative Study

3. Comparison between GNN (GCn, GAT) and MLP ->  Notebook: SupervisedGNN

4. Effect of Changing Dropout

Dataset:
Cora

#### Experiments 1 & 2

In [1]:
## Standard libraries
import os
import json
import math
import numpy as np
import time

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgb
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
import seaborn as sns
sns.reset_orig()
sns.set()

## Progress bar
from tqdm.notebook import tqdm

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
# Torchvision
import torchvision
from torchvision.datasets import CIFAR10
from torchvision import transforms
# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
DATASET_PATH = "../data"

# Setting the seed
pl.seed_everything(85)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

Global seed set to 85


cpu


In [2]:
import torch_geometric
import torch_geometric.nn as geom_nn
import torch_geometric.data as geom_data

In [3]:
gnn_layer_by_name = {
    "GCN": geom_nn.GCNConv,
    "GAT": geom_nn.GATConv,
    "GraphConv": geom_nn.GraphConv
}

In [4]:
class GNNModel(nn.Module):

    def __init__(self, c_in, c_hidden, c_out, num_layers=2, layer_name="GCN", dp_rate=0.1, **kwargs):
        """
        Inputs:
            c_in - Dimension of input features
            c_hidden - Dimension of hidden features
            c_out - Dimension of the output features. Usually number of classes in classification
            num_layers - Number of "hidden" graph layers
            layer_name - String of the graph layer to use
            dp_rate - Dropout rate to apply throughout the network
            kwargs - Additional arguments for the graph layer (e.g. number of heads for GAT)
        """
        super().__init__()
        gnn_layer = gnn_layer_by_name[layer_name]

        layers = []
        in_channels, out_channels = c_in, c_hidden
        for l_idx in range(num_layers-1):
            layers += [
                gnn_layer(in_channels=in_channels,
                          out_channels=out_channels,
                          **kwargs),
                nn.ReLU(inplace=True),
                nn.Dropout(dp_rate)
            ]
            in_channels = c_hidden
        layers += [gnn_layer(in_channels=in_channels,
                             out_channels=c_out,
                             **kwargs)]
        self.layers = nn.ModuleList(layers)

    def forward(self, x, edge_index):
        """
        Inputs:
            x - Input features per node
            edge_index - List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
        """
        for l in self.layers:
            # For graph layers, we need to add the "edge_index" tensor as additional input
            # All PyTorch Geometric graph layer inherit the class "MessagePassing", hence
            # we can simply check the class type.
            if isinstance(l, geom_nn.MessagePassing):
                x = l(x, edge_index)
            else:
                x = l(x)
        return x

In [5]:
class GNNModelFlexible(nn.Module):

    def __init__(self, c_in, c_hidden, c_out, num_layers, layer_name="GCN", dp_rate=0.1, **kwargs):
        """
        Inputs:
            c_in - Dimension of input features
            c_hidden - Dimension of hidden features
            c_out - Dimension of the output features. Usually number of classes in classification
            num_layers - Number of "hidden" graph layers
            layer_name - String of the graph layer to use
            dp_rate - Dropout rate to apply throughout the network
            kwargs - Additional arguments for the graph layer (e.g. number of heads for GAT)
        """
        super().__init__()
        gnn_layer = gnn_layer_by_name[layer_name]

        layers = []
        in_channels = c_in
        for l_idx in range(num_layers-1):
            if l_idx == num_layers-1:
                out_channels = c_out
            else:
                out_channels = c_hidden[l_idx]
            layers += [
                gnn_layer(in_channels=in_channels,
                          out_channels=out_channels,
                          **kwargs),
                nn.ReLU(inplace=True),
                nn.Dropout(dp_rate)
            ]
            in_channels = out_channels
        layers += [gnn_layer(in_channels=in_channels,
                             out_channels=c_out,
                             **kwargs)]
        self.layers = nn.ModuleList(layers)

    def forward(self, x, edge_index):
        """
        Inputs:
            x - Input features per node
            edge_index - List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
        """
        for l in self.layers:
            # For graph layers, we need to add the "edge_index" tensor as additional input
            # All PyTorch Geometric graph layer inherit the class "MessagePassing", hence
            # we can simply check the class type.
            if isinstance(l, geom_nn.MessagePassing):
                x = l(x, edge_index)
            else:
                x = l(x)
        return x

In [6]:
class NodeLevelGNN(pl.LightningModule):

    def __init__(self, model_name, **model_kwargs):
        super().__init__()
        # Saving hyperparameters
        self.save_hyperparameters()

        if model_name == "MLP":
            self.model = MLPModel(**model_kwargs)
        elif num_layers == 2:
            self.model = GNNModel(**model_kwargs)     
        else:
            self.model = GNNModelFlexible(**model_kwargs)
        self.loss_module = nn.CrossEntropyLoss()

    def forward(self, data, mode="train"):
        x, edge_index = data.x, data.edge_index
        x = self.model(x, edge_index)

        # Only calculate the loss on the nodes corresponding to the mask
        if mode == "train":
            mask = data.train_mask
        elif mode == "val":
            mask = data.val_mask
        elif mode == "test":
            mask = data.test_mask
        else:
            assert False, f"Unknown forward mode: {mode}"

        loss = self.loss_module(x[mask], data.y[mask])
        acc = (x[mask].argmax(dim=-1) == data.y[mask]).sum().float() / mask.sum()
        return loss, acc

    def configure_optimizers(self):
        # We use SGD here, but Adam works as well
        optimizer = optim.SGD(self.parameters(), lr=0.1, momentum=0.9, weight_decay=2e-3)
        return optimizer

    def training_step(self, batch, batch_idx):
        loss, acc = self.forward(batch, mode="train")
        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss

    def validation_step(self, batch, batch_idx):
        _, acc = self.forward(batch, mode="val")
        self.log('val_acc', acc)

    def test_step(self, batch, batch_idx):
        _, acc = self.forward(batch, mode="test")
        self.log('test_acc', acc)

In [7]:
def train_node_classifier(model_name, dataset, **model_kwargs):
    pl.seed_everything(42)
    node_data_loader = geom_data.DataLoader(dataset, batch_size=1)

    # Create a PyTorch Lightning trainer with the generation callback
    root_dir = os.path.join(CHECKPOINT_PATH, "NodeLevel" + model_name)
    os.makedirs(root_dir, exist_ok=True)
    trainer = pl.Trainer(default_root_dir=root_dir,
                         callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")],
                         gpus=1 if str(device).startswith("cuda") else 0,
                         max_epochs=200,
                         progress_bar_refresh_rate=0) # 0 because epoch size is 1
    trainer.logger._default_hp_metric = None # Optional logging argument that we don't need

    pl.seed_everything()
    print("Number of Layers = ", num_layers)
    model = NodeLevelGNN(model_name=model_name, c_in=dataset.num_node_features, c_out=dataset.num_classes, **model_kwargs)   
    trainer.fit(model, node_data_loader, node_data_loader)
    model = NodeLevelGNN.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # Test best model on the test set
    test_result = trainer.test(model, test_dataloaders=node_data_loader, verbose=False)
    batch = next(iter(node_data_loader))
    batch = batch.to(model.device)
    _, train_acc = model.forward(batch, mode="train")
    _, val_acc = model.forward(batch, mode="val")
    result = {"train": train_acc,
              "val": val_acc,
              "test": test_result[0]['test_acc']}
    return model, result

In [8]:
# Small function for printing the test scores
def print_results(result_dict):
    if "train" in result_dict:
        print(f"Train accuracy: {(100.0*result_dict['train']):4.2f}%")
    if "val" in result_dict:
        print(f"Val accuracy:   {(100.0*result_dict['val']):4.2f}%")
    print(f"Test accuracy:  {(100.0*result_dict['test']):4.2f}%")

In [9]:
cora_dataset = torch_geometric.datasets.Planetoid(root=DATASET_PATH, name="Cora")

In [25]:
target_model = ["GCN","GAT","GraphConv"]
all_result = []
for selected_model in target_model:
    for num_layers in range(3,6):
        print(selected_model," hiddenlayer = ", num_layers)
        CHECKPOINT_PATH = "../saved_models/anishnarkarsayswhat/layer"+str(num_layers)
        node_gnn_model, node_gnn_result = train_node_classifier(model_name=selected_model,
                                                            layer_name=selected_model,
                                                            dataset=cora_dataset,
                                                            c_hidden = [700,350,175,80,40],
                                                            num_layers=num_layers,
                                                            dp_rate=0.1)
        all_result = all_result + [node_gnn_result]

Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42
Missing logger folder: ..\saved_models\anishnarkarsayswhat\layer3\NodeLevelGAT\lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 1.3 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.015     Total estimated model params size (MB)


GAT  hiddenlayer =  3
Number of Layers =  3


Global seed set to 42
Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42
Missing logger folder: ..\saved_models\anishnarkarsayswhat\layer4\NodeLevelGAT\lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 1.3 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.257     Total estimated model params size (MB)


GAT  hiddenlayer =  4
Number of Layers =  4


Global seed set to 42
Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 1.3 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.311     Total estimated model params size (MB)


GAT  hiddenlayer =  5
Number of Layers =  5


Global seed set to 42
Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42


GraphConv  hiddenlayer =  3
Number of Layers =  3


Missing logger folder: ..\saved_models\anishnarkarsayswhat\layer3\NodeLevelGraphConv\lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 2.5 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
2.5 M     Trainable params
0         Non-trainable params
2.5 M     Total params
10.009    Total estimated model params size (MB)
Global seed set to 42
Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42


GraphConv  hiddenlayer =  4
Number of Layers =  4


Missing logger folder: ..\saved_models\anishnarkarsayswhat\layer4\NodeLevelGraphConv\lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 2.6 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.490    Total estimated model params size (MB)
Global seed set to 42
Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42


GraphConv  hiddenlayer =  5
Number of Layers =  5



  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 2.6 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.597    Total estimated model params size (MB)
Global seed set to 42


In [26]:
all_result

[{'train': tensor(0.9786), 'val': tensor(0.7660), 'test': 0.8050000071525574},
 {'train': tensor(0.9500), 'val': tensor(0.7880), 'test': 0.8059999942779541},
 {'train': tensor(0.9429), 'val': tensor(0.7740), 'test': 0.8040000200271606},
 {'train': tensor(0.1357), 'val': tensor(0.3080), 'test': 0.3190000057220459},
 {'train': tensor(0.1429), 'val': tensor(0.1560), 'test': 0.14399999380111694},
 {'train': tensor(0.1429), 'val': tensor(0.1560), 'test': 0.14399999380111694}]

### Effect of dropout

In [27]:
target_model = ["GCN"]
num_layers = 3
dp_rates=[0.1,0.25,0.40,0.55,0.70]
dp_all_result = []
for selected_model in target_model:
    for dp_rate in dp_rates:
        print(" Dropout = ", dp_rate)
        CHECKPOINT_PATH = "../saved_models/anishnarkarsayswhat/dropout"+str(dp_rate)
        node_gnn_model, node_gnn_result = train_node_classifier(model_name=selected_model,
                                                            layer_name=selected_model,
                                                            dataset=cora_dataset,
                                                            c_hidden = [700,40],
                                                            num_layers=num_layers,
                                                            dp_rate=dp_rate)
        dp_all_result = dp_all_result + [node_gnn_result]

Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42
Missing logger folder: ..\saved_models\anishnarkarsayswhat\dropout0.1\NodeLevelGCN\lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 1.0 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.129     Total estimated model params size (MB)


 Dropout =  0.1
Number of Layers =  3


Global seed set to 42
Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42
Missing logger folder: ..\saved_models\anishnarkarsayswhat\dropout0.25\NodeLevelGCN\lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 1.0 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.129     Total estimated model params size (MB)


 Dropout =  0.25
Number of Layers =  3


Global seed set to 42
Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42
Missing logger folder: ..\saved_models\anishnarkarsayswhat\dropout0.4\NodeLevelGCN\lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 1.0 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.129     Total estimated model params size (MB)


 Dropout =  0.4
Number of Layers =  3


Global seed set to 42
Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42


 Dropout =  0.55
Number of Layers =  3


Missing logger folder: ..\saved_models\anishnarkarsayswhat\dropout0.55\NodeLevelGCN\lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 1.0 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.129     Total estimated model params size (MB)
Global seed set to 42
Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 42
Missing logger folder: ..\saved_models\anishnarkarsayswhat\dropout0.7\NodeLevelGCN\lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModelFlexible | 1.0 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
1.0 M     Trainable params
0  

 Dropout =  0.7
Number of Layers =  3


Global seed set to 42


In [28]:
dp_all_result

[{'train': tensor(0.9714), 'val': tensor(0.7740), 'test': 0.8109999895095825},
 {'train': tensor(0.9857), 'val': tensor(0.7440), 'test': 0.8149999976158142},
 {'train': tensor(0.9786), 'val': tensor(0.7700), 'test': 0.8059999942779541},
 {'train': tensor(0.9929), 'val': tensor(0.7500), 'test': 0.8019999861717224},
 {'train': tensor(0.9786), 'val': tensor(0.7100), 'test': 0.8050000071525574}]