# Graph neural networks



In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.data.separate import separate
from torch_geometric.datasets import Planetoid, ZINC
from torch_geometric.loader import DataLoader, LinkNeighborLoader
from torch_geometric.utils import scatter
from torch_geometric.nn import GCNConv, GATConv, TransformerConv
from torch_geometric.utils import to_dense_adj

import plotly.express as px

## Data for node prediction

A now classic dataset for GNNs (one whose use is also discouraged within some circles as it is very easy to overfit on), Cora is a nice small dataset to start looking at GNNs. There are many variations of the Cora dataset originally presented in "Automating the Construction of Internet Portals with Machine Learning" by McCallum et al. (https://link.springer.com/article/10.1023/A:1009953814988).

We will use the Cora dataset variant as presented in “FastGCN: Fast Learning with Graph Convolutional Networks via Importance Sampling” (https://arxiv.org/abs/1801.10247). It describes a citation network of 2708 papers, and our task is to classify each paper into one of 7 different categories.

In [None]:
# Load dataset:
data_dir = "./cora_data_full"
try:
    os.mkdir(data_dir)
except FileExistsError:
    print(data_dir, 'exists')

# Planetoid stores the Cora files under data_dir.
dataset = Planetoid(data_dir, 'Cora', split="full")

# The following cells read `data.x`, `data.y`, the train/val/test masks and
# `data.edge_index`, so bind the dataset's first (index 0) graph to `data`.
data = dataset[0]

Split into training, validation and test nodes.

In [None]:
# Train / validation / test split: for each subset, select the node features
# (`data.x`) and labels (`data.y`) picked out by the matching mask on `data`.
train_x, train_y = data.x[data.train_mask], data.y[data.train_mask]
valid_x, valid_y = data.x[data.val_mask], data.y[data.val_mask]
test_x, test_y = data.x[data.test_mask], data.y[data.test_mask]

Describe the training data. 

## Baseline: feed-forward neural network

In [None]:
class FFNN(nn.Module):
    """Baseline classifier: one linear map followed by a log-softmax.

    There are no hidden layers, so every input row is classified from its
    own features only.

    Args:
        input_dim (int): Dimensionality of the input feature vectors
        output_dim (int): Dimensionality of the output softmax distribution
    """
    def __init__(self, input_dim, output_dim):
        super(FFNN, self).__init__()
        self.layer_1 = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return log-probabilities over dim 1 for the input ``x``."""
        logits = self.layer_1(x)
        log_probs = F.log_softmax(logits, dim=1)
        return log_probs

In [None]:
n_epochs = 100
model = FFNN(input_dim=train_x.shape[-1], output_dim=7)
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)

# Validation accuracy (in %) recorded after every epoch, keyed by epoch index.
val_accuracy = {key: 0 for key in range(n_epochs)}

for epoch in range(n_epochs):
    # One full-batch optimisation step over all training nodes.
    model.train()
    optimiser.zero_grad()
    log_probs = model(train_x)
    # The model already returns log-probabilities (log_softmax), so the
    # matching loss is NLL; cross_entropy would apply log_softmax a second time.
    loss = F.nll_loss(log_probs, train_y)
    loss.backward()
    optimiser.step()

    # Evaluate on the validation nodes; no gradients are needed here, so skip
    # building the autograd graph.
    model.eval()
    with torch.no_grad():
        predictions = model(valid_x).argmax(dim=1)
        num_correct = (predictions == valid_y).sum().item()
    val_accuracy[epoch] = 100.0 * num_correct / len(valid_y)

In [None]:
# Scatter plot of the recorded validation accuracy against the epoch index.
fig = px.scatter(
    x=range(n_epochs),
    y=np.array(list(val_accuracy.values())),
).update_layout(
    template="plotly_white",
    xaxis_title="epoch",
    yaxis_title="validation accuracy (in %)",
)

fig.show()

## Do it yourself graph neural network

The generic equation for a convolutional GNN is
$$
\mathbf{h_i} = \phi \big(\mathbf{x_i}, \oplus_{j \in \mathcal{N}_i} c_{i,j} \psi (\mathbf{x_j}) \big).
$$
Rewrite this in terms of adjacency matrix $\mathbf{A}$ and implement it.

In [None]:
class GCLayer(nn.Module):
    """Graph convolution layer (exercise skeleton).

    Stores the adjacency matrix and a linear transform; combining the two in
    ``forward`` is left as the exercise.

    Args:
        input_dim (int): Dimensionality of the input feature vectors
        output_dim (int): Dimensionality of the output feature vectors
        A (torch.Tensor): 2-D adjacency matrix
    """
    def __init__(self, input_dim, output_dim, A):
        # The class is called GCLayer; the previous super(GCNLayer, self)
        # call raised a NameError on construction.
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.A = A

        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Placeholder forward pass: currently returns ``x`` unchanged."""
        ### TODO: implement the graph convolution using self.A and self.linear

        return x


In [None]:
class GNN(nn.Module):
    """Single-layer graph neural network.

    Wraps one ``GCLayer`` and turns its output into log-probabilities.

    Args:
        input_dim (int): Dimensionality of the input feature vectors
        output_dim (int): Dimensionality of the output softmax distribution
        A (torch.Tensor): 2-D adjacency matrix, passed on to the graph layer
    """
    def __init__(self, input_dim, output_dim, A):
        super(GNN, self).__init__()
        self.input_dim, self.output_dim = input_dim, output_dim
        self.layer = GCLayer(input_dim, output_dim, A)

    def forward(self, x):
        """Apply the graph layer, then a log-softmax over dim 1."""
        hidden = self.layer(x)
        return F.log_softmax(hidden, dim=1)

In [None]:
# Dense adjacency matrix built from the edge list; to_dense_adj returns a
# batched tensor, so drop the leading batch dimension.
adj = to_dense_adj(data.edge_index).squeeze(0)

Train your graph neural network and plot the validation accuracies. 

## Do it yourself graph neural network - normalization

The generic equation for a convolutional GNN is
$$
\mathbf{h_i} = \phi \big(\mathbf{x_i}, \oplus_{j \in \mathcal{N}_i} c_{i,j} \psi (\mathbf{x_j}) \big).
$$

Specific implementations often differ slightly from this form. For example Kipf and Welling's "Semi-Supervised Classification with Graph Convolutional Networks" (https://arxiv.org/abs/1609.02907) employs a symmetric normalisation for the convolution coefficients with a re-normalisation to tackle exploding parameters. Here

$$
\mathbf{H} = \sigma \big( \mathbf{\tilde{D}}^{-\frac{1}{2}} \mathbf{\tilde{A}} \mathbf{\tilde{D}}^{-\frac{1}{2}} \mathbf{X} \mathbf{W} \big),
$$

where $\mathbf{\tilde{A}} = \mathbf{A} + \mathbf{I}$ and $\mathbf{\tilde{D}}$ is the degree matrix of $\mathbf{\tilde{A}}$.

Is the behaviour of the generic, convolutional GNN layer the same as Kipf and Welling's GCN? Implement this version.  How does the performance compare to the generic version above?

## Standard graph neural network layers

Have a look at some standard implementations of graph neural network layers, for example `GCNConv`, `GATConv` and `TransformerConv`. 

Note that these do not use the adjacency matrix, but the edge list (`data.edge_index`). 

Implement the models and show their validation accuracies during training. 

## Data for graph-level prediction

For this, we will use the ZINC dataset for graph-regression. It contains about 12 000 molecular graphs with up to 38 nodes each and the task is to predict for each molecule the solubility (a scalar number).

In [None]:
# Load the ZINC subset, one dataset object per split.
data_dir = "./zinc_data"
try:
    os.mkdir(data_dir)
except FileExistsError:
    print(data_dir, 'exists')

# The splits are built in train, val, test order, all from the same root.
train_zinc_dataset, val_zinc_dataset, test_zinc_dataset = (
    ZINC(root=data_dir, split=split_name, subset=True)
    for split_name in ('train', 'val', 'test')
)

Describe this dataset. 

### GNN for graph-level prediction

Write a GNN for graph-level prediction and use it to predict the molecule solubility. 

Note: Using batch-sizes larger than `1` requires that you aggregate over the individual graphs, not the entire batch. To do this, I have used the following `extract`-function. 

In [None]:
def extract(x, idx, slices):
    """Sum the rows of ``x`` that belong to one graph of a batch.

    Args:
        x: data, with the rows of all graphs stacked along dim 0
        idx: index of the graph in the batch
        slices: indices where graphs start and end, e.g. batch.ptr

    Returns:
        The sum over the rows ``slices[idx]`` up to (excluding)
        ``slices[idx + 1]``, reshaped to a single row of shape ``(1, -1)``.
    """
    first_row = int(slices[idx])
    num_rows = int(slices[idx + 1]) - first_row
    graph_rows = torch.narrow(x, 0, first_row, num_rows)
    pooled = torch.sum(graph_rows, dim=0)
    return pooled.reshape(1, -1)