In [1]:
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.loader import DataLoader, NeighborLoader
from torch.utils.data import random_split

In [None]:
from model import SimpleGAT, BenchmarkGAT
from train import train_classifier

# What is the SC-GAT layer?

The usual form of a GAT layer is defined by the following equations:

$$
e_{ij} = a(\mathbf{W}_1\mathbf{h}_i \,\|\, \mathbf{W}_2\mathbf{h}_j),
$$

where the $\mathbf{h}_i$ are the node feature vectors, $\mathbf{W}_1$ and $\mathbf{W}_2$ are weight matrices, and $\|$ denotes concatenation. In the original implementation $\mathbf{W}_1 = \mathbf{W}_2 = \mathbf{W}$.
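We then calculate the attention weights by normalising the scores over the neighbourhood $\mathcal{N}(i)$ of node $i$:

$$
\alpha_{ij} = \mathrm{softmax}_j(e_{ij}) = \frac{\exp(e_{ij})}{\sum_{k \in \mathcal{N}(i)} \exp(e_{ik})}
$$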

We call $a$ the attention mechanism; it is usually just a plain single-layer neural network.

Starting from the first equation there are two possible directions. The first is "simplification", which leads to GATv2Conv: there we do not concatenate, we only compute the sum of the transformed features, $\mathbf{W}_1\mathbf{h}_i+\mathbf{W}_2\mathbf{h}_j$.
This is already close to the next idea: we could also include the difference $\mathbf{h}_i-\mathbf{h}_j$ in our attention mechanism. As mentioned in the GAT paper, the most general form of the GAT model drops all structural information and lets every node attend over every other node. So we are looking for a way to retain local and global structure, just as EdgeConv does, but with the power of attention.

So the idea is to change the first equation to this:

$$
e_{ij} = a(\mathbf{W}[\mathbf{h}_i^T||\mathbf{h}_j^T||\mathbf{h}_i^T-\mathbf{h}_j^T]^T)
$$
Here $||$ denotes concatenation.


# Performance on Planetoid datasets

In [4]:
# Load the PubMed citation graph (downloaded to the given root on first use).
# split="full" trains on every node outside the validation/test sets; num_val
# is only consulted for split="random" (and 500 is its default anyway), so it
# is not passed here.
train_dataset = Planetoid(root="Planetoid/PubMed/", name="PubMed", split="full")

In [5]:
# The dataset is indexed once and the resulting graph object is shared by both
# loaders instead of re-indexing the dataset for every argument.
graph = train_dataset[0]

# Training mini-batches are seeded from the nodes selected by train_mask;
# neighbours are sampled over three hops with a fan-out of 50 per hop.
train_loader = NeighborLoader(
    graph,
    input_nodes=graph.train_mask,
    num_neighbors=[50, 50, 50],
    shuffle=True,
    batch_size=64,
)

# Validation mini-batches are seeded from the nodes selected by val_mask.
# Shuffling brings nothing to evaluation, so the seed order is kept fixed.
# NOTE(review): only two hops are sampled here versus three for training —
# confirm this matches the number of message-passing layers in the model.
val_loader = NeighborLoader(
    graph,
    input_nodes=graph.val_mask,
    num_neighbors=[50, 50],
    shuffle=False,
    batch_size=20,
)

# Loader dictionary handed to train_classifier.
loaders = {"train": train_loader, "val": val_loader}

In [6]:
# SimpleGAT classifier. Input and output sizes are read from the dataset
# (500 features and 3 classes for PubMed) instead of being hard-coded, and the
# model falls back to the CPU when no CUDA device is available.
model = SimpleGAT(
    in_channels=train_dataset.num_features,
    out_channels=train_dataset.num_classes,
    heads=4,
    hidden_size=100,
).to("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# Test pass through the model: draw one sampled mini-batch from the training
# loader and display its summary.
d = next(iter(train_loader))
d

Data(x=[7581, 500], edge_index=[2, 20504], y=[7581], train_mask=[7581], val_mask=[7581], test_mask=[7581], n_id=[7581], e_id=[20504], num_sampled_nodes=[4], num_sampled_edges=[3], input_id=[64], batch_size=64)

In [8]:
# Smoke-test the forward pass on the sampled batch. The batch is moved to
# whichever device the model's parameters live on (rather than a hard-coded
# "cuda"), and gradients are disabled because only the output shape is checked.
with torch.no_grad():
    o = model(d.to(next(model.parameters()).device))
o.shape

torch.Size([7581, 3])

In [9]:
# Cross-entropy criterion; passed to train_classifier for both training runs.
loss_fn = torch.nn.CrossEntropyLoss()

In [10]:
# Adadelta optimiser over all parameters of the current model, with lr=1e-2.
optim = torch.optim.Adadelta(model.parameters(), lr=1e-2)

In [None]:
# Train and evaluate the model with the loaders and loss defined earlier. In
# the recorded output the train loss appears every 10 epochs (print_every) and
# the accuracy every 5 epochs (test_every).
# NOTE(review): the positional 100 is presumably the number of epochs —
# confirm against train.train_classifier.
train_classifier(model, optim, loaders, loss_fn, 100, print_every=10, test_every=5)

Epoch 0, Train Loss: 1.08
Epoch 0, Accuracy: 45.36%
Epoch 5, Accuracy: 47.91%
Epoch 10, Train Loss: 0.99
Epoch 10, Accuracy: 67.09%
Epoch 15, Accuracy: 71.46%


## Compare to baseline GAT model

In [None]:
# Baseline GAT with the same head count and hidden size as the SimpleGAT run.
# NOTE: this rebinds `model`, so the previously trained model is no longer
# reachable under that name. Input and output sizes are read from the dataset
# (500 features and 3 classes for PubMed) instead of being hard-coded, and the
# model falls back to the CPU when no CUDA device is available.
model = BenchmarkGAT(
    in_channels=train_dataset.num_features,
    out_channels=train_dataset.num_classes,
    heads=4,
    hidden_size=100,
).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Use the same optimiser and learning rate as the SimpleGAT run (Adadelta,
# lr=1e-2) so the comparison isolates the architecture rather than the
# optimiser. NOTE(review): if Adam is preferred, switch BOTH runs to Adam and
# re-record the SimpleGAT results.
optim = torch.optim.Adadelta(model.parameters(), lr=1e-2)

In [None]:
# Same training call as for the SimpleGAT model, now applied to the baseline
# model and its optimiser (same loaders, loss, and reporting intervals).
# NOTE(review): the positional 100 is presumably the number of epochs —
# confirm against train.train_classifier.
train_classifier(model, optim, loaders, loss_fn, 100, print_every=10, test_every=5)