In [1]:
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.loader import DataLoader, NeighborLoader
from torch.utils.data import random_split

In [2]:
from model import SimpleGAT, BenchmarkGAT
from train import train_classifier

# What is the SC-GAT layer?

The usual form of a GAT layer is defined by the following equations:

$$
e_{ij} = a(\mathbf{W}_1\mathbf{h}_i || \mathbf{W}_2\mathbf{h}_j),
$$

where the $\mathbf{h}_i$ are the node feature vectors and $\mathbf{W}_1$, $\mathbf{W}_2$ are weight matrices. In the original implementation $\mathbf{W}_1 = \mathbf{W}_2 = \mathbf{W}$.
We then calculate the attention weights with

$$
\alpha_{ij} = \mathrm{softmax}_j(e_{ij})
$$

We call $a$ the attention mechanism; it is usually just a plain single-layer neural network.

From the first equation we have two possible directions. One is "simplification", which leads to GATv2Conv, where we don't concatenate; we only calculate $\mathbf{x}_i+\mathbf{x}_j$.
This is already close to the next idea: we could include $\mathbf{x}_i-\mathbf{x}_j$ in our attention mechanism. As mentioned in the GAT paper, the general GAT model loses every sense of structure, and every node attends over every node. So we are looking for a way to retain local and global structure, just like EdgeConv does, but with the power of attention.

So the idea is to change the first equation to this:

$$
e_{ij} = a(\mathbf{W}[\mathbf{h}_i^T||\mathbf{h}_j^T||\mathbf{h}_i^T-\mathbf{h}_j^T]^T)
$$
Here $||$ denotes concatenation.


# Performance on Planetoid datasets

In [3]:
# Load the PubMed citation graph from the Planetoid benchmark collection.
# NOTE(review): per the PyG docs `num_val` is only used by split="random";
# with split="full" it looks like a no-op — confirm before relying on it.
train_dataset = Planetoid(
    name="PubMed",
    root="Planetoid/PubMed/",
    split="full",
    num_val=500,
)

In [4]:
# Neighbour-sampling loaders around the train / validation seed nodes.
# The dataset is indexed once so both loaders share the same graph object.
graph = train_dataset[0]

train_loader = NeighborLoader(
    graph,
    input_nodes=graph.train_mask,
    num_neighbors=[50, 50, 50],  # up to 50 neighbours per node, 3 hops
    shuffle=True,
    batch_size=64,
)

# NOTE(review): validation samples 2 hops while training samples 3 — confirm
# this matches the depth of the models being evaluated.
val_loader = NeighborLoader(
    graph,
    input_nodes=graph.val_mask,
    num_neighbors=[50, 50],
    shuffle=False,  # seed-node order is irrelevant for evaluation
    batch_size=20,
)

# Keyed by phase; this dict is what gets passed to train_classifier.
loaders = {"train": train_loader, "val": val_loader}

In [8]:
# Build the SimpleGAT model. Layer sizes are read from the dataset instead of
# being repeated as literals (PubMed: 500 input features, 3 classes), and the
# model falls back to the CPU when no CUDA device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1 = SimpleGAT(
    in_channels=train_dataset.num_features,
    out_channels=train_dataset.num_classes,
    heads=4,
    hidden_size=100,
).to(device)

In [9]:
#test pass trough model
d = next(iter(train_loader))
d

Data(x=[10754, 500], edge_index=[2, 35189], y=[10754], train_mask=[10754], val_mask=[10754], test_mask=[10754], n_id=[10754], e_id=[35189], num_sampled_nodes=[4], num_sampled_edges=[3], input_id=[64], batch_size=64)

In [11]:
# Forward pass on the sampled batch. The batch is moved to whatever device the
# model's parameters live on, rather than assuming CUDA, then the output shape
# is displayed.
model_device = next(model1.parameters()).device
o = model1(d.to(model_device))
o.shape

torch.Size([10754, 3])

In [12]:
# Multi-class classification loss; "mean" is the library default reduction,
# spelled out here for the reader.
loss_fn = torch.nn.CrossEntropyLoss(reduction="mean")

In [13]:
# Adadelta optimiser over model1's parameters with a learning rate of 0.01.
optim = torch.optim.Adadelta(params=model1.parameters(), lr=0.01)

In [11]:
# Train the SimpleGAT instance (`model1`); no variable named `model` is defined
# in this notebook, so that name would raise NameError on a fresh kernel.
# The positional 100 is presumably the epoch count (the log runs to epoch 95).
train_classifier(model1, optim, loaders, loss_fn, 100, print_every=10, test_every=5)

Epoch 0, Train Loss: 1.08
Epoch 0, Accuracy: 45.36%
Epoch 5, Accuracy: 47.91%
Epoch 10, Train Loss: 0.99
Epoch 10, Accuracy: 67.09%
Epoch 15, Accuracy: 71.46%
Epoch 20, Train Loss: 0.92
Epoch 20, Accuracy: 69.36%
Epoch 25, Accuracy: 69.85%
Epoch 30, Train Loss: 0.84
Epoch 30, Accuracy: 69.82%
Epoch 35, Accuracy: 71.38%
Epoch 40, Train Loss: 0.76
Epoch 40, Accuracy: 72.07%
Epoch 45, Accuracy: 73.35%
Epoch 50, Train Loss: 0.7
Epoch 50, Accuracy: 74.27%
Epoch 55, Accuracy: 74.7%
Epoch 60, Train Loss: 0.65
Epoch 60, Accuracy: 75.48%
Epoch 65, Accuracy: 76.7%
Epoch 70, Train Loss: 0.61
Epoch 70, Accuracy: 77.57%
Epoch 75, Accuracy: 78.17%
Epoch 80, Train Loss: 0.57
Epoch 80, Accuracy: 79.1%
Epoch 85, Accuracy: 79.75%
Epoch 90, Train Loss: 0.55
Epoch 90, Accuracy: 81.11%
Epoch 95, Accuracy: 81.3%


In [14]:
# Restore weights for `model1` from the "scgat.pt" checkpoint, mapping the
# stored tensors onto the device the model already lives on.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints
# (consider weights_only=True where the installed torch version supports it).
model1.load_state_dict(
    torch.load("scgat.pt", map_location=next(model1.parameters()).device)
)

<All keys matched successfully>

## Compare to baseline GAT model

In [15]:
# Baseline GAT with the same heads / hidden size as the SimpleGAT model.
# Layer sizes are read from the dataset (PubMed: 500 features, 3 classes) and
# the model falls back to the CPU when no CUDA device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model2 = BenchmarkGAT(
    in_channels=train_dataset.num_features,
    out_channels=train_dataset.num_classes,
    heads=4,
    hidden_size=100,
).to(device)

In [20]:
# Adadelta optimiser for the baseline, same learning rate (0.01) as before.
# Note: this rebinds `optim`, so the optimiser built for model1 is no longer
# reachable under this name.
optim = torch.optim.Adadelta(params=model2.parameters(), lr=0.01)

In [None]:
# Train the baseline with the same loaders, loss function, positional 100 and
# test_every=5 as the SimpleGAT run; only the logging interval differs
# (print_every=1 here versus 10 there).
train_classifier(model2, optim, loaders, loss_fn, 100, print_every=1, test_every=5)

Epoch 0, Train Loss: 1.09
Epoch 0, Accuracy: 46.34%
