<a href="https://colab.research.google.com/github/batu-el/l65_be301_dc755/blob/main/Notebook3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs

In [1]:
# Base install of the graph libraries; no versions are pinned here.
!pip install dgl torch_geometric torch

# Install required python libraries
import os

# Install PyTorch Geometric and other libraries
# The installs below are skipped when the IS_GRADESCOPE_ENV variable is set.
if 'IS_GRADESCOPE_ENV' not in os.environ:
    print("Installing PyTorch Geometric")
    # The wheel index URLs target torch 2.1.0 + CUDA 12.1.
    # NOTE(review): the unpinned `torch` install above may resolve to a
    # different version than these wheels expect - confirm.
    !pip install -q torch-scatter -f https://data.pyg.org/whl/torch-2.1.0+cu121.html
    !pip install -q torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cu121.html
    !pip install -q torch-geometric
    print("Installing other libraries")
    !pip install networkx
    !pip install lovely-tensors

Installing PyTorch Geometric
Installing other libraries


In [2]:
import os
import sys
import time
import math
import random
import itertools
from datetime import datetime
from typing import Mapping, Tuple, Sequence, List

import pandas as pd
import networkx as nx
import numpy as np
import scipy as sp

from tqdm.notebook import tqdm

import torch
import torch.nn.functional as F
from torch.nn import Embedding, Linear, ReLU, BatchNorm1d, LayerNorm, Module, ModuleList, Sequential
from torch.nn import TransformerEncoder, TransformerEncoderLayer, MultiheadAttention
from torch.optim import Adam

import torch_geometric
from torch_geometric.data import Data, Batch
from torch_geometric.loader import DataLoader
from torch_geometric.datasets import Planetoid

import torch_geometric.transforms as T
from torch_geometric.utils import remove_self_loops, dense_to_sparse, to_dense_batch, to_dense_adj

from torch_geometric.nn import GCNConv, GATConv

from torch_scatter import scatter, scatter_mean, scatter_max, scatter_sum

import lovely_tensors as lt
lt.monkey_patch()

import matplotlib.pyplot as plt
import seaborn as sns

# import warnings
# warnings.filterwarnings("ignore", category=RuntimeWarning)
# warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)

print("All imports succeeded.")
print("Python version {}".format(sys.version))
print("PyTorch version {}".format(torch.__version__))
print("PyG version {}".format(torch_geometric.__version__))

All imports succeeded.
Python version 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
PyTorch version 2.1.0+cu121
PyG version 2.4.0


# Overview

In [3]:
# ## Outline ###

# STEP 1. - Datasets

# 1.1 Synthetic Datasets
# 1.1.1 Homophilic Node Classification
# 1.1.2 Heterophilic Node Classification
# 1.1.3 Homophilic Graph Classification
# 1.1.4 Heterophilic Graph Classification

# 1.2 Real Datasets
# 1.2.1 Homophilic Node Classification - Cora
# 1.2.2 Heterophilic Node Classification - Texas
# 1.2.3 Homophilic Graph Classification - QM9
# 1.2.4 Heterophilic Graph Classification - (?)

# STEP 2. Models

# 2.1 Baselines to Compare Model Accuracies
# 2.1.1 GCN
# 2.1.2 Sparse Transformer
# 2.1.3 MPNN
# 2.1.4 Dense Transformer with Attention Mask
# 2.1.5 Dense Transformer with Positional Encodings

# 2.2 Comparison of 2 Models: Dense (w/ PosEnc) & Sparse Transformer
# 2.2.1 1 Head 1 Layer
# 2.2.2 4 Head 1 Layer
# 2.2.3 1 Head 3 Layer
# 2.2.4 4 Head 3 Layer

# STEP 3. Evaluation

# Comparisons:
# A: Adjacency vs Sparse Attention
# B: Adjacency vs Dense Attention
# C: Sparse Attention vs Dense Attention

# 3.1 Combining Multiple Attention Matrices from 2.2
# 3.1.1 If Edge Exists
# 3.1.2 PCA

# 3.2 1D (Vector) Similarity Comparison
# 3.2.1 Node Degree (histogram)
# 3.2.2 Substructures (histogram)

# 3.3 2D (Matrix) Similarity Comparison
# 3.3.1 Adjacency Matrix (Graph Edit Dist & Kernel 1 WL)
# 3.3.2 Shortest Path (Graph Edit Dist & Kernel 1 WL)

# STEP 4. Discussion
# Note: Future research can look at how attention evolves over the course of training


# Synthetic Dataset Generation

In [3]:
import torch
from torch_geometric.data import Data
import numpy as np

def preprocess(data, train_ratio = 0.7, val_ratio = 0.15, test_ratio = 0.15, seed = None):
    """Convert the first graph of a DGL-style dataset into a PyG ``Data`` object.

    The graph is read from ``data[0]`` (the argument, not a notebook global).
    Its ``ndata['feat']`` / ``ndata['label']`` become ``x`` / ``y``, every edge
    is stored in both directions exactly once, and nodes are randomly assigned
    to disjoint train / val / test boolean masks.

    Args:
        data: Indexable dataset whose item 0 exposes ``ndata['label']``,
            ``ndata['feat']`` and ``edges()`` returning ``(src, dst)`` tensors.
        train_ratio: Fraction of nodes placed in ``train_mask``.
        val_ratio: Fraction of nodes placed in ``val_mask``.
        test_ratio: Kept for interface compatibility only; the test split always
            receives every node not used by the train and validation splits.
        seed: Optional integer seed for the node permutation. ``None`` (default)
            uses torch's global RNG, as before.

    Returns:
        ``torch_geometric.data.Data`` with ``x``, ``edge_index``, ``y``,
        ``train_mask``, ``val_mask`` and ``test_mask``.

    Raises:
        ValueError: If a ratio is negative or train + val exceeds 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    # Use the dataset that was passed in; the previous version silently read
    # the notebook-level `dataset` variable and ignored its argument.
    g = data[0]
    y = g.ndata['label']
    feat = g.ndata['feat']

    num_nodes = len(y)
    generator = None if seed is None else torch.Generator().manual_seed(seed)
    indices = torch.randperm(num_nodes, generator=generator)

    num_train, num_val = int(num_nodes * train_ratio), int(num_nodes * val_ratio)

    def _mask(selected):
        # Boolean node mask that is True exactly at the `selected` positions.
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[selected] = True
        return mask

    train_mask = _mask(indices[:num_train])
    val_mask = _mask(indices[num_train:num_train + num_val])
    test_mask = _mask(indices[num_train + num_val:])

    # Symmetrise the edge list; the set removes duplicates and sorting makes
    # the resulting column order deterministic across runs.
    src, dst = g.edges()
    symmetric_edges = set()
    for u, v in zip(src.tolist(), dst.tolist()):
        symmetric_edges.add((u, v))
        symmetric_edges.add((v, u))
    # reshape(-1, 2) keeps the (2, 0) layout valid even for an edgeless graph.
    edge_index = (
        torch.tensor(sorted(symmetric_edges), dtype=torch.long)
        .reshape(-1, 2)
        .t()
        .contiguous()
    )

    # Copy features / labels so the Data object does not alias the source graph.
    x = torch.as_tensor(feat).detach().cpu()
    if x.dim() == 1:
        # One scalar feature per node: keep its dtype, add a feature axis.
        x = x.unsqueeze(1).clone()
    else:
        x = x.to(torch.float32, copy=True)
    labels = torch.as_tensor(y).detach().cpu().clone()

    return Data(
        x=x,
        edge_index=edge_index,
        y=labels,
        train_mask=train_mask,
        val_mask=val_mask,
        test_mask=test_mask,
    )

# Build the synthetic BA-Community node-classification dataset from DGL.
from dgl.data import BACommunityDataset
# seed=None: NOTE(review) - presumably the generated graph differs between
# runs; confirm against the DGL documentation and pass an integer seed if
# reproducible results are needed.
# force_reload=True asks for the data to be rebuilt instead of read from cache.
dataset = BACommunityDataset(num_base_nodes=160,
                             num_base_edges_per_node=4,
                             num_motifs=80,
                             perturb_ratio=0.01,
                             num_inter_edges=350,
                             seed=None,
                             raw_dir=None,
                             force_reload=True,
                             verbose=True,
                             transform=None)

# Bare expression that is not the last statement of the cell, so its value is
# not displayed.
dataset.num_classes

# Convenience handles on the single graph and its node labels / features.
# preprocess() re-reads these from the dataset itself.
g = dataset[0]
label = g.ndata['label']
feat = g.ndata['feat']

Done saving data into cached files.


In [4]:
# Build the PyG Data object (x, y, symmetric edge_index, split masks).
# The model classes below read `data` for their default in_dim / out_dim.
data = preprocess(dataset)

# Models

In [5]:
# PyG example code: https://github.com/pyg-team/pytorch_geometric/blob/master/examples/gcn2_cora.py

class GNNModel(Module):
    """Message-passing baseline built from residual ``GATConv`` layers.

    Pipeline: ``Linear`` encoder -> ``num_layers`` x (GATConv -> ReLU ->
    dropout -> residual add) -> ``Linear`` classifier -> ``log_softmax``.

    The ``in_dim`` and ``out_dim`` defaults are read from the notebook-level
    ``data`` object when this class statement is executed, so ``data`` must
    exist beforehand and the class must be re-run if the dataset changes.

    Args:
        in_dim: Width of the input node features.
        hidden_dim: Width of the hidden node representation.
        num_heads: Attention heads per ``GATConv`` layer.
        num_layers: Number of ``GATConv`` layers.
        out_dim: Number of output classes.
        dropout: Dropout probability applied after each layer's activation.

    Raises:
        ValueError: If ``hidden_dim`` is not divisible by ``num_heads``.
    """

    def __init__(
            self,
            in_dim: int = data.x.shape[-1],
            hidden_dim: int = 128,
            num_heads: int = 1,
            num_layers: int = 1,
            out_dim: int = len(data.y.unique()),
            dropout: float = 0.5,
        ):
        super().__init__()

        # Each layer is built with hidden_dim // num_heads channels per head.
        # With concatenated heads the layer output is only hidden_dim wide when
        # the division is exact; otherwise the residual add in forward() would
        # fail with an opaque shape error, so reject it up front.
        if hidden_dim % num_heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by num_heads ({num_heads})"
            )

        self.lin_in = Linear(in_dim, hidden_dim)
        self.lin_out = Linear(hidden_dim, out_dim)

        self.layers = ModuleList()
        for _ in range(num_layers):
            self.layers.append(
                # GCNConv(hidden_dim, hidden_dim)
                GATConv(hidden_dim, hidden_dim // num_heads, num_heads)
            )
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return per-node log-probabilities for features ``x`` and ``edge_index``."""
        x = self.lin_in(x)

        for layer in self.layers:
            # conv -> activation ->  dropout -> residual
            x_in = x
            x = layer(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, self.dropout, training=self.training)
            x = x_in + x

        x = self.lin_out(x)

        return x.log_softmax(dim=-1)


class SparseGraphTransformerModel(Module):
    """Transformer whose self-attention is restricted to graph edges.

    Pipeline: ``Linear`` encoder -> ``num_layers`` x (masked multi-head
    self-attention -> ReLU -> dropout -> residual add) -> ``Linear``
    classifier -> ``log_softmax``.

    After every forward pass ``attn_weights_list`` holds one attention-weight
    tensor per layer (heads are not averaged).

    The ``in_dim`` / ``out_dim`` defaults are taken from the notebook-level
    ``data`` object at class-definition time.
    """

    def __init__(
            self,
            in_dim: int = data.x.shape[-1],
            hidden_dim: int = 128,
            num_heads: int = 1,
            num_layers: int = 1,
            out_dim: int = len(data.y.unique()),
            dropout: float = 0.5,
        ):
        super().__init__()

        self.dropout = dropout
        self.lin_in = Linear(in_dim, hidden_dim)
        self.lin_out = Linear(hidden_dim, out_dim)
        self.layers = ModuleList([
            MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])

    def forward(self, x, dense_adj):
        """Return per-node log-probabilities.

        Args:
            x: Node feature matrix.
            dense_adj: Dense adjacency matrix; its boolean negation is used as
                the attention mask, so only node pairs joined by an edge are
                left unmasked.
        """
        hidden = self.lin_in(x)

        # Non-edges are the masked entries; the mask is the same for every layer.
        # NOTE(review): a node without any incident edge would have a fully
        # masked row - presumably producing NaNs; confirm if such nodes occur.
        blocked = ~dense_adj.bool()

        per_layer_weights = []
        self.attn_weights_list = per_layer_weights

        for attention in self.layers:
            attended, weights = attention(
                hidden, hidden, hidden,
                attn_mask=blocked,
                average_attn_weights=False,
            )
            attended = F.dropout(F.relu(attended), self.dropout, training=self.training)
            hidden = hidden + attended  # residual connection
            per_layer_weights.append(weights)

        return self.lin_out(hidden).log_softmax(dim=-1)

class DenseGraphTransformerModel(Module):
    """Fully-connected Transformer with a learnable shortest-path attention bias.

    Pipeline: ``Linear`` encoder -> ``num_layers`` x (multi-head self-attention
    with an additive bias -> ReLU -> dropout -> residual add) -> ``Linear``
    classifier -> ``log_softmax``.

    ``lin_pos_enc`` is constructed but not applied in ``forward``; node
    positional encodings are therefore ignored by this variant.

    After every forward pass ``attn_weights_list`` holds one attention-weight
    tensor per layer (heads are not averaged).

    The ``in_dim`` / ``out_dim`` defaults are taken from the notebook-level
    ``data`` object at class-definition time.
    """

    def __init__(
            self,
            in_dim: int = data.x.shape[-1],
            pos_enc_dim: int = 16,
            hidden_dim: int = 128,
            num_heads: int = 1,
            num_layers: int = 1,
            out_dim: int = len(data.y.unique()),
            dropout: float = 0.5,
        ):
        super().__init__()

        self.dropout = dropout
        self.lin_in = Linear(in_dim, hidden_dim)
        self.lin_pos_enc = Linear(pos_enc_dim, hidden_dim)
        self.lin_out = Linear(hidden_dim, out_dim)
        self.layers = ModuleList([
            MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])

        # Learnable multiplier on the distance bias; starting at 10 sets how
        # strongly attention initially favours nearby nodes.
        self.attn_bias_scale = torch.nn.Parameter(torch.tensor([10.0]))

    def forward(self, x, pos_enc, dense_sp_matrix):
        """Return per-node log-probabilities.

        Args:
            x: Node feature matrix.
            pos_enc: Node positional encodings (accepted but unused here).
            dense_sp_matrix: Pairwise shortest-path distances; NaN / infinite
                entries are treated as disconnected pairs.
        """
        hidden = self.lin_in(x)  # no node positional encoding

        # Bias[i, j] = scale / distance(i, j):
        #   - disconnected pairs are first mapped to distance -1 -> bias of -scale
        #   - the diagonal (distance 0) divides to inf and is reset to 0
        safe_dist = torch.nan_to_num(dense_sp_matrix, nan=-1, posinf=-1, neginf=-1)
        inverse_dist = torch.nan_to_num(1 / safe_dist, nan=0, posinf=0, neginf=0)
        attn_bias = self.attn_bias_scale * inverse_dist

        per_layer_weights = []
        self.attn_weights_list = per_layer_weights

        for attention in self.layers:
            # A float mask is passed, i.e. the bias is supplied as attn_mask.
            attended, weights = attention(
                hidden, hidden, hidden,
                attn_mask=attn_bias,
                average_attn_weights=False,
            )
            attended = F.dropout(F.relu(attended), self.dropout, training=self.training)
            hidden = hidden + attended  # residual connection
            per_layer_weights.append(weights)

        return self.lin_out(hidden).log_softmax(dim=-1)



class DenseGraphTransformerModel_V2(Module):
    """Fully-connected Transformer that uses node positional encodings only.

    Pipeline: (``Linear`` feature encoder + ``Linear`` positional-encoding
    encoder) -> ``num_layers`` x (unmasked multi-head self-attention -> ReLU ->
    dropout -> residual add) -> ``Linear`` classifier -> ``log_softmax``.

    Unlike ``DenseGraphTransformerModel`` no attention bias is applied:
    ``attn_bias_scale`` is created but never read in ``forward``, and
    ``dense_sp_matrix`` is accepted but unused.

    After every forward pass ``attn_weights_list`` holds one attention-weight
    tensor per layer (heads are not averaged).

    The ``in_dim`` / ``out_dim`` defaults are taken from the notebook-level
    ``data`` object at class-definition time.
    """

    def __init__(
            self,
            in_dim: int = data.x.shape[-1],
            pos_enc_dim: int = 16,
            hidden_dim: int = 128,
            num_heads: int = 1,
            num_layers: int = 1,
            out_dim: int = len(data.y.unique()),
            dropout: float = 0.5,
        ):
        super().__init__()

        self.dropout = dropout
        self.lin_in = Linear(in_dim, hidden_dim)
        self.lin_pos_enc = Linear(pos_enc_dim, hidden_dim)
        self.lin_out = Linear(hidden_dim, out_dim)
        self.layers = ModuleList([
            MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])

        # Kept as a registered parameter, but forward() does not use it.
        self.attn_bias_scale = torch.nn.Parameter(torch.tensor([10.0]))

    def forward(self, x, pos_enc, dense_sp_matrix):
        """Return per-node log-probabilities.

        Args:
            x: Node feature matrix.
            pos_enc: Node positional encodings, projected and added to the
                encoded features.
            dense_sp_matrix: Accepted for signature parity; not used.
        """
        hidden = self.lin_in(x) + self.lin_pos_enc(pos_enc)

        per_layer_weights = []
        self.attn_weights_list = per_layer_weights

        for attention in self.layers:
            # No attn_mask: every node may attend to every other node.
            attended, weights = attention(
                hidden, hidden, hidden,
                average_attn_weights=False,
            )
            attended = F.dropout(F.relu(attended), self.dropout, training=self.training)
            hidden = hidden + attended  # residual connection
            per_layer_weights.append(weights)

        return self.lin_out(hidden).log_softmax(dim=-1)

# Train GCN

In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Message-passing baseline: 3 layers, 1 attention head per layer.
model = GNNModel(num_heads=1, num_layers=3).to(device)

# Rebinds the notebook-level `data` to its on-device version.
data = data.to(device)

# AdamW with weight decay; StepLR multiplies the learning rate by 0.5 after
# every 2000 scheduler steps (the loop below steps it once per epoch).
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001,  weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2000, gamma=0.5)


def train():
    """Run one full-graph optimisation step and return the training loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``data``; the loss is
    the negative log-likelihood over the nodes selected by ``data.train_mask``.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data.x, data.edge_index)
    train_nodes = data.train_mask
    nll = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])

    nll.backward()
    optimizer.step()
    return float(nll)


import torch

@torch.no_grad()
def test(splits=('train_mask', 'val_mask', 'test_mask')):
    """Return per-class accuracy of the notebook-level ``model`` on ``data``.

    Args:
        splits: Name (or iterable of names) of the boolean node masks on
            ``data`` to evaluate. Counts are pooled over all listed masks, so
            the default reports accuracy over training, validation and test
            nodes together, as before. Pass e.g. ``('test_mask',)`` to evaluate
            held-out nodes only.

    Returns:
        List with one float per class index ``0 .. data.y.max()``: correctly
        predicted nodes of that class divided by the number of nodes of that
        class in the selected masks. A class with no selected node yields
        ``nan`` (0 / 0).
    """
    if isinstance(splits, str):
        splits = (splits,)

    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=-1)

    # Read the class count once instead of on every loop iteration.
    num_classes = int(data.y.max()) + 1
    class_correct = torch.zeros(num_classes)
    class_total = torch.zeros(num_classes)

    for _, mask in data(*splits):
        mask_pred = pred[mask]
        mask_true = data.y[mask]

        # bincount gives the per-class node counts in one vectorised call.
        class_total += torch.bincount(mask_true, minlength=num_classes).cpu()
        class_correct += torch.bincount(
            mask_true[mask_pred == mask_true], minlength=num_classes
        ).cpu()

    class_accs = class_correct / class_total
    return class_accs.tolist()

# Per-class trackers, one slot per class index. They are only referenced by
# the commented-out bookkeeping inside the loop below.
best_val_acc = [0] * (data.y.max() + 1)
test_acc = [0] * (data.y.max() + 1)
times = []  # wall-clock seconds per epoch

num_epochs = 20000
for epoch in range(1, num_epochs + 1):
    start = time.time()
    loss = train()
    # Report every 1000 epochs and on the final epoch. test() returns
    # per-class accuracies pooled over the train, val and test masks.
    if (epoch % 1000 == 0 or epoch == num_epochs):
        print("Epoch: ", epoch, " class accuracies: ", test()) 

    # NOTE(review): the disabled block below expects test() to return three
    # per-split lists and refers to `test_accs`, which is not defined in this
    # cell - it would need updating before being re-enabled.
    # train_accs, val_accs, tmp_test_acc = test()
    # Update the best validation and test accuracy
    # for i, (val_acc, test_acc) in enumerate(zip(val_accs, test_accs)):
    #     if val_acc > best_val_acc[i]:
    #         best_val_acc[i] = val_acc
    #         test_acc[i] = test_acc

    # print(f'Epoch: {epoch:04d}, Loss: {loss:.4f}')
    # for i, (train_acc, val_acc, tmp_test_acc, best_test_acc) in enumerate(zip(train_accs, val_accs, test_accs, test_acc)):
    #     print(f'Class {i}: Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {tmp_test_acc:.4f}, Best Test: {best_test_acc:.4f}')

    # The recorded time also covers the periodic evaluation above.
    times.append(time.time() - start)
    scheduler.step()

# Print the median time per epoch
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")

Epoch:  1000  class accuracies:  [0.987500011920929, 0.9125000238418579, 0.793749988079071, 0.7875000238418579, 0.96875, 0.856249988079071, 0.824999988079071, 0.6625000238418579]
Epoch:  2000  class accuracies:  [0.987500011920929, 0.90625, 0.793749988079071, 0.7749999761581421, 0.9624999761581421, 0.8500000238418579, 0.831250011920929, 0.625]
Epoch:  3000  class accuracies:  [0.9937499761581421, 0.90625, 0.762499988079071, 0.7875000238418579, 0.9750000238418579, 0.856249988079071, 0.824999988079071, 0.612500011920929]
Epoch:  4000  class accuracies:  [0.9937499761581421, 0.90625, 0.7875000238418579, 0.800000011920929, 0.9624999761581421, 0.8500000238418579, 0.8374999761581421, 0.625]
Epoch:  5000  class accuracies:  [0.987500011920929, 0.8999999761581421, 0.793749988079071, 0.7875000238418579, 0.96875, 0.84375, 0.8500000238418579, 0.625]
Epoch:  6000  class accuracies:  [0.987500011920929, 0.893750011920929, 0.793749988079071, 0.800000011920929, 0.96875, 0.8500000238418579, 0.83125001

In [None]:
[0.981249988079071, 0.862500011920929, 0.762499988079071, 0.6875, 0.9937499761581421, 0.856249988079071, 0.8187500238418579, 0.4625000059604645]


Epoch:  19999  class accuracies:  [0.9937499761581421, 0.831250011920929, 0.862500011920929, 0.6875, 1.0, 0.8374999761581421, 0.8500000238418579, 0.8125]

Epoch:  19999  class accuracies:  [0.987500011920929, 0.862500011920929, 0.84375, 0.800000011920929, 0.987500011920929, 0.862500011920929, 0.8687499761581421, 0.6000000238418579]

# Sparse Transformer Train

In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Edge-masked Transformer: 3 attention layers, 1 head each.
model = SparseGraphTransformerModel(num_heads=1, num_layers=3).to(device)

# Dense adjacency built from edge_index and stored on `data`; [0] takes the
# first entry of the result (assumes a single graph - confirm).
data.dense_adj = to_dense_adj(data.edge_index, max_num_nodes = data.x.shape[0])[0]
data = data.to(device)

# This cell creates no learning-rate scheduler.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001,  weight_decay=1e-4)


def train():
    """Run one full-graph optimisation step and return the training loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``data``; the model is
    fed the dense adjacency matrix and the loss is the negative log-likelihood
    over the nodes selected by ``data.train_mask``.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data.x, data.dense_adj)
    train_nodes = data.train_mask
    nll = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])

    nll.backward()
    optimizer.step()
    return float(nll)


@torch.no_grad()
def test():
    """Return ``[train_acc, val_acc, test_acc]`` for the notebook-level model.

    Each entry is the fraction of nodes in the corresponding mask of ``data``
    whose arg-max prediction equals the label.
    """
    model.eval()
    predicted = model(data.x, data.dense_adj).argmax(dim=-1)
    is_correct = predicted == data.y
    return [
        int(is_correct[split].sum()) / int(split.sum())
        for _, split in data('train_mask', 'val_mask', 'test_mask')
    ]


# Model selection on validation accuracy: `test_acc` ("Final Test" in the log)
# is the test accuracy at the epoch with the best validation accuracy so far.
best_val_acc = test_acc = 0
times = []  # wall-clock seconds per epoch (training step + evaluation)
# range(1, 300) runs epochs 1..299.
for epoch in range(1, 300):
    start = time.time()
    loss = train()
    train_acc, val_acc, tmp_test_acc = test()
    # Strict '>' keeps the earliest epoch when validation accuracy ties.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    print(f'Epoch: {epoch:04d}, Loss: {loss:.4f} Train: {train_acc:.4f}, '
          f'Val: {val_acc:.4f}, Test: {tmp_test_acc:.4f}, '
          f'Final Test: {test_acc:.4f}')
    times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")

Epoch: 0001, Loss: 2.1278 Train: 0.1977, Val: 0.2024, Test: 0.2321, Final Test: 0.2321
Epoch: 0002, Loss: 1.9987 Train: 0.2589, Val: 0.2500, Test: 0.2917, Final Test: 0.2917
Epoch: 0003, Loss: 1.8868 Train: 0.2793, Val: 0.2857, Test: 0.3393, Final Test: 0.3393
Epoch: 0004, Loss: 1.7920 Train: 0.3023, Val: 0.3214, Test: 0.3393, Final Test: 0.3393
Epoch: 0005, Loss: 1.7236 Train: 0.3240, Val: 0.3095, Test: 0.3393, Final Test: 0.3393
Epoch: 0006, Loss: 1.6534 Train: 0.3431, Val: 0.3036, Test: 0.3631, Final Test: 0.3393
Epoch: 0007, Loss: 1.6038 Train: 0.3457, Val: 0.2976, Test: 0.3631, Final Test: 0.3393
Epoch: 0008, Loss: 1.5441 Train: 0.3482, Val: 0.3095, Test: 0.3690, Final Test: 0.3393
Epoch: 0009, Loss: 1.5255 Train: 0.3673, Val: 0.2976, Test: 0.3631, Final Test: 0.3393
Epoch: 0010, Loss: 1.5169 Train: 0.3852, Val: 0.3095, Test: 0.3512, Final Test: 0.3393
Epoch: 0011, Loss: 1.4800 Train: 0.3852, Val: 0.3155, Test: 0.3333, Final Test: 0.3393
Epoch: 0012, Loss: 1.4466 Train: 0.4018, Va

# Dense Transformer Train

In [9]:
def get_shortest_path_matrix(adjacency_matrix):
    """Compute all-pairs shortest-path distances for a dense adjacency matrix.

    The matrix is interpreted as a directed, weighted graph: entry ``[i, j]``
    is the weight of edge ``i -> j`` and a zero means "no edge". The search is
    delegated to SciPy's compiled ``csgraph.shortest_path`` rather than
    NetworkX's Python-level Floyd-Warshall, which produces the same distances
    much faster.

    Args:
        adjacency_matrix: Square 2-D ``torch.Tensor`` (any device).

    Returns:
        CPU ``torch.float32`` tensor of the same shape whose ``[i, j]`` entry
        is the shortest-path length from node ``i`` to node ``j``; the diagonal
        is 0 and unreachable pairs are ``inf``.
    """
    # Imported locally so the function does not depend on how (or whether) the
    # notebook imported the scipy.sparse.csgraph submodule.
    from scipy.sparse.csgraph import shortest_path

    weights = adjacency_matrix.detach().cpu().numpy()
    # directed=True / unweighted=False keep the original semantics: edges are
    # followed row -> column and entry values act as edge weights.
    distances = shortest_path(weights, method='auto', directed=True, unweighted=False)
    # Convert numpy array back to torch tensor
    return torch.tensor(distances).float()

# Dense adjacency of the graph, then its all-pairs shortest-path matrix.
dense_adj = to_dense_adj(data.edge_index, max_num_nodes = data.x.shape[0])[0]
dense_shortest_path_matrix = get_shortest_path_matrix(dense_adj)  # takes about 1-2 mins

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Distance-biased dense Transformer: 3 attention layers, 1 head each.
model = DenseGraphTransformerModel(num_heads=1, num_layers=3).to(device)

# Attach 16 Laplacian-eigenvector positional encodings as `data.pos_enc`
# (k = 16 matches the model's default pos_enc_dim).
data = T.AddLaplacianEigenvectorPE(k = 16, attr_name = 'pos_enc')(data)
# data = T.AddRandomWalkPE(walk_length = 16, attr_name = 'pos_enc')(data)
data.dense_adj = to_dense_adj(data.edge_index, max_num_nodes = data.x.shape[0])[0]
# Computed at the top of this cell, not in an earlier one.
data.dense_sp_matrix = dense_shortest_path_matrix.float()  # pre-computed in previous cell
data = data.to(device)

# This cell creates no learning-rate scheduler.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001,  weight_decay=1e-4)


def train():
    """Run one full-graph optimisation step and return the training loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``data``; the model is
    fed node features, positional encodings and the shortest-path matrix, and
    the loss is the negative log-likelihood over ``data.train_mask`` nodes.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data.x, data.pos_enc, data.dense_sp_matrix)
    train_nodes = data.train_mask
    nll = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])

    nll.backward()
    optimizer.step()
    return float(nll)


@torch.no_grad()
def test():
    """Return ``[train_acc, val_acc, test_acc]`` for the notebook-level model.

    Each entry is the fraction of nodes in the corresponding mask of ``data``
    whose arg-max prediction equals the label.
    """
    model.eval()
    predicted = model(data.x, data.pos_enc, data.dense_sp_matrix).argmax(dim=-1)
    is_correct = predicted == data.y
    return [
        int(is_correct[split].sum()) / int(split.sum())
        for _, split in data('train_mask', 'val_mask', 'test_mask')
    ]


# Model selection on validation accuracy: `test_acc` ("Final Test" in the log)
# is the test accuracy at the epoch with the best validation accuracy so far.
best_val_acc = test_acc = 0
times = []  # wall-clock seconds per epoch (training step + evaluation)
# range(1, 100) runs epochs 1..99.
for epoch in range(1, 100):
    start = time.time()
    loss = train()
    train_acc, val_acc, tmp_test_acc = test()
    # Strict '>' keeps the earliest epoch when validation accuracy ties.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    print(f'Epoch: {epoch:04d}, Loss: {loss:.4f} Train: {train_acc:.4f}, '
          f'Val: {val_acc:.4f}, Test: {tmp_test_acc:.4f}, '
          f'Final Test: {test_acc:.4f}')
    times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")

# Notes
# NOTE(review): these notes mention Cora, but this notebook trains on the
# synthetic BACommunityDataset - confirm that they still apply.
# - Dense Transformer needs to be trained for a bit longer to reach low loss value
# - Node positional encodings are not particularly useful
# - Edge distance encodings are very useful
# - Since Cora is highly homophilic, it is important to bias the attention towards nearby nodes

Epoch: 0001, Loss: 2.1736 Train: 0.1671, Val: 0.1310, Test: 0.1488, Final Test: 0.1488
Epoch: 0002, Loss: 2.0509 Train: 0.2398, Val: 0.1726, Test: 0.1726, Final Test: 0.1726
Epoch: 0003, Loss: 1.9389 Train: 0.2730, Val: 0.1726, Test: 0.1964, Final Test: 0.1726
Epoch: 0004, Loss: 1.8611 Train: 0.2997, Val: 0.2083, Test: 0.2381, Final Test: 0.2381
Epoch: 0005, Loss: 1.7776 Train: 0.3291, Val: 0.2143, Test: 0.2738, Final Test: 0.2738
Epoch: 0006, Loss: 1.6961 Train: 0.3495, Val: 0.2202, Test: 0.3095, Final Test: 0.3095
Epoch: 0007, Loss: 1.6288 Train: 0.3571, Val: 0.2500, Test: 0.2917, Final Test: 0.2917
Epoch: 0008, Loss: 1.5785 Train: 0.3750, Val: 0.2440, Test: 0.3333, Final Test: 0.2917
Epoch: 0009, Loss: 1.5491 Train: 0.3890, Val: 0.2738, Test: 0.3512, Final Test: 0.3512
Epoch: 0010, Loss: 1.5214 Train: 0.3839, Val: 0.2976, Test: 0.3214, Final Test: 0.3214
Epoch: 0011, Loss: 1.5161 Train: 0.3827, Val: 0.2798, Test: 0.3333, Final Test: 0.3214
Epoch: 0012, Loss: 1.4741 Train: 0.3763, Va

# Dense Transformer v2 Train

In [10]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Positional-encoding-only dense Transformer: 3 attention layers, 1 head each.
model = DenseGraphTransformerModel_V2(num_heads=1, num_layers=3).to(device)

# Attach 16 Laplacian-eigenvector positional encodings as `data.pos_enc`
# (k = 16 matches the model's default pos_enc_dim).
data = T.AddLaplacianEigenvectorPE(k = 16, attr_name = 'pos_enc')(data)
# data = T.AddRandomWalkPE(walk_length = 16, attr_name = 'pos_enc')(data)
data.dense_adj = to_dense_adj(data.edge_index, max_num_nodes = data.x.shape[0])[0]
# Requires `dense_shortest_path_matrix` from the "Dense Transformer Train"
# cell, so that cell must have been run first.
data.dense_sp_matrix = dense_shortest_path_matrix.float()  # pre-computed in previous cell
data = data.to(device)

# This cell creates no learning-rate scheduler.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001,  weight_decay=1e-4)


def train():
    """Run one full-graph optimisation step and return the training loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``data``; the model is
    fed node features, positional encodings and the shortest-path matrix, and
    the loss is the negative log-likelihood over ``data.train_mask`` nodes.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data.x, data.pos_enc, data.dense_sp_matrix)
    train_nodes = data.train_mask
    nll = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])

    nll.backward()
    optimizer.step()
    return float(nll)


@torch.no_grad()
def test():
    """Return ``[train_acc, val_acc, test_acc]`` for the notebook-level model.

    Each entry is the fraction of nodes in the corresponding mask of ``data``
    whose arg-max prediction equals the label.
    """
    model.eval()
    predicted = model(data.x, data.pos_enc, data.dense_sp_matrix).argmax(dim=-1)
    is_correct = predicted == data.y
    return [
        int(is_correct[split].sum()) / int(split.sum())
        for _, split in data('train_mask', 'val_mask', 'test_mask')
    ]


# Model selection on validation accuracy: `test_acc` ("Final Test" in the log)
# is the test accuracy at the epoch with the best validation accuracy so far.
best_val_acc = test_acc = 0
times = []  # wall-clock seconds per epoch (training step + evaluation)
# range(1, 100) runs epochs 1..99.
for epoch in range(1, 100):
    start = time.time()
    loss = train()
    train_acc, val_acc, tmp_test_acc = test()
    # Strict '>' keeps the earliest epoch when validation accuracy ties.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    print(f'Epoch: {epoch:04d}, Loss: {loss:.4f} Train: {train_acc:.4f}, '
          f'Val: {val_acc:.4f}, Test: {tmp_test_acc:.4f}, '
          f'Final Test: {test_acc:.4f}')
    times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")

# Notes
# NOTE(review): these notes are identical to the ones in the previous training
# cell. The V2 model ignores the shortest-path matrix and the notebook trains
# on BACommunityDataset rather than Cora - confirm which remarks apply here.
# - Dense Transformer needs to be trained for a bit longer to reach low loss value
# - Node positional encodings are not particularly useful
# - Edge distance encodings are very useful
# - Since Cora is highly homophilic, it is important to bias the attention towards nearby nodes

Epoch: 0001, Loss: 2.0431 Train: 0.2500, Val: 0.2143, Test: 0.2917, Final Test: 0.2917
Epoch: 0002, Loss: 1.9753 Train: 0.2730, Val: 0.2381, Test: 0.3095, Final Test: 0.3095
Epoch: 0003, Loss: 1.9235 Train: 0.2768, Val: 0.2381, Test: 0.3155, Final Test: 0.3095
Epoch: 0004, Loss: 1.8631 Train: 0.2857, Val: 0.2381, Test: 0.3393, Final Test: 0.3095
Epoch: 0005, Loss: 1.7991 Train: 0.2755, Val: 0.2440, Test: 0.3333, Final Test: 0.3333
Epoch: 0006, Loss: 1.7316 Train: 0.2704, Val: 0.2321, Test: 0.3214, Final Test: 0.3333
Epoch: 0007, Loss: 1.6701 Train: 0.2883, Val: 0.2500, Test: 0.3036, Final Test: 0.3036
Epoch: 0008, Loss: 1.6010 Train: 0.2946, Val: 0.2262, Test: 0.2976, Final Test: 0.3036
Epoch: 0009, Loss: 1.5371 Train: 0.2895, Val: 0.2262, Test: 0.2917, Final Test: 0.3036
Epoch: 0010, Loss: 1.5012 Train: 0.3048, Val: 0.2083, Test: 0.3274, Final Test: 0.3036
Epoch: 0011, Loss: 1.4489 Train: 0.3061, Val: 0.2619, Test: 0.2798, Final Test: 0.2798
Epoch: 0012, Loss: 1.4781 Train: 0.3048, Va