## Построение GNN для классификации графов с использованием PyG

In [1]:
import torch
from torch_geometric.datasets import TUDataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# A few helper functions for printing summaries of datasets and individual graphs
def describe_dataset(dataset):
    """Print a short summary of ``dataset`` to stdout.

    The summary consists of the dataset's string form, a separator line,
    the number of graphs (``len(dataset)``), ``dataset.num_features`` and
    ``dataset.num_classes``, one item per line.

    Args:
        dataset: object supporting ``len()`` and exposing ``num_features``
            and ``num_classes`` attributes.
    """
    summary_lines = (
        f'Dataset: {dataset}:',
        '======================',
        f'Number of graphs: {len(dataset)}',
        f'Number of features: {dataset.num_features}',
        f'Number of classes: {dataset.num_classes}',
    )
    for line in summary_lines:
        print(line)

def describe_graph(g):
    """Print basic statistics about a single graph to stdout.

    Prints the graph's string form, a separator line, the node and edge
    counts, the edges-per-node ratio, and the results of
    ``has_isolated_nodes()``, ``has_self_loops()`` and ``is_undirected()``.
    When ``g`` has a ``train_mask`` attribute, the number of training nodes
    and the training label rate are printed as well.

    A graph with no nodes is reported with a ratio (and label rate) of 0.00
    instead of raising ``ZeroDivisionError``.

    Args:
        g: graph object exposing ``num_nodes``, ``num_edges``,
            ``has_isolated_nodes()``, ``has_self_loops()``,
            ``is_undirected()`` and, optionally, ``train_mask``.
    """
    print(g)
    print('==============================================================')

    n_nodes = g.num_nodes
    n_edges = g.num_edges
    # The per-node ratios are undefined for an empty graph; report 0 rather
    # than dividing by zero.
    avg_degree = n_edges / n_nodes if n_nodes else 0.0

    # Gather some statistics about the graph.
    print(f'Number of nodes: {n_nodes}')
    print(f'Number of edges: {n_edges}')
    print(f'Average node degree: {avg_degree:.2f}')
    if hasattr(g, 'train_mask'):
        # Sum the mask once and reuse it for both the count and the rate.
        n_train_nodes = g.train_mask.sum()
        label_rate = int(n_train_nodes) / n_nodes if n_nodes else 0.0
        print(f'Number of training nodes: {n_train_nodes}')
        print(f'Training node label rate: {label_rate:.2f}')
    print(f'Has isolated nodes: {g.has_isolated_nodes()}')
    print(f'Has self-loops: {g.has_self_loops()}')
    print(f'Is undirected: {g.is_undirected()}')

In [5]:
# Build the MUTAG dataset via TUDataset, using ./tmp/TUDataset as the dataset root,
# then print a summary of the whole dataset and of its first graph.
dataset = TUDataset(root='./tmp/TUDataset', name='MUTAG')
g = dataset[0]
describe_dataset(dataset)
describe_graph(g)

Dataset: MUTAG(188):
Number of graphs: 188
Number of features: 7
Number of classes: 2
Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])
Number of nodes: 17
Number of edges: 38
Average node degree: 2.24
Has isolated nodes: False
Has self-loops: False
Is undirected: True


Видим, что у графа есть одна метка (тензор `y` имеет размер `[1]`), которую нам и нужно научиться предсказывать.

Перемешиваем датасет и разбиваем его на обучающее и тестовое множества.

In [9]:
# Shuffle the dataset, then keep the first 80% of the graphs (integer
# division) for training and the remainder for testing.
dataset = dataset.shuffle()
n_train = (8 * len(dataset)) // 10
print(f'{n_train=}')
train_dataset, test_dataset = dataset[:n_train], dataset[n_train:]

n_train=150


Для получения пакетов из графов используем `torch_geometric.loader.DataLoader`.

In [11]:
from torch_geometric.loader import DataLoader
# Mini-batch loaders (64 graphs per batch) over the two splits.
# Only the training loader is created with shuffle=True; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [13]:
# Take a single batch from the training loader for inspection.
# `batch` is reused by the readout demonstration in a later cell.
batch = next(iter(train_loader))
# Bare expression: displayed as the cell output in the notebook.
batch

DataBatch(edge_index=[2, 2570], x=[1167, 7], edge_attr=[2570, 4], y=[64], batch=[1167], ptr=[65])

При решении задачи классификации графов появляется дополнительный шаг — агрегация эмбеддингов узлов (readout).

In [14]:
import torch_geometric.nn as gnn

In [24]:
# 1. Compute node embeddings.
# The input width is taken from the dataset instead of being hard-coded,
# so the demo keeps working if a different dataset is loaded.
layer = gnn.GCNConv(in_channels=dataset.num_features, out_channels=5)
h_e = layer(batch.x, batch.edge_index)
print(h_e.shape)
# 2. Aggregation step (readout): mean-pool node embeddings using the
# `batch.batch` vector, giving one row per graph in the batch.
h_g = gnn.global_mean_pool(h_e, batch.batch)
print(h_g.shape)

torch.Size([1167, 5])
torch.Size([64, 5])


In [30]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch_geometric.nn as gnn
import torch.nn.functional as F

class GCN(nn.Module):
    """Graph classifier: a stack of GCNConv layers, mean-pool readout, linear head.

    Args:
        n_input: number of input features per node.
        n_hidden_layers: number of GCNConv layers. Any value of 1 or less
            produces a single ``n_input -> n_out`` layer.
        n_hidden: output width of every GCNConv layer except the last one.
        n_out: output width of the last GCNConv layer, which is also the
            input width of the classifier.
        n_classes: number of outputs of the final linear layer.
        dropout_p: dropout probability used after every GCNConv layer and
            once more after pooling.
        activation: callable applied to the output of every GCNConv layer.
    """

    def __init__(
        self,
        n_input: int,
        n_hidden_layers: int,
        n_hidden: int,
        n_out: int,
        n_classes: int,
        dropout_p: float = 0.2,
        activation: callable = F.relu
    ) -> None:
        super().__init__()
        self.dropout_p = dropout_p
        self.activation = activation
        # The approach to stacking GNN layers is taken from:
        # https://github.com/dmlc/dgl/blob/master/examples/pytorch/graphsage/advanced/model.py
        # Consecutive entries of `widths` are the (in, out) sizes of each
        # GCNConv layer; with one layer or fewer the hidden part is empty.
        widths = [n_input] + [n_hidden] * (n_hidden_layers - 1) + [n_out]
        self.layers = nn.ModuleList(
            [
                gnn.GCNConv(fan_in, fan_out)
                for fan_in, fan_out in zip(widths[:-1], widths[1:])
            ]
        )
        self.classifier = nn.Linear(n_out, n_classes)

    def forward(self, x, edge_index, batch):
        """Return the classifier output for every graph in the batch.

        Args:
            x: node feature tensor fed to the first GCNConv layer.
            edge_index: edge index tensor passed to every GCNConv layer.
            batch: vector handed to ``global_mean_pool`` to group node
                embeddings by graph.

        Returns:
            The output of the final linear layer applied to the pooled
            embeddings.
        """
        # 1. Node embeddings: conv -> activation -> dropout for every layer.
        node_emb = x
        for conv in self.layers:
            node_emb = F.dropout(
                self.activation(conv(node_emb, edge_index)),
                p=self.dropout_p,
                training=self.training,
            )

        # 2. Readout: average node embeddings per graph, then dropout.
        graph_emb = F.dropout(
            gnn.global_mean_pool(node_emb, batch),
            p=self.dropout_p,
            training=self.training,
        )

        # 3. Fully connected layer that classifies the graph.
        return self.classifier(graph_emb)

In [35]:
# Three-layer GCN with 64-wide hidden/output embeddings and heavy dropout.
model = GCN(
    dataset.num_features,
    n_hidden_layers=3,
    n_hidden=64,
    n_out=64,
    n_classes=dataset.num_classes,
    activation=torch.relu,
    dropout_p=0.5
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=.01, weight_decay=5e-4)

for epoch in range(201):
    epoch_losses = []
    epoch_acc_train = 0
    epoch_acc_test = 0

    # Training pass. Train accuracy is accumulated from the same forward
    # passes used for the update, i.e. with the model in train mode.
    model.train()
    for data in train_loader:  # Iterate over the mini-batches of the training set.
        logits = model(data.x, data.edge_index, data.batch)
        loss = criterion(logits, data.y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_losses.append(loss.item())
        epoch_acc_train += (logits.argmax(dim=1) == data.y).sum().item()
    epoch_acc_train /= len(train_loader.dataset)

    # Evaluation pass on the test set. Gradients are not needed here, so
    # autograd is disabled to avoid building a graph for every forward pass.
    model.eval()
    with torch.no_grad():
        for data in test_loader:
            logits = model(data.x, data.edge_index, data.batch)
            epoch_acc_test += (logits.argmax(dim=1) == data.y).sum().item()
    epoch_acc_test /= len(test_loader.dataset)

    # Report every 10th epoch; the loss is the mean of the per-batch losses.
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d} Avg Loss: {np.mean(epoch_losses):.4f} '
              f'Train Acc: {epoch_acc_train:.4f} Test Acc: {epoch_acc_test:.4f}')

Epoch: 000 Avg Loss: 0.6498 Train Acc: 0.6533 Test Acc: 0.7105
Epoch: 010 Avg Loss: 0.6019 Train Acc: 0.6667 Test Acc: 0.7105
Epoch: 020 Avg Loss: 0.5662 Train Acc: 0.7200 Test Acc: 0.7895
Epoch: 030 Avg Loss: 0.5588 Train Acc: 0.7067 Test Acc: 0.7895
Epoch: 040 Avg Loss: 0.5410 Train Acc: 0.7200 Test Acc: 0.8421
Epoch: 050 Avg Loss: 0.5351 Train Acc: 0.7600 Test Acc: 0.8158
Epoch: 060 Avg Loss: 0.4531 Train Acc: 0.7800 Test Acc: 0.7895
Epoch: 070 Avg Loss: 0.4882 Train Acc: 0.7600 Test Acc: 0.7368
Epoch: 080 Avg Loss: 0.5115 Train Acc: 0.7600 Test Acc: 0.7368
Epoch: 090 Avg Loss: 0.5188 Train Acc: 0.7533 Test Acc: 0.6842
Epoch: 100 Avg Loss: 0.4796 Train Acc: 0.7933 Test Acc: 0.7368
Epoch: 110 Avg Loss: 0.5166 Train Acc: 0.7533 Test Acc: 0.7895
Epoch: 120 Avg Loss: 0.4981 Train Acc: 0.7467 Test Acc: 0.7368
Epoch: 130 Avg Loss: 0.4815 Train Acc: 0.7733 Test Acc: 0.7105
Epoch: 140 Avg Loss: 0.4757 Train Acc: 0.7467 Test Acc: 0.6842
Epoch: 150 Avg Loss: 0.4666 Train Acc: 0.7800 Test Acc: