In [1]:
import dgl
import dgl.data
import dgl.nn as gnn

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

Using backend: pytorch


# Homogenious graph node classification

Датасет Cora: 
* узлы - статьи
* связи - цитирование одной статьей другой
* каждый узел в качестве фичей содержит нормализованный word count vector 

Датасет может состоять из одного или нескольких графов. Cora состоит из одного.

Граф в DGL может хранить фичи для узлов и ребер в виде словарей `ndata` и `edata`. 

Фичи узлов в Cora:
* x_mask - булев тензор, показывающий, входит ли узел в множество x (train, val, test)
* label - метка узла
* feat - фичи узла

In [8]:
dataset = dgl.data.CoraGraphDataset()
G = dataset[0]

print(f"Кол-во категорий: {dataset.num_classes}")

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
Кол-во категорий: 7


In [10]:
class GCN(nn.Module):
    def __init__(self, n_input, n_hidden, n_output):
        super().__init__()
        self.conv1 = gnn.GraphConv(n_input, n_hidden)
        self.conv2 = gnn.GraphConv(n_hidden, n_output)
    
    def forward(self, G, in_features):
        out = F.relu(self.conv1(G, in_features))
        out = self.conv2(G, out)
        return out

In [13]:
n_input = G.ndata['feat'].shape[1]
n_hidden = 16
n_out = dataset.num_classes
n_epochs = 100

model = GCN(n_input, n_hidden, n_out)

In [15]:
# check
model(G, G.ndata['feat']).shape

torch.Size([2708, 7])

In [18]:
model = GCN(n_input, n_hidden, n_out)

optimizer = optim.Adam(model.parameters(), lr=.01)
criterion = nn.CrossEntropyLoss()

best_val_acc, best_test_acc = 0, 0


features = G.ndata['feat']
labels = G.ndata['label']
train_mask = G.ndata['train_mask']
val_mask = G.ndata['val_mask']
test_mask = G.ndata['test_mask']

for epoch in range(n_epochs):
    # forward
    logits = model(G, features)
    
    # loss
    loss = criterion(logits[train_mask], labels[train_mask])

    # backward
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # eval
    with torch.no_grad():
        predictions = logits.argmax(dim=1)
        train_acc = (predictions[train_mask] == labels[train_mask]).float().mean()
        val_acc = (predictions[val_mask] == labels[val_mask]).float().mean()
        test_acc = (predictions[test_mask] == labels[test_mask]).float().mean()

        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

    if not epoch % 5:
        print(f'In epoch {epoch}, loss: {loss:.3f}, val acc: {val_acc:.3f} (best {best_val_acc:.3f}), test acc: {test_acc:.3f} (best {best_test_acc:.3f})')

In epoch 0, loss: 1.945, val acc: 0.172 (best 0.172), test acc: 0.159 (best 0.159)
In epoch 5, loss: 1.886, val acc: 0.516 (best 0.540), test acc: 0.527 (best 0.546)
In epoch 10, loss: 1.804, val acc: 0.590 (best 0.590), test acc: 0.612 (best 0.612)
In epoch 15, loss: 1.699, val acc: 0.622 (best 0.622), test acc: 0.631 (best 0.631)
In epoch 20, loss: 1.572, val acc: 0.636 (best 0.636), test acc: 0.646 (best 0.646)
In epoch 25, loss: 1.425, val acc: 0.644 (best 0.646), test acc: 0.657 (best 0.656)
In epoch 30, loss: 1.263, val acc: 0.658 (best 0.658), test acc: 0.673 (best 0.668)
In epoch 35, loss: 1.094, val acc: 0.692 (best 0.692), test acc: 0.694 (best 0.694)
In epoch 40, loss: 0.927, val acc: 0.704 (best 0.704), test acc: 0.712 (best 0.712)
In epoch 45, loss: 0.771, val acc: 0.718 (best 0.718), test acc: 0.725 (best 0.723)
In epoch 50, loss: 0.632, val acc: 0.730 (best 0.730), test acc: 0.740 (best 0.740)
In epoch 55, loss: 0.515, val acc: 0.754 (best 0.754), test acc: 0.761 (best 0

# Heterogenious graph node classification

Для гетерографов можно использовать модули, которые позволяют собирать сообщения от узлов вдоль всех типов связей. Пример: `HeteroGraphConv`. С его помощью выполняем рассылку сообщений по типам связей, затем комбинируем различные сверточные модули для каждого типа связи.


In [39]:
class RGCN(nn.Module):
    def __init__(self, n_inputs, n_hidden, n_outputs, rel_names):
        super().__init__()
        # HeteroGraphConv использует различные подмодули для подграфов на 
        # основе соответствующих отношений
        # отношение определяется тройкой (src_T, rel_T, dst_T)
        # если для каких-то отношений используются одинаковые dst_T,
        # то результаты для них будут сагрегированы указанным методом aggregate
        conv1_modules = {rel: gnn.GraphConv(n_inputs, n_hidden) for rel in rel_names}
        conv2_modules = {rel: gnn.GraphConv(n_hidden, n_outputs) for rel in rel_names}
        self.conv1 = gnn.HeteroGraphConv(conv1_modules, aggregate='sum')
        self.conv2 = gnn.HeteroGraphConv(conv2_modules, aggregate='sum')

    def forward(self, G, features):
        # HeteroGraphConv принимает на вход словарь тип отношения: фичи узлов и 
        # возвращает словарь такой же структуры
        out = self.conv1(G, features)
        out = {k: F.relu(v) for k, v in out.items()}
        out = self.conv2(G, out)
        return out

In [44]:
from utils import create_heterograph

G = create_heterograph()
model = RGCN(G.n_hetero_features, 
             20, 
             G.n_user_classes,
             G.etypes)

user_feats = G.nodes['user'].data['feature']
item_feats = G.nodes['item'].data['feature']
node_features = {ntype: G.nodes[ntype].data['feature'] for ntype in G.ntypes}
labels = G.nodes['user'].data['label']
train_mask = G.nodes['user'].data['train_mask']

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

for epoch in range(5):
    model.train()
    # forward
    logits_by_type = model(G, node_features)
    # обучаемся только на пользователях
    logits = logits_by_type['user']
    loss = criterion(logits, labels)
    # backward
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f'Epoch #{epoch} loss={loss.item()}')


Epoch #0 loss=1.8041772842407227
Epoch #1 loss=1.795522928237915
Epoch #2 loss=1.7874788045883179
Epoch #3 loss=1.7800294160842896
Epoch #4 loss=1.7731542587280273


In [234]:
class RGCN(nn.Module):
    def __init__(self, n_inputs, n_hidden, n_outputs, num_rels, dropout=0):
        super().__init__()
        self.n_hidden = n_hidden
        self.n_outputs = n_outputs

        self.conv1 = gnn.RelGraphConv(n_inputs, n_hidden, num_rels, 
                                      activation=F.relu, self_loop=True,
                                      dropout=dropout)
        self.conv2 = gnn.RelGraphConv(n_hidden, n_hidden, num_rels, 
                                      activation=F.relu, self_loop=True,
                                      dropout=dropout)                              
        self.conv3 = gnn.RelGraphConv(n_hidden, n_outputs, num_rels, 
                                      activation=None, self_loop=True)                              

    def forward(self, G, features, etypes):
        out = self.conv1(G, features, etypes)
        out = self.conv2(G, out, etypes)
        out = self.conv3(G, out, etypes)
        return out

class GCN(nn.Module):
    def __init__(self, n_inputs, n_hidden, n_outputs):
        super().__init__()
        self.n_hidden = n_hidden
        self.n_outputs = n_outputs

        self.conv1 = gnn.SAGEConv(n_inputs, n_hidden, aggregator_type='mean', 
                                  activation=F.relu)
        self.conv2 = gnn.SAGEConv(n_hidden, n_hidden, aggregator_type='mean', 
                                  activation=F.relu)                              
        self.conv3 = gnn.SAGEConv(n_hidden, n_outputs, aggregator_type='mean')                              

    def forward(self, G, features):
        out = self.conv1(G, features)
        out = self.conv2(G, out)
        out = self.conv3(G, out)
        return out

In [235]:
from dgl.data.rdf import AIFBDataset

dataset = AIFBDataset()
G = dataset[0]
num_classes = dataset.num_classes

# обучаемся только на этой категории
category = dataset.predict_category #  Personnen
category_id = G.ntypes.index(category)
num_rels = len(G.etypes)
print(f'{G.num_nodes(category)=}')

# маски
for ntype in G.ntypes:
    if ntype != category:
        G.nodes[ntype].data['train_mask'] = torch.zeros(G.num_nodes(ntype), dtype=torch.uint8)
        G.nodes[ntype].data['test_mask'] = torch.zeros(G.num_nodes(ntype), dtype=torch.uint8)
        G.nodes[ntype].data['labels'] = torch.zeros(G.num_nodes(ntype), dtype=torch.int64)
    G.nodes[ntype].data['features'] = torch.ones((G.num_nodes(ntype), 1), dtype=torch.float32)

# чтобы воспользоваться RelGraphConv перехожу к однородному графу
G = dgl.to_homogeneous(G, ndata=['features', 'train_mask', 'labels', 'test_mask'])
G = dgl.add_self_loop(G)

train_mask = G.ndata['train_mask'].bool()
test_mask = G.ndata['test_mask'].bool()
train_size = len(train_mask.nonzero())
test_size = len(test_mask.nonzero())

labels = G.ndata['labels'] # содержат -1, но они не попадают под маски
features = G.ndata['features']
etypes = G.edata['_TYPE']

Done loading data from cached files.
G.num_nodes(category)=237


In [220]:
model = RGCN(n_inputs=1, n_hidden=20, 
            n_outputs=num_classes,
            num_rels=num_rels)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    model.train()
    # forward
    logits = model(G, features, etypes)
    loss = criterion(logits[train_mask], labels[train_mask])
    # backward
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if not epoch % 5:
        with torch.no_grad():
            model.eval()
            predictions = logits.argmax(dim=1)
            train_acc = (predictions[train_mask] == labels[train_mask]).sum().item() / train_size
            test_acc = (predictions[test_mask] == labels[test_mask]).sum().item() / test_size
        print("Epoch {:05d} | Train Acc: {:.4f} | Train Loss: {:.4f} | Test Acc: {:.4f}".
              format(epoch, train_acc, loss.item(), test_acc))

Epoch 00000 | Train Acc: 0.2786 | Train Loss: 53.3823 | Test Acc: 0.2778
Epoch 00005 | Train Acc: 0.6071 | Train Loss: 2.1778 | Test Acc: 0.5556
Epoch 00010 | Train Acc: 0.6929 | Train Loss: 1.6998 | Test Acc: 0.6667
Epoch 00015 | Train Acc: 0.7714 | Train Loss: 1.5508 | Test Acc: 0.7222
Epoch 00020 | Train Acc: 0.8143 | Train Loss: 1.3119 | Test Acc: 0.7500
Epoch 00025 | Train Acc: 0.8071 | Train Loss: 1.0918 | Test Acc: 0.7500
Epoch 00030 | Train Acc: 0.8214 | Train Loss: 0.8326 | Test Acc: 0.7500
Epoch 00035 | Train Acc: 0.8571 | Train Loss: 0.5455 | Test Acc: 0.8333
Epoch 00040 | Train Acc: 0.8714 | Train Loss: 0.3491 | Test Acc: 0.8611
Epoch 00045 | Train Acc: 0.8929 | Train Loss: 0.2816 | Test Acc: 0.8889
Epoch 00050 | Train Acc: 0.9286 | Train Loss: 0.2489 | Test Acc: 0.8611
Epoch 00055 | Train Acc: 0.9429 | Train Loss: 0.2266 | Test Acc: 0.8889
Epoch 00060 | Train Acc: 0.9500 | Train Loss: 0.2192 | Test Acc: 0.8889
Epoch 00065 | Train Acc: 0.9500 | Train Loss: 0.2118 | Test Acc

In [241]:
model = GCN(n_inputs=1, n_hidden=50, n_outputs=num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(101):
    model.train()
    # forward
    logits = model(G, features)
    loss = criterion(logits[train_mask], labels[train_mask])
    # backward
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if not epoch % 5:
        with torch.no_grad():
            model.eval()
            predictions = logits.argmax(dim=1)
            train_acc = (predictions[train_mask] == labels[train_mask]).sum().item() / train_size
            test_acc = (predictions[test_mask] == labels[test_mask]).sum().item() / test_size
        print("Epoch {:05d} | Train Acc: {:.4f} | Train Loss: {:.4f} | Test Acc: {:.4f}".
              format(epoch, train_acc, loss.item(), test_acc))

Epoch 00000 | Train Acc: 0.0857 | Train Loss: 1.4893 | Test Acc: 0.0833
Epoch 00005 | Train Acc: 0.4143 | Train Loss: 1.3047 | Test Acc: 0.4167
Epoch 00010 | Train Acc: 0.4143 | Train Loss: 1.2448 | Test Acc: 0.4167
Epoch 00015 | Train Acc: 0.4143 | Train Loss: 1.2370 | Test Acc: 0.4167
Epoch 00020 | Train Acc: 0.4143 | Train Loss: 1.2373 | Test Acc: 0.4167
Epoch 00025 | Train Acc: 0.4143 | Train Loss: 1.2400 | Test Acc: 0.4167
Epoch 00030 | Train Acc: 0.4143 | Train Loss: 1.2387 | Test Acc: 0.4167
Epoch 00035 | Train Acc: 0.4143 | Train Loss: 1.2362 | Test Acc: 0.4167
Epoch 00040 | Train Acc: 0.4143 | Train Loss: 1.2345 | Test Acc: 0.4167
Epoch 00045 | Train Acc: 0.4143 | Train Loss: 1.2335 | Test Acc: 0.4167
Epoch 00050 | Train Acc: 0.4143 | Train Loss: 1.2336 | Test Acc: 0.4167
Epoch 00055 | Train Acc: 0.4143 | Train Loss: 1.2337 | Test Acc: 0.4167
Epoch 00060 | Train Acc: 0.4143 | Train Loss: 1.2337 | Test Acc: 0.4167
Epoch 00065 | Train Acc: 0.4143 | Train Loss: 1.2336 | Test Acc: