# Graph Attention Network

## Imports

In [1]:
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph

import pickle
import numpy as np

import itertools

import performance as pf

## GNN Definition

In [3]:
class GATLayer(nn.Module):
    """Single-head graph attention layer bound to one fixed graph.

    The layer follows the four numbered GAT equations referenced throughout:
    (1) a bias-free linear projection of the node features, (2) an
    unnormalised attention score per edge, (3) a softmax over each node's
    incoming scores and (4) the attention-weighted sum of neighbour
    projections.

    Args:
        g: graph object providing ``ndata``, ``apply_edges`` and
            ``update_all`` (a DGLGraph in this notebook).
        in_dim: size of the input node features.
        out_dim: size of the projected node features.
    """

    def __init__(self, g, in_dim, out_dim):
        super().__init__()
        self.g = g
        # equation (1): feature projection
        self.fc = nn.Linear(in_dim, out_dim, bias=False)
        # equation (2): scores the concatenated (source, destination) pair
        self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False)

    def edge_attention(self, edges):
        """Edge UDF for equation (2): return the raw score ``'e'`` per edge."""
        src_z = edges.src['z']
        dst_z = edges.dst['z']
        pair = th.cat((src_z, dst_z), dim=1)
        return {'e': F.leaky_relu(self.attn_fc(pair))}

    def message_func(self, edges):
        """Message UDF: forward the source projection and the edge score."""
        return dict(z=edges.src['z'], e=edges.data['e'])

    def reduce_func(self, nodes):
        """Reduce UDF for equations (3) and (4).

        Normalises the received scores along dim 1 of the mailbox and returns
        the weighted sum of the received projections as ``'h'``.
        """
        mailbox = nodes.mailbox
        # equation (3)
        weights = F.softmax(mailbox['e'], dim=1)
        # equation (4)
        return {'h': (weights * mailbox['z']).sum(dim=1)}

    def forward(self, h):
        """Run one attention pass over the stored graph and return ``'h'``.

        ``'z'`` is written to the graph's node data and ``'h'`` is popped from
        it again before returning.
        """
        graph = self.g
        # equation (1)
        graph.ndata['z'] = self.fc(h)
        # equation (2)
        graph.apply_edges(self.edge_attention)
        # equation (3) & (4)
        graph.update_all(self.message_func, self.reduce_func)
        return graph.ndata.pop('h')

In [4]:
class MultiHeadGATLayer(nn.Module):
    """Run several independent attention heads and merge their outputs.

    Args:
        g: graph handed to every ``GATLayer`` head.
        in_dim: size of the input node features.
        out_dim: output size of each individual head.
        num_heads: number of attention heads to create.
        merge: ``'cat'`` concatenates the head outputs along the feature
            dimension; any other value averages them element-wise.
    """

    def __init__(self, g, in_dim, out_dim, num_heads, merge='cat'):
        super(MultiHeadGATLayer, self).__init__()
        self.heads = nn.ModuleList(
            [GATLayer(g, in_dim, out_dim) for _ in range(num_heads)]
        )
        self.merge = merge

    def forward(self, h):
        """Apply every head to ``h`` and merge the results.

        Returns:
            With ``merge='cat'`` the head outputs joined along dim 1;
            otherwise their mean over the heads, which keeps the shape of a
            single head's output.
        """
        head_outs = [attn_head(h) for attn_head in self.heads]
        if self.merge == 'cat':
            # concat on the output feature dimension (dim=1)
            return th.cat(head_outs, dim=1)
        # Average over the new leading "head" axis only. Without dim=0 the
        # mean would collapse every node and feature into a single scalar.
        return th.mean(th.stack(head_outs), dim=0)

In [5]:
class Net(nn.Module):
    """Two-layer GAT producing per-node log-probabilities.

    Args:
        g: graph passed on to both attention layers.
        in_dim: size of the input node features.
        hidden_dim: output size of each first-layer head.
        out_dim: number of output classes.
        num_heads: number of attention heads in the first layer.
    """

    def __init__(self, g, in_dim, hidden_dim, out_dim, num_heads):
        super().__init__()
        self.layer1 = MultiHeadGATLayer(g, in_dim, hidden_dim, num_heads)
        # The first layer concatenates its heads, so the second layer sees
        # hidden_dim * num_heads features. The output layer uses one head.
        concat_dim = hidden_dim * num_heads
        self.layer2 = MultiHeadGATLayer(g, concat_dim, out_dim, 1)

    def forward(self, h):
        """Return the log-softmax (over dim 1) of the second layer's output."""
        hidden = F.elu(self.layer1(h))
        return F.log_softmax(self.layer2(hidden), 1)

In [2]:
import GAT

In [7]:
import importlib
# Re-execute the already imported GAT module so that edits made to GAT.py
# take effect without restarting the kernel.
importlib.reload(GAT)

<module 'GAT' from 'C:\\Users\\User\\Documents\\dev\\gnn-community-detection\\Notebooks\\GAT.py'>

## Data Loading

In [3]:
from dgl.data import citation_graph as citegrh
import networkx as nx

# Load the Cora citation dataset and wrap its arrays as torch tensors.
data = citegrh.load_cora()
features = th.FloatTensor(data.features)
labels = th.LongTensor(data.labels)
# NOTE(review): this is the dataset's own train mask, stored as a uint8
# ByteTensor, whereas mask_train / mask_val further down are BoolTensors.
# Recent PyTorch versions warn about indexing with uint8 masks - confirm what
# dtype pf.performance_as_df expects before changing it.
mask = th.ByteTensor(data.train_mask)
g = data.graph

# add self loop
# Existing self-loops are dropped first so that, after one loop per node is
# added below, every node ends up with exactly one.
g.remove_edges_from(nx.selfloop_edges(g))
g = DGLGraph(g)
g.add_edges(g.nodes(), g.nodes())

## Select Training Set

In [4]:
# Fractions of all nodes that go into the training and validation splits.
percentage_train = 0.1
percentage_val = 0.1

# Stored node permutation that fixes which nodes land in which split.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open("../data/permutations/cora_permutation1.pickle","rb") as f:
    perm1 = pickle.load(f)

# Split boundaries inside the permutation:
# positions [0, i_train) are training, [i_train, i_val) are validation.
i_train = int(percentage_train*g.number_of_nodes())
i_val = i_train + int(percentage_val*g.number_of_nodes())

# Training mask: mark the selected nodes, then convert to a boolean tensor.
mask_train = np.zeros(g.number_of_nodes())
mask_train[perm1[range(i_train)]] = 1
mask_train = th.BoolTensor(mask_train)

# Validation mask, built the same way from the next slice of the permutation.
mask_val = np.zeros(g.number_of_nodes())
mask_val[perm1[range(i_train, i_val)]] = 1
mask_val = th.BoolTensor(mask_val)

## Training

In [5]:
# Build the loss object from the project-local `performance` module (pf),
# bound to the ground-truth labels. Its approximate_loss method is what the
# training loop optimises.
# NOTE(review): presumably a label-permutation-invariant loss, judging by the
# name - confirm in performance.py.
loss_function = pf.perm_inv_loss(labels)
import copy

In [41]:
import time

# Number of distinct labels; used for both the output width of the network
# and the loss, so the two can never disagree.
n_classes = len(np.unique(labels))

net = GAT.GAT_Net(g=g, in_feats=features.shape[1], hidden_size=100, hidden_layers=0,
                  out_feats=n_classes, dropout=0, batchnorm=False, num_heads=1)
#print(net)

optimizer = th.optim.Adam(net.parameters(), lr=1e-2, weight_decay=1e-2)

dur = []      # wall-clock seconds per epoch (epochs >= 3 only, to skip warm-up)
loss_ev = []  # training loss per epoch
current_best = 0  # best training Rand score seen so far (higher is better)
current_best_epoch = 0
current_best_params = None
no_improvement_for = 0

# Ground-truth labels of both splits never change, so convert them once.
train_labels_np = labels[mask_train].numpy()
val_labels_np = labels[mask_val].numpy()

for epoch in range(10000):
    if epoch >= 3:
        t0 = time.time()

    # Score the current parameters in eval mode. These numbers are only used
    # for monitoring and checkpoint selection, never by the optimizer, so no
    # autograd graph is needed.
    net.eval()
    with th.no_grad():
        prediction = net(features)
    train_rand = pf.rand_score(train_labels_np, np.argmax(prediction[mask_train].numpy(), axis=1))
    validation_rand = pf.rand_score(val_labels_np, np.argmax(prediction[mask_val].numpy(), axis=1))

    # NOTE(review): the checkpoint and early stopping are driven by the
    # *training* score; validation_rand is only reported. Confirm that this
    # is intended.
    if train_rand > current_best:
        current_best = train_rand
        current_best_epoch = epoch
        current_best_params = copy.deepcopy(net.state_dict())
        no_improvement_for = 0
    else:
        no_improvement_for += 1

    # Early stopping: give up after more than 100 epochs without improvement.
    if no_improvement_for > 100:
        break

    net.train()  # Set to training mode (use dropout)

    # Compute loss for train nodes
    logits = net(features)

    loss = loss_function.approximate_loss(logits, mask_train, nclasses=n_classes)
    #loss = F.nll_loss(logits[mask_train], labels[mask_train])
    loss_ev.append(loss.detach().item())
    # Diagnostic: which classes are currently predicted on the training nodes
    # (makes a collapse onto a single class visible).
    print(np.unique(np.argmax(logits[mask_train].detach().numpy(), 1)))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch >= 3:
        dur.append(time.time() - t0)
        time_str = f"{np.mean(dur):.4f}"
    else:
        time_str = "unknown"
    print(f"Epoch {epoch:05d} | Loss {loss.item():.4f} | Train.Rand {train_rand:.4f} | Valid.Rand {validation_rand:.4f} | Time(s) {time_str}")

# Restore the best checkpoint. It stays None when no epoch ever scored above
# the initial baseline, in which case the final parameters are kept.
if current_best_params is not None:
    net.load_state_dict(current_best_params)

[0 1 2 3 4 5 6]
Epoch 00000 | Loss 1.9453 | Train.Rand 0.0585 | Valid.Rand 0.0462 | Time(s) unknown
[2 3 4 6]
Epoch 00001 | Loss 1.9422 | Train.Rand 0.1373 | Valid.Rand 0.1070 | Time(s) unknown
[2 3 4 6]
Epoch 00002 | Loss 1.9403 | Train.Rand 0.1174 | Valid.Rand 0.0776 | Time(s) unknown
[3 4 6]
Epoch 00003 | Loss 1.9391 | Train.Rand 0.0607 | Valid.Rand 0.0584 | Time(s) 2.1315
[3 4 6]
Epoch 00004 | Loss 1.9381 | Train.Rand 0.0335 | Valid.Rand 0.0184 | Time(s) 2.1779
[3 4]
Epoch 00005 | Loss 1.9370 | Train.Rand 0.0138 | Valid.Rand 0.0041 | Time(s) 2.2000
[3 4]
Epoch 00006 | Loss 1.9356 | Train.Rand 0.0037 | Valid.Rand 0.0031 | Time(s) 2.1635
[3 4]
Epoch 00007 | Loss 1.9339 | Train.Rand 0.0003 | Valid.Rand 0.0000 | Time(s) 2.1346
[3]
Epoch 00008 | Loss 1.9317 | Train.Rand 0.0000 | Valid.Rand 0.0000 | Time(s) 2.1414
[3]
Epoch 00009 | Loss 1.9290 | Train.Rand 0.0000 | Valid.Rand 0.0000 | Time(s) 2.1492
[3]
Epoch 00010 | Loss 1.9257 | Train.Rand 0.0000 | Valid.Rand 0.0000 | Time(s) 2.1509
[3

KeyboardInterrupt: 

In [42]:
# Restore the best checkpoint captured by the training loop; needed when the
# loop was interrupted before it reached its own load_state_dict call.
net.load_state_dict(current_best_params)

<All keys matched successfully>

In [34]:
# Distinct classes predicted on the training nodes, taken from the logits of
# the most recent training-mode forward pass.
# NOTE(review): the recorded output below is not de-duplicated, so it appears
# to stem from an earlier version of this cell - re-run to refresh it.
np.unique(np.argmax(logits[mask_train].detach().numpy(),1))

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2], dtype=int64)

In [30]:
# Distinct classes predicted over all nodes in the most recent eval-mode
# forward pass. exp() is monotonic, so it does not change the argmax.
np.unique(np.argmax(np.exp(prediction.detach()).numpy(),1))

array([2], dtype=int64)

In [18]:
# Sanity check: per-node sum of exp(prediction). Sums close to 1 suggest the
# network outputs log-probabilities.
np.sum(np.exp(prediction.detach()).numpy(),axis=1)

array([0.99999994, 1.0000001 , 1.0000001 , ..., 1.        , 0.9999999 ,
       0.9999999 ], dtype=float32)

In [20]:
# Visualise predictions
net.eval() # Set net to evaluation mode (deactivates dropout)
# The graph is already bound to the network at construction time, so only the
# features are passed - the same call form as in training and evaluation.
final_prediction = net(features).detach()
# Two columns per masked node: [predicted class, true label].
a = np.column_stack([final_prediction[mask].numpy().argmax(axis=1), labels[mask].numpy()])
# Show 10 rows drawn at random (with replacement) from the table sorted by
# predicted class.
a[a[:,0].argsort()][np.random.choice(range(a.shape[0]),size=10)]
# Predicted class ids need not coincide with the true label ids; what matters
# for clustering is whether nodes sharing a true label share a predicted id.

array([[0, 6],
       [0, 3],
       [2, 2],
       [2, 2],
       [0, 6],
       [6, 4],
       [4, 0],
       [4, 0],
       [0, 2],
       [0, 4]], dtype=int64)

## Evaluation

In [45]:
# Final evaluation of the network with its currently loaded parameters.
net.eval() # Set net to evaluation mode (deactivates dropout)
final_prediction = net(features).detach()
# Tabulate the clustering metrics with the project helper.
# NOTE(review): `mask` is the train mask shipped with the dataset, not the
# mask_train split used for optimisation - confirm this is the intended
# Train/Test partition for the report.
pf.performance_as_df(labels,final_prediction,mask)

Unnamed: 0,All,Train,Test
Rand-Index,0.085947,0.08386,0.086096
Mutual Information,0.245764,0.248938,0.244443
Variation of Information,1.820591,1.701044,1.822282
