In [132]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

import torch#, time
import torch.nn as nn
device = torch.device("cpu")# torch.device("cuda" if torch.cuda.is_available() else "cpu")
from sklearn.preprocessing import OneHotEncoder
import matplotlib.animation as animation
from IPython.display import HTML
from misc_tools.print_latex import print_tex

For derivation see GNN_Attention_notes.ipynb
Im not sure whether implement unique $\vec{a}$ for all heads. Original paper hints on that. Implementation i have found reuses it.


In [133]:
np.random.seed(1337)
#G = nx.house_graph()
N = 3
G = nx.gnm_random_graph(N, 2*N)

A = nx.adjacency_matrix(G).todense()
print_tex('A = ', A)
A = torch.tensor(A)

<IPython.core.display.Math object>

In [134]:
#N_FEATURES, N_NODES = A.shape
N_NODES = len(G.nodes())
N_FEATURES = 2
N_HIDDEN = 2
N_HEADS = 2
OUT_FEATURES = 2

In [135]:
H = torch.arange(N_NODES*N_FEATURES, dtype = float).view(N_NODES, N_FEATURES) + 1
print_tex('H = ', H.numpy())

<IPython.core.display.Math object>

In [136]:
class debug_net(nn.Module):
    def __init__(self, H0, A, test = False):
        super(debug_net, self).__init__()
        self.H = H0
        self.A = A
        self.W_gh   = nn.Linear(in_features=N_FEATURES, out_features=N_HEADS*N_HIDDEN, bias=False, dtype=H0.dtype)
        self.Gk     = torch.zeros(size=(N_NODES, N_HEADS*N_HIDDEN), dtype=H0.dtype)
        self.GkR    = self.Gk.view(N_NODES, N_HEADS, N_FEATURES)
        self.Ck_l   = torch.zeros(size=(N_NODES*N_NODES, N_HEADS, N_HIDDEN), dtype=H0.dtype) 
        self.Ck_r   = torch.zeros_like(self.Ck_l)
        self.Ck_f   = torch.zeros(size=(N_NODES*N_NODES, N_HEADS, 2*N_HIDDEN), dtype=H0.dtype) 
        self.Ck     = self.Ck_f.view(N_NODES, N_NODES, N_HEADS, 2*N_HIDDEN)
        self.attnt  = nn.Parameter(torch.zeros(size=(2*N_HIDDEN, N_HEADS), dtype=H0.dtype))
        self.activ  = nn.LeakyReLU(0.2)
        self.E      = torch.zeros(size=(N_NODES,N_NODES, N_HEADS), dtype=H0.dtype)
        self.alpha  = torch.zeros_like(self.E)
        self.softmax= nn.Softmax(dim = 1)
        self.GkPrime= torch.zeros_like(self.GkR)
         
        if test:
            self.debug()

    def debug(self):
        with torch.no_grad():
            print_tex(r'N_{nodes} = '+ str(N_NODES) + r'; \ N_{heads} = '+ str(N_HEADS) + r'; \ N_{features} = '+ str(N_FEATURES)+ r'; \ N_{hidden} = '+ str(N_HIDDEN))
            print_tex(r'G_{K} \text{ is a matrix of concatenated embeddings } \vec{g}_i^{k} , \ shape : [N_{nodes} \times N_{heads}*N_{hidden}]')

            # set scaling transforms
            for i in range(N_HEADS):
                s = 1 if i == 0 else 4*i

                self.W_gh.weight[i*N_FEATURES:(i+1)*N_FEATURES] = s*torch.eye(N_FEATURES)

            self.Gk += self.W_gh(self.H)        # cannot redefine, it will break a view

            print_tex('G_K = H W_K^T = ', H.numpy() , self.W_gh.weight[:].T.numpy(),' = ', self.Gk.numpy())
            print_tex(r'\text{Reshape } G_{K} \ to \ [N_{nodes} \times N_{heads} \times N_{hidden}] \text{ to isolate each head`s data to its own dimension}')

            print_tex(r"\text{Goal: a matrix } C_K \text{ that holds concatenated node feature pairs. Shape: }[N_{nodes} \times N_{nodes}\times N_{heads} \times 2 N_{hidden}]")
            print("Its only (?) possible with flattening, concatenating and unflattening. See notes.")

            self.Ck_l += self.GkR.repeat_interleave(N_NODES, dim=0)
            self.Ck_r += self.GkR.repeat(N_NODES, 1, 1)
            self.Ck_f += torch.cat([self.Ck_l, self.Ck_r], dim=-1);
            
            print_tex(r'C_{flat} \ (K=1) = Repeat_{flat} \ ||  \ Interleave_{flat} = '
                      ,self.Ck_l[:,[0]].squeeze(1).numpy(),r' \ \bigg|\bigg| \ ', self.Ck_r[:,[0]].squeeze(1).numpy(), ' = ',
                      self.Ck_f[:,[0]].squeeze(1).numpy())
            
            print_tex(r'\text{Features C}_{0,0} = ', self.Ck[0,0].numpy(), r'; \ shape: \ [N_{heads} \times 2 N_{hidden}]')
            prnt_vec = [r'\vec{a}_'+str(i)+ ' = ' for i in range(N_HEADS)]
            prnt_vec2 = [r'^T ; \ ' for i in range(N_HEADS)]

            self.attnt += np.repeat(1/(torch.arange(N_HEADS, dtype=self.H.dtype).unsqueeze(0)+1), repeats=2*N_HIDDEN, axis = 0)
            print_tex(r"\text{Goal: a matrix E that holds edge weights. Shape: }[N_{nodes} \times N_{nodes} \times N_{heads}]")
            print_tex(r'E = \sigma(\vec{a}[C_K])')
            print('Test attention vectors:')
            print_tex(*[l for lists in zip(prnt_vec,self.attnt.T.numpy(),prnt_vec2) for l in lists])
            print('>>>See how to apply multiple attention vectors to data in notes<<<')
            
            self.E += self.activ(torch.einsum('ijkf,fk -> ijk', self.Ck, self.attnt)).squeeze(-1)#

            print_tex(r'\text{Features E}_{0,0} = ', self.E[0,0].numpy())
            self.alpha += self.E.masked_fill(self.A.view(N_NODES,N_NODES,1) == 0, float('-inf'))
            for i in range(N_HEADS):
                print_tex('E_{K='+str(i + 1)+'} = ',self.E.numpy()[:,:,i], r'\rightarrow MASK \rightarrow ',(self.E[:,:,i]*self.A).numpy() )
            print_tex(r"\text{Goal: a matrix } \Alpha \ or \ \alpha \text{ with row-wise softmax normalized weights. Shape: }[N_{nodes} \times N_{nodes} \times N_{heads}]")
            self.alpha = self.softmax(self.alpha)
            
            for i in range(N_HEADS):
                print_tex(r'\alpha_{K='+str(i + 1)+'} = ', self.alpha.numpy()[:,:,i], r'{\rightarrow set \ to \ A \ for \ example \rightarrow }:', self.A.numpy())
                self.alpha.numpy()[:,:,i] = self.A
            self.GkPrime += torch.einsum('ijk,jkf->ikf', self.alpha , self.GkR)

            print_tex(r"\text{Goal: updated features } G_k^\prime \text{ based on aggregation of features } \vec{g}_i^k \text{ with weights } \Alpha \text{ . Shape: }[N_{nodes} \times N_{heads} \times N_{hidden}]")
            print('>>>See how to aggregate multi head case in notes<<<')
            n1_neighbors_id = torch.argwhere(self.A[0] == 1).flatten().numpy()
            
            a = [r'G_'+str(i)+ ' = ' for i in n1_neighbors_id]
            b = [self.GkR[i].numpy() for i in n1_neighbors_id]
            c = [r'; \ ' for i in n1_neighbors_id]
            print_tex(*[l for lists in zip(a,b,c) for l in lists], r'\Alpha|_{row,1}= ', self.alpha[0].numpy() )
            print_tex(r'G_0^\prime = \vec{A}_0 \otimes G = ', self.GkPrime[0].numpy())
            print('New embeddings can be either concatenated across different variants of k or averaged"')
            GkP_concat = self.GkPrime.reshape(N_NODES, N_HEADS * N_HIDDEN)
            GkP_avg  = self.GkPrime.mean(dim=1)
            print_tex(r'G_0^{concat} = ', GkP_concat[0].numpy(), r'; \ G_0^{Avg} = ',GkP_avg[0].numpy())


model = debug_net(H,torch.tensor(A),True)

  model = debug_net(H,torch.tensor(A),True)


<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

Its only (?) possible with flattening, concatenating and unflattening. See notes.


<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

Test attention vectors:


<IPython.core.display.Math object>

>>>See how to apply multiple attention vectors to data in notes<<<


<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

>>>See how to aggregate multi head case in notes<<<


<IPython.core.display.Math object>

<IPython.core.display.Math object>

New embeddings can be either concatenated across different variants of k or averaged"


<IPython.core.display.Math object>

In [175]:
class GraphAttentionLayer(nn.Module):
    def __init__(self, A, N_FEATURES, N_HEADS, N_HIDDEN, non_lin, concat: bool = False, l_ReLU_slope: float = 0.2, dropout: float = 0.6, dtype = torch.float):
        super(GraphAttentionLayer, self).__init__()
        # 1) globals
        self.adj = A
        self.N_FEATURES, self.N_HEADS,  self.N_HIDDEN  = N_FEATURES, N_HEADS, N_HIDDEN
        self.concat = concat
        self.dtype = dtype
        # 2) weights 
        # 2.1) project H0 to G      [N_nodes x N_features] -> [N_nodes x N_hidden]
        self.W_gh   = nn.Linear(in_features=N_FEATURES, out_features=N_HEADS*N_HIDDEN, bias=False, dtype=dtype)
        # 2.2) project Ck to alpha  [N_nodes x N_nodes x N_heads x N_features] -> [N_nodes x N_hidden x N_heads]
        self.attention  = nn.Parameter(torch.empty(size=(2*N_HIDDEN, N_HEADS), dtype=dtype))
        nn.init.xavier_uniform_(self.attention) 

        # misc processing
        self.activ_alpha    = nn.LeakyReLU(l_ReLU_slope)
        self.softmax        = nn.Softmax(dim = 1)
        self.activ_Gn_new   = non_lin
        self.dropout        = nn.Dropout(dropout)

    def forward(self, H):
        H = H.to(self.dtype)
        N_NODES = H.size(0)
        N_FEATURES, N_HEADS, N_HIDDEN = self.N_FEATURES, self.N_HEADS,  self.N_HIDDEN
        # 1) Get features
        Gk_reshaped = self.W_gh(H).view(N_NODES, N_HEADS, N_HIDDEN)
        # 2) Prepare attention/adjacency matrix
        # 2.1) Concatenated pairwise features for each head
        C_left  = Gk_reshaped.repeat_interleave(N_NODES, dim=0)
        C_right = Gk_reshaped.repeat(N_NODES, 1, 1)
        Ck      = torch.cat([C_left, C_right], dim=-1).view(N_NODES, N_NODES, N_HEADS, 2*N_HIDDEN)
        print(f'{Ck.shape = }')
        # 2.2) Project concat features to a scalar
        alpha   = self.activ_alpha(torch.einsum('ijkf,fk -> ijk', Ck, self.attention)).view(N_NODES, N_NODES, N_HEADS)
        print(f'{alpha.shape = }')
        # 2.3) Apply adjacency matrix mask, softmax and dropout
        alpha.masked_fill_(self.adj.reshape(N_NODES,N_NODES,1) == 0, float('-inf'))
        alpha = self.softmax(alpha)
        alpha = self.dropout(alpha)
        # 3) Calculate new feature vectors by aggregation, using attention matrix.
        Gk_new = torch.einsum('ijk,jkf->ikf', alpha , Gk_reshaped )
        print(f'{Gk_new.shape = }')
        # 4) Additional processing. Like in paper, its either mean or concat, with activation.
        if self.concat:
            # activation before concat
            Gk_new = self.activ_Gn_new(Gk_new)
            return Gk_new.reshape(N_NODES, N_HEADS * N_HIDDEN)
        else:
            # activation after
            Gk_new = Gk_new.mean(dim=1).view(N_NODES, 1, N_HIDDEN)
            print(f'mean {Gk_new.shape = }')
            return self.activ_Gn_new(Gk_new)
        
model = GraphAttentionLayer(A, N_FEATURES, N_HEADS, N_HIDDEN, nn.Softmax(dim = 2), concat=True)
print_tex(model(H).detach().numpy())

Ck.shape = torch.Size([3, 3, 2, 4])
alpha.shape = torch.Size([3, 3, 2])
Gk_new.shape = torch.Size([3, 2, 2])


<IPython.core.display.Math object>

Recreate case from a paper:

Transductive learning.

1. GAT layers = 2<br>
    1. Layer 1<br>
            1. N_HEADS = 8 <br>
            2. N_HIDDEN = 8 <br>
            3. activation = ELU <br>
            4. concatenation = True <br>
            5. N_FEATURES = original number
    2. Layer 2<br>
        1. N_HEADS = 1 <br>
        2. N_HIDDEN = number of classes <br>
        3. activation = SoftMax <br>
        4. concatenation = false <br>
        5. N_FEATURES = Layer 1 N_HEADS * layer 1 N_HIDDEN
2. regularization = $L_2$, scale = 0.0005
3. dropout, p = 0.6



In [176]:

class net(nn.Module):
    def __init__(self, H0, A, num_classes):
        super(net, self).__init__()
        N_NODES     , N_FEATURES_1  = H0.shape
        N_HEADS_1   , N_HEADS_2     = 8, 1
        N_HIDDEN_1  , N_HIDDEN_2    = 6, num_classes
        N_FEATURES_2                = N_HEADS_1 * N_HIDDEN_1
        self.GAT1 = GraphAttentionLayer(A, N_FEATURES_1, N_HEADS_1, N_HIDDEN_1, nn.ELU(), concat = True)
        self.GAT2 = GraphAttentionLayer(A, N_FEATURES_2, N_HEADS_2, N_HIDDEN_2, nn.Softmax(dim = 2))

    def forward(self, H):
        x = self.GAT1(H)
        y = self.GAT2(x)
        return y
    
model2 = net(H, A, 2)
model2(H)

Ck.shape = torch.Size([3, 3, 8, 12])
alpha.shape = torch.Size([3, 3, 8])
Gk_new.shape = torch.Size([3, 8, 6])
Ck.shape = torch.Size([3, 3, 1, 4])
alpha.shape = torch.Size([3, 3, 1])
Gk_new.shape = torch.Size([3, 1, 2])
mean Gk_new.shape = torch.Size([3, 1, 2])


tensor([[[0.8188, 0.1812]],

        [[0.5000, 0.5000]],

        [[0.8963, 0.1037]]], grad_fn=<SoftmaxBackward0>)

## Some things ive learned
* you can define an array, and its reshaped representation. If you dont redefine array, you can change it, and reshaped representation will also change (duh)

In [139]:
asd = torch.arange(2,5,1);print(asd)
asd2 = asd.reshape(-1,1);print(asd2)
asd += torch.arange(3,6,1);print(asd)
print(asd2)

tensor([2, 3, 4])
tensor([[2],
        [3],
        [4]])
tensor([5, 7, 9])
tensor([[5],
        [7],
        [9]])
