# Graph Transformer Networks (GTN)

The GTN model learns node representations on a **heterogeneous** graph. The model consists of two parts: a **Graph Transformer (GT) layer** and a **convolutional layer**. A GT layer learns a new meta-path graph via the matrix multiplication of two selected adjacency matrices. Each selected adjacency matrix is a weighted sum of the candidate adjacency matrices, obtained by a 1 × 1 convolution whose weights are made non-negative by a softmax. GTN then learns node representations via graph convolution on the learnt meta-path graphs.

The following code reproduces the GTN model on the ACM dataset. The model used for ACM has two GT layers; the other training parameters follow the default settings of the original paper and code. You can download the data from this [link](https://drive.google.com/file/d/1qOZ3QjqWMIIvWjzrIdRe3EA4iKzPi6S5/view).

In the original code provided by the authors, there is also a "sparse" model; I think its main difference is that it uses sparse adjacency matrices to compute the meta-paths. The following code is the regular (dense) model, which does not use sparse adjacency matrices.

The results can be found in `lab_report` in the repo.

In [11]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import math
import pdb
import pickle
import argparse
from sklearn.metrics import f1_score

In [12]:
class GTLayer(nn.Module):
    """One Graph Transformer layer: extends a meta-path by one hop.

    The first layer of a stack owns two ``GTConv`` modules and multiplies the
    two adjacency matrices they select.  Every later layer owns a single
    ``GTConv`` and right-multiplies the meta-path matrix produced so far by
    the newly selected adjacency matrix.

    Args:
        in_channels: number of candidate adjacency matrices (edge types).
        out_channels: number of meta-path graphs learnt in parallel.
        first: ``True`` for the first layer of the stack.
    """

    def __init__(self, in_channels, out_channels, first=True):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.first = first
        if self.first:
            self.gt_conv1 = GTConv(in_channels, out_channels)
            self.gt_conv2 = GTConv(in_channels, out_channels)
        else:
            self.gt_conv = GTConv(in_channels, out_channels)

    def forward(self, A, H_=None):
        """Compose the meta-path adjacency for this layer.

        Args:
            A: stack of candidate adjacency matrices, forwarded unchanged to
                the ``GTConv`` module(s).
            H_: meta-path adjacency produced by the previous layer; required
                when ``first`` is ``False`` and ignored otherwise.

        Returns:
            ``(H, W)`` where ``H`` is the batched matrix product described in
            the class docstring and ``W`` is a list with the detached,
            softmax-normalised selection weights of every ``GTConv`` used by
            this layer (two entries for the first layer, one otherwise).

        Raises:
            TypeError: if a non-first layer is called without ``H_``.
        """
        if self.first:
            H = torch.bmm(self.gt_conv1(A), self.gt_conv2(A))
            # Report the weights of BOTH convolutions, one entry per conv.
            W = [
                F.softmax(self.gt_conv1.weight, dim=1).detach(),
                F.softmax(self.gt_conv2.weight, dim=1).detach(),
            ]
        else:
            if H_ is None:
                raise TypeError(
                    "GTLayer.forward() requires H_ when the layer is not the first one"
                )
            H = torch.bmm(H_, self.gt_conv(A))
            W = [F.softmax(self.gt_conv.weight, dim=1).detach()]
        return H, W

# Select an adjacency matrix: a weighted sum of the candidate adjacency matrices, computed as a 1 x 1 convolution whose weights are made non-negative by a softmax.
class GTConv(nn.Module):
    """Soft selection of one adjacency matrix per output channel.

    Holds one learnable score per (output channel, candidate) pair.  The
    scores are turned into mixing coefficients with a softmax over the
    candidate axis, and the candidates are blended with those coefficients.

    Args:
        in_channels: number of candidate adjacency matrices.
        out_channels: number of blended matrices to produce.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.in_channels, self.out_channels = in_channels, out_channels
        # Allocated uninitialised; reset_parameters() fills in the values.
        self.weight = nn.Parameter(torch.empty(out_channels, in_channels, 1, 1))
        self.bias = None
        # Frozen constant; not read anywhere in this class but kept as a
        # registered parameter so the state_dict layout does not change.
        self.scale = nn.Parameter(torch.tensor([0.1]), requires_grad=False)
        self.reset_parameters()

    def reset_parameters(self):
        """Give every candidate the same initial score (uniform softmax)."""
        nn.init.constant_(self.weight, 0.1)
        if self.bias is None:
            return
        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
        limit = 1 / math.sqrt(fan_in)
        nn.init.uniform_(self.bias, -limit, limit)

    def forward(self, A):
        """Blend the candidates in ``A`` along dim 1 with softmax weights.

        ``A`` is broadcast against the ``(out_channels, in_channels, 1, 1)``
        weight tensor, and the candidate axis (dim 1) is summed out.
        """
        mixing = F.softmax(self.weight, dim=1)
        return (A * mixing).sum(dim=1)


In [13]:
class GTN(nn.Module):
    """Graph Transformer Network for node classification.

    A stack of ``GTLayer`` modules learns ``num_channels`` meta-path
    adjacency matrices.  One graph-convolution step is then run on each of
    them, the per-channel outputs are concatenated, and two linear layers
    produce the class scores.

    Args:
        num_edge: number of candidate adjacency matrices (edge types).
        num_channels: number of meta-path graphs learnt in parallel.
        w_in: input feature dimension.
        w_out: hidden dimension of the graph convolution.
        num_class: number of target classes.
        norm: stored as ``is_norm``; ``forward`` currently normalises between
            GT layers regardless of this flag.
        num_layers: number of GT layers to stack (default 2).

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """

    def __init__(self, num_edge, num_channels, w_in, w_out, num_class, norm, num_layers=2):
        super().__init__()
        if num_layers < 1:
            raise ValueError("num_layers must be at least 1")
        self.num_edge = num_edge
        self.num_channels = num_channels
        self.w_in = w_in
        self.w_out = w_out
        self.num_class = num_class
        self.is_norm = norm
        self.num_layers = num_layers
        # Only the first layer multiplies two freshly selected matrices.
        self.layers = nn.ModuleList(
            [GTLayer(num_edge, num_channels, first=(i == 0)) for i in range(num_layers)]
        )
        self.weight = nn.Parameter(torch.empty(w_in, w_out))
        # Registered and zero-initialised but not read by gcn_conv; kept so
        # the state_dict layout stays the same.
        self.bias = nn.Parameter(torch.empty(w_out))
        self.loss = nn.CrossEntropyLoss()
        self.linear1 = nn.Linear(self.w_out * self.num_channels, self.w_out)
        self.linear2 = nn.Linear(self.w_out, self.num_class)
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the GCN weight and zero the bias."""
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def gcn_conv(self, X, H):
        """Run one graph-convolution step of features ``X`` over graph ``H``.

        Computes ``norm(H, add=True).T @ (X @ self.weight)``.
        """
        return self._propagate(torch.mm(X, self.weight), H)

    def _propagate(self, XW, H):
        """Aggregate already-projected features ``XW`` over graph ``H``."""
        H = self.norm(H, add=True)
        return torch.mm(H.t(), XW)

    def normalization(self, H):
        """Apply ``norm`` to each of the ``num_channels`` matrices in ``H``."""
        return torch.stack([self.norm(H[i]) for i in range(self.num_channels)], dim=0)

    def norm(self, H, add=False):
        """Column-normalise ``H`` after clearing (or resetting) its diagonal.

        The diagonal is zeroed; with ``add=True`` it is then set to one
        (self-loops).  Each column is divided by its sum, and columns whose
        sum is zero are left as all zeros.

        Args:
            H: square 2-D tensor.
            add: whether to put ones on the diagonal before normalising.

        Returns:
            The normalised matrix, on the same device as ``H``; the dtype is
            ``H``'s dtype promoted with float32.
        """
        # Promote like the float32 mask of the original code did, without
        # pinning the computation to the CPU.
        dtype = torch.promote_types(H.dtype, torch.float32)
        H = H.t().to(dtype)
        eye = torch.eye(H.shape[0], dtype=dtype, device=H.device)
        H = H * (1 - eye)
        if add:
            H = H + eye
        deg = torch.sum(H, dim=1)
        # Take the reciprocal on a copy where zero degrees are replaced by
        # one, then zero those entries: the backward pass never sees 1/0.
        isolated = deg == 0
        deg_inv = deg.masked_fill(isolated, 1.0).pow(-1).masked_fill(isolated, 0.0)
        # Row scaling by broadcasting instead of a dense diagonal matmul.
        H = deg_inv.unsqueeze(1) * H
        return H.t()

    def forward(self, A, X, target_x, target):
        """Compute the loss and class scores for the nodes in ``target_x``.

        Args:
            A: dense adjacency stack with the edge types on the last axis
                (it is permuted here so that they come second).
            X: node feature matrix with ``w_in`` columns.
            target_x: indices of the nodes to classify.
            target: class labels for those nodes.

        Returns:
            ``(loss, y, Ws)``: the cross-entropy loss, the class scores for
            ``target_x`` and the list of selection weights reported by each
            GT layer.
        """
        A = A.unsqueeze(0).permute(0, 3, 1, 2)
        Ws = []
        H = None
        for i, layer in enumerate(self.layers):
            if i == 0:
                H, W = layer(A)
            else:
                # Normalise the running meta-path before extending it.
                H = self.normalization(H)
                H, W = layer(A, H)
            Ws.append(W)
        # The feature projection is shared by all channels; compute it once.
        XW = torch.mm(X, self.weight)
        per_channel = [F.relu(self._propagate(XW, H[i])) for i in range(self.num_channels)]
        X_ = torch.cat(per_channel, dim=1)
        X_ = self.linear1(X_)
        X_ = F.relu(X_)
        y = self.linear2(X_[target_x])
        loss = self.loss(y, target)
        return loss, y, Ws

In [14]:
# ---- Experiment configuration ----
dataset = "ACM"        # sub-directory of data/ that holds the pickled inputs

# Model
node_dim = 64          # hidden width handed to GTN as w_out
num_channels = 2       # number of meta-path graphs learnt in parallel
norm = True            # handed to GTN as its norm argument

# Optimisation (Adam)
epochs = 40            # number of full-graph training iterations
lr = 5e-3              # base learning rate
weight_decay = 1e-3    # weight decay
# Flag for giving the GT layers their own, decaying learning rate.
# NOTE(review): confirm the training cell actually reads this flag.
adaptive_lr = True

In [None]:
# Processing data
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# dataset files obtained from a source you trust.
data_dir = 'data/' + dataset
with open(data_dir + '/node_features.pkl', 'rb') as f:
    node_features = pickle.load(f)
with open(data_dir + '/edges.pkl', 'rb') as f:
    edges = pickle.load(f)
with open(data_dir + '/labels.pkl', 'rb') as f:
    labels = pickle.load(f)
num_nodes = edges[0].shape[0]

# Dense adjacency stack: one slice per edge type plus the identity as the
# last slice, stacked along the final axis in a single allocation.
adjacency_slices = [
    torch.from_numpy(edge.todense()).type(torch.FloatTensor) for edge in edges
]
adjacency_slices.append(torch.eye(num_nodes).type(torch.FloatTensor))
A = torch.stack(adjacency_slices, dim=-1)

node_features = torch.from_numpy(node_features).type(torch.FloatTensor)


def _nodes_and_targets(split):
    """Split a sequence of (node index, class label) pairs into two LongTensors."""
    pairs = np.array(split)
    nodes = torch.from_numpy(pairs[:, 0]).type(torch.LongTensor)
    targets = torch.from_numpy(pairs[:, 1]).type(torch.LongTensor)
    return nodes, targets


def macro_f1(logits, target):
    """Macro-averaged F1 of the arg-max predictions against ``target``."""
    predicted = torch.argmax(logits.detach(), dim=1)
    # scikit-learn expects the ground truth first, the predictions second.
    return f1_score(target.cpu().numpy(), predicted.cpu().numpy(), average='macro')


# labels holds the train, validation and test splits, in that order.
train_node, train_target = _nodes_and_targets(labels[0])
valid_node, valid_target = _nodes_and_targets(labels[1])
test_node, test_target = _nodes_and_targets(labels[2])

num_classes = torch.max(train_target).item() + 1

# Create the model:
model = GTN(num_edge=A.shape[-1],
            num_channels=num_channels,
            w_in=node_features.shape[1],
            w_out=node_dim,
            num_class=num_classes,
            norm=norm)

# Optimiser.  With adaptive_lr the GT layers start from a much larger learning
# rate that is decayed every epoch until it reaches the base rate; otherwise
# every parameter uses the base rate.
gt_layer_lr = 0.5
lr_decay = 0.9
if adaptive_lr:
    optimizer = torch.optim.Adam([{'params': model.weight},
                                  {'params': model.linear1.parameters()},
                                  {'params': model.linear2.parameters()},
                                  {'params': model.layers.parameters(), 'lr': gt_layer_lr}],
                                 lr=lr, weight_decay=weight_decay)
else:
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# The cross-entropy loss is computed inside GTN.forward, so no separate
# criterion object is needed here.

# Training:
for i in range(epochs):
    # Decay only the groups that are still above the base learning rate.
    for param_group in optimizer.param_groups:
        if param_group['lr'] > lr:
            param_group['lr'] = param_group['lr'] * lr_decay
    print('Epoch:  ',i+1)
    model.zero_grad()
    model.train()
    loss, y_train, Ws = model(A, node_features, train_node, train_target)
    train_f1 = macro_f1(y_train, train_target)
    print('Train Loss: {}, Macro_F1: {}'.format(loss.detach().cpu().numpy(), train_f1))
    loss.backward()
    optimizer.step()
    model.eval()
    # Validate and test after every epoch:
    with torch.no_grad():
        val_loss, y_valid, _ = model(A, node_features, valid_node, valid_target)
        val_f1 = macro_f1(y_valid, valid_target)
        print('Valid Loss: {}, Macro_F1: {}'.format(val_loss.detach().cpu().numpy(), val_f1))
        test_loss, y_test, W = model(A, node_features, test_node, test_target)
        test_f1 = macro_f1(y_test, test_target)
        print('Test Loss: {}, Macro_F1: {}\n'.format(test_loss.detach().cpu().numpy(), test_f1))