In [None]:
#Check the PyTorch and CUDA versions of the torch build that this kernel imports
#(a `!python -c ...` subprocess may resolve to a different interpreter than the kernel)
import torch

print(torch.__version__)
print(torch.version.cuda)

1.10.0+cu111
11.1


In [None]:
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
!pip install torch-cluster -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html

In [2]:
#First data characteristics
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

# Load the CiteSeer Planetoid dataset with the NormalizeFeatures transform applied
dataset = Planetoid(
    root='/tmp/CiteSeer',
    name='CiteSeer',
    transform=NormalizeFeatures(),
)

# The first (index 0) graph is the one every later cell works on
data = dataset[0]
print(data)

# Summary statistics of the graph, one per output line
print(
    f'Number of nodes: {data.num_nodes}',
    f'Nodes features: {data.num_node_features}',
    f'Number of classes: {dataset.num_classes}',
    f'Number of edges: {data.num_edges}',
    f'Avarage degree: {data.num_edges / data.num_nodes:.2f}',
    f'Training nodes: {data.train_mask.sum()}',
    f'Validation nodes: {data.val_mask.sum()}',
    f'Test nodes: {data.test_mask.sum()}',
    f'Isolated nodes: {data.has_isolated_nodes()}',
    f'Loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
    sep='\n',
)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...


Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])
Number of nodes: 3327
Nodes features: 3703
Number of classes: 6
Number of edges: 9104
Avarage degree: 2.74
Training nodes: 120
Validation nodes: 500
Test nodes: 1000
Isolated nodes: True
Loops: False
Is undirected: True


Done!


**Applying self-attention to all nodes**

With GNNs, the prediction for each node is computed by neural network layers applied to an aggregation of its neighboring nodes. Here we instead use the building block of a Transformer, applying self-attention across all nodes of the graph. One subset of these nodes is used for training and others for validation and testing.

In [3]:
#Add a new batch dimension of size 1 to the tensor
import torch

# Insert a singleton axis at position 1: (num_nodes, num_features) -> (num_nodes, 1, num_features)
data1 = data.x.unsqueeze(1)
data1.shape

torch.Size([3327, 1, 3703])

In [4]:
#Convert the input features dimension to the model features
#Apply self-attention using PyTorch MultiheadAttention
#Transform the encoder output to the output dimension
#Set up the model and tensors in cuda
import torch.nn
from torch.nn import Linear
import torch.nn.functional as F

class Self_Att(torch.nn.Module):
    """Linear projection -> multi-head self-attention over the nodes -> linear classifier.

    The input is expected as ``(num_nodes, 1, in_channels)``: with the default
    layout of ``torch.nn.MultiheadAttention`` the first axis is the sequence
    (here: the nodes) and the singleton second axis is the batch.

    Args:
        in_channels: Number of input features per node. ``None`` (default)
            uses ``dataset.num_features`` of the notebook-level dataset.
        hidden_channels: Embedding size fed to the attention layer.
        num_heads: Number of attention heads.
        num_classes: Number of output classes. ``None`` (default) uses
            ``dataset.num_classes`` of the notebook-level dataset.
    """
    def __init__(self, in_channels=None, hidden_channels=120, num_heads=6, num_classes=None):
        super(Self_Att, self).__init__()
        # Fall back to the notebook-level dataset so that a bare Self_Att() keeps working
        if in_channels is None:
            in_channels = dataset.num_features
        if num_classes is None:
            num_classes = dataset.num_classes
        torch.manual_seed(12345)  # fixed seed: same initial weights on every instantiation
        self.lin1 = Linear(in_channels, hidden_channels)
        self.attention1 = torch.nn.MultiheadAttention(hidden_channels, num_heads)
        self.lin2 = Linear(hidden_channels, num_classes)

    def forward(self, x, att_mask=None):
        """Return ``(logits, attn_output_weights)`` for the node features ``x``.

        Args:
            x: Tensor of shape ``(num_nodes, 1, in_channels)``.
            att_mask: Optional attention mask forwarded to the attention layer
                as ``attn_mask``; ``None`` lets every node attend to all nodes.
        """
        x = self.lin1(x)
        x = F.dropout(x, p=0.8, training=self.training)
        x, attn_output_weights = self.attention1(x, x, x, attn_mask=att_mask)
        x = x.relu()
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.lin2(x)
        # Drop only the singleton batch axis; a bare squeeze() would also drop
        # the node or class axis whenever one of them happens to have size 1
        x = x.squeeze(1)
        return x, attn_output_weights

use_cuda = True  # set to False to stay on the CPU even when a GPU is available

# Use the selected device everywhere instead of calling .cuda() unconditionally,
# so the cell also runs on machines without a GPU
device = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')
print('Using device:', device)

model = Self_Att().to(device)

data = data.to(device)
data1 = data1.to(device)

print(model)

Using device: cuda
Self_Att(
  (lin1): Linear(in_features=3703, out_features=120, bias=True)
  (attention1): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=120, out_features=120, bias=True)
  )
  (lin2): Linear(in_features=120, out_features=6, bias=True)
)


In [5]:
#For training, the cross entropy loss combines LogSoftmax and NLLLoss in one single class
#input is expected to contain raw, unnormalized scores for each class
#target a class index in the range for each value of a 1D tensor of size minibatch
#For testing, compares the class with highest probability with the labels and counts the correct predictions
import sys
from IPython.display import Javascript  # Restrict height of output cell.

# The height cap calls the google.colab JavaScript API, so emit it only inside Colab;
# elsewhere the snippet would just produce a JavaScript error in the cell output
if 'google.colab' in sys.modules:
    display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.

def train():
    """Perform one full-batch optimisation step on the training nodes.

    Returns:
        The training loss tensor and the attention weights produced by the
        forward pass.
    """
    model.train()
    optimizer.zero_grad()  # start from clean gradients
    logits, attn_output_weights = model(data1)
    train_nodes = data.train_mask
    # Only the labelled training nodes contribute to the loss
    loss = criterion(logits[train_nodes], data.y[train_nodes])
    loss.backward()
    optimizer.step()
    return loss, attn_output_weights

@torch.no_grad()  # evaluation needs no autograd graph for the num_nodes x num_nodes attention weights
def test():
    """Evaluate the model on the test nodes.

    Returns:
        The fraction of correctly classified test nodes and the attention
        weights of the evaluation forward pass (detached from autograd).
    """
    model.eval()
    out, attn_output_weights = model(data1)
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc, attn_output_weights

# Train for 200 epochs (one full-batch step each) and log the loss of every epoch
for epoch in range(1, 201):
    loss,attn_output_weights = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    # Debugging aid, disabled: print(attn_output_weights)

<IPython.core.display.Javascript object>

Epoch: 001, Loss: 1.7938
Epoch: 002, Loss: 1.7929
Epoch: 003, Loss: 1.7911
Epoch: 004, Loss: 1.7897
Epoch: 005, Loss: 1.7937
Epoch: 006, Loss: 1.7922
Epoch: 007, Loss: 1.7896
Epoch: 008, Loss: 1.7902
Epoch: 009, Loss: 1.7926
Epoch: 010, Loss: 1.7886
Epoch: 011, Loss: 1.7932
Epoch: 012, Loss: 1.7869
Epoch: 013, Loss: 1.7948
Epoch: 014, Loss: 1.7935
Epoch: 015, Loss: 1.7851
Epoch: 016, Loss: 1.7965
Epoch: 017, Loss: 1.7934
Epoch: 018, Loss: 1.7928
Epoch: 019, Loss: 1.7899
Epoch: 020, Loss: 1.7893
Epoch: 021, Loss: 1.7888
Epoch: 022, Loss: 1.7948
Epoch: 023, Loss: 1.7890
Epoch: 024, Loss: 1.7910
Epoch: 025, Loss: 1.8000
Epoch: 026, Loss: 1.7942
Epoch: 027, Loss: 1.7910
Epoch: 028, Loss: 1.7902
Epoch: 029, Loss: 1.7930
Epoch: 030, Loss: 1.7956
Epoch: 031, Loss: 1.7911
Epoch: 032, Loss: 1.7916
Epoch: 033, Loss: 1.7913
Epoch: 034, Loss: 1.7943
Epoch: 035, Loss: 1.7922
Epoch: 036, Loss: 1.7918
Epoch: 037, Loss: 1.7906
Epoch: 038, Loss: 1.7917
Epoch: 039, Loss: 1.7938
Epoch: 040, Loss: 1.7932


In [6]:
test_acc,attn_output_weights = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.1810


In [None]:
attn_output_weights[0]

tensor([[0.0003, 0.0003, 0.0003,  ..., 0.0003, 0.0003, 0.0003],
        [0.0003, 0.0003, 0.0003,  ..., 0.0003, 0.0003, 0.0003],
        [0.0003, 0.0003, 0.0003,  ..., 0.0003, 0.0003, 0.0003],
        ...,
        [0.0003, 0.0003, 0.0003,  ..., 0.0003, 0.0003, 0.0003],
        [0.0003, 0.0003, 0.0003,  ..., 0.0003, 0.0003, 0.0003],
        [0.0003, 0.0003, 0.0003,  ..., 0.0003, 0.0003, 0.0003]],
       device='cuda:0', grad_fn=<SelectBackward0>)

Self-attention to all nodes, accuracy 18.10% (training loss 1.799).


**Applying self-attention with an attention mask (only self-loops)**

In [8]:
#Attention mask that lets every node attend only to itself:
#True blocks a (query node, key node) pair, False on the diagonal keeps the self-loop
#Sized from the data instead of a hard-coded node count
att_mask = ~torch.eye(data.num_nodes, dtype=torch.bool)
att_mask

tensor([[False,  True,  True,  ...,  True,  True,  True],
        [ True, False,  True,  ...,  True,  True,  True],
        [ True,  True, False,  ...,  True,  True,  True],
        ...,
        [ True,  True,  True,  ..., False,  True,  True],
        [ True,  True,  True,  ...,  True, False,  True],
        [ True,  True,  True,  ...,  True,  True, False]])

In [9]:
#Convert the input features dimension to the model features
#Apply self-attention using PyTorch MultiheadAttention
#Transform the encoder output to the output dimension
#Set up the model and tensors in cuda
import torch.nn
from torch.nn import Linear
import torch.nn.functional as F

class Self_Att(torch.nn.Module):
    """Linear projection -> masked multi-head self-attention -> linear classifier.

    The input is expected as ``(num_nodes, 1, in_channels)``: with the default
    layout of ``torch.nn.MultiheadAttention`` the first axis is the sequence
    (here: the nodes) and the singleton second axis is the batch.

    Args:
        in_channels: Number of input features per node. ``None`` (default)
            uses ``dataset.num_features`` of the notebook-level dataset.
        hidden_channels: Embedding size fed to the attention layer.
        num_heads: Number of attention heads.
        num_classes: Number of output classes. ``None`` (default) uses
            ``dataset.num_classes`` of the notebook-level dataset.
    """
    def __init__(self, in_channels=None, hidden_channels=120, num_heads=6, num_classes=None):
        super(Self_Att, self).__init__()
        # Fall back to the notebook-level dataset so that a bare Self_Att() keeps working
        if in_channels is None:
            in_channels = dataset.num_features
        if num_classes is None:
            num_classes = dataset.num_classes
        torch.manual_seed(12345)  # fixed seed: same initial weights on every instantiation
        self.lin1 = Linear(in_channels, hidden_channels)
        self.attention1 = torch.nn.MultiheadAttention(hidden_channels, num_heads)
        self.lin2 = Linear(hidden_channels, num_classes)

    def forward(self, x, att_mask=None):
        """Return ``(logits, attn_output_weights)`` for the node features ``x``.

        Args:
            x: Tensor of shape ``(num_nodes, 1, in_channels)``.
            att_mask: Attention mask forwarded to the attention layer as
                ``attn_mask``; ``None`` lets every node attend to all nodes.
        """
        x = self.lin1(x)
        x = F.dropout(x, p=0.8, training=self.training)
        x, attn_output_weights = self.attention1(x, x, x, attn_mask=att_mask)
        x = x.relu()
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.lin2(x)
        # Drop only the singleton batch axis; a bare squeeze() would also drop
        # the node or class axis whenever one of them happens to have size 1
        x = x.squeeze(1)
        return x, attn_output_weights


use_cuda = True  # set to False to stay on the CPU even when a GPU is available

# Use the selected device everywhere instead of calling .cuda() unconditionally,
# so the cell also runs on machines without a GPU
device = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')
print('Using device:', device)

model = Self_Att().to(device)

data = data.to(device)
data1 = data1.to(device)
att_mask = att_mask.to(device)

Using device: cuda


In [10]:
#For training, the cross entropy loss combines LogSoftmax and NLLLoss in one single class
#input is expected to contain raw, unnormalized scores for each class
#target a class index in the range for each value of a 1D tensor of size minibatch
#For testing, compares the class with highest probability with the labels and counts the correct predictions
import sys
from IPython.display import Javascript  # Restrict height of output cell.

# The height cap calls the google.colab JavaScript API, so emit it only inside Colab;
# elsewhere the snippet would just produce a JavaScript error in the cell output
if 'google.colab' in sys.modules:
    display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.

def train():
    """Perform one full-batch optimisation step using the current attention mask.

    Returns:
        The training loss tensor and the attention weights produced by the
        forward pass.
    """
    model.train()
    optimizer.zero_grad()  # start from clean gradients
    logits, attn_output_weights = model(data1, att_mask)
    train_nodes = data.train_mask
    # Only the labelled training nodes contribute to the loss
    loss = criterion(logits[train_nodes], data.y[train_nodes])
    loss.backward()
    optimizer.step()
    return loss, attn_output_weights

@torch.no_grad()  # evaluation needs no autograd graph for the num_nodes x num_nodes attention weights
def test():
    """Evaluate the masked-attention model on the test nodes.

    Returns:
        The fraction of correctly classified test nodes and the attention
        weights of the evaluation forward pass (detached from autograd).
    """
    model.eval()
    out, attn_output_weights = model(data1, att_mask)
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc, attn_output_weights

# Train for 200 epochs (one full-batch step each) and log the loss of every epoch
for epoch in range(1, 201):
    loss,attn_output_weights = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    # Debugging aid, disabled: print(attn_output_weights)

<IPython.core.display.Javascript object>

Epoch: 001, Loss: 1.7941
Epoch: 002, Loss: 1.7916
Epoch: 003, Loss: 1.7879
Epoch: 004, Loss: 1.7784
Epoch: 005, Loss: 1.7702
Epoch: 006, Loss: 1.7506
Epoch: 007, Loss: 1.7392
Epoch: 008, Loss: 1.6970
Epoch: 009, Loss: 1.6334
Epoch: 010, Loss: 1.5111
Epoch: 011, Loss: 1.3679
Epoch: 012, Loss: 1.2415
Epoch: 013, Loss: 1.0849
Epoch: 014, Loss: 0.8486
Epoch: 015, Loss: 0.7535
Epoch: 016, Loss: 0.5429
Epoch: 017, Loss: 0.4522
Epoch: 018, Loss: 0.2763
Epoch: 019, Loss: 0.2294
Epoch: 020, Loss: 0.1670
Epoch: 021, Loss: 0.1116
Epoch: 022, Loss: 0.0832
Epoch: 023, Loss: 0.0541
Epoch: 024, Loss: 0.0988
Epoch: 025, Loss: 0.0376
Epoch: 026, Loss: 0.0165
Epoch: 027, Loss: 0.0216
Epoch: 028, Loss: 0.0305
Epoch: 029, Loss: 0.0351
Epoch: 030, Loss: 0.0577
Epoch: 031, Loss: 0.0118
Epoch: 032, Loss: 0.0228
Epoch: 033, Loss: 0.0342
Epoch: 034, Loss: 0.0363
Epoch: 035, Loss: 0.1472
Epoch: 036, Loss: 0.0116
Epoch: 037, Loss: 0.1232
Epoch: 038, Loss: 0.0240
Epoch: 039, Loss: 0.1376
Epoch: 040, Loss: 0.1174


In [11]:
test_acc,attn_output_weights = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.5130


In [12]:
attn_output_weights

tensor([[[1., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 1., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.]]], device='cuda:0',
       grad_fn=<DivBackward0>)

Self-attention to only the same node, accuracy 51.30% (training loss 0.01/0.02/0.03).

**Using Transformers with an attention mask (representing the graph structure)**

In [13]:
import numpy as np
import networkx as nx
from torch_geometric.utils import to_networkx

# Build the attention mask straight from edge_index instead of going through
# networkx: nx.to_numpy_matrix was removed in NetworkX 3.0 and the dense
# float64 matrix it produced was only used to derive a boolean mask.
num_nodes = data.num_nodes
src, dst = data.edge_index.cpu()

# allowed[i, j] is True when node i may attend to node j: along every edge i -> j ...
allowed = torch.zeros(num_nodes, num_nodes, dtype=torch.bool)
allowed[src, dst] = True
# ... and always to itself (self-loop on the diagonal)
allowed.fill_diagonal_(True)

# The attention mask marks the blocked pairs, i.e. everything that is not allowed
att_mask = ~allowed
att_mask

tensor([[False,  True,  True,  ...,  True,  True,  True],
        [ True, False,  True,  ...,  True,  True,  True],
        [ True,  True, False,  ...,  True,  True,  True],
        ...,
        [ True,  True,  True,  ..., False,  True,  True],
        [ True,  True,  True,  ...,  True, False,  True],
        [ True,  True,  True,  ...,  True,  True, False]])

In [14]:
#Convert the input features dimension to the model features
#Apply self-attention using PyTorch MultiheadAttention
#Transform the encoder output to the output dimension
#Set up the model and tensors in cuda
import torch.nn
from torch.nn import Linear
import torch.nn.functional as F

class Self_Att(torch.nn.Module):
    """Linear projection -> masked multi-head self-attention -> linear classifier.

    The input is expected as ``(num_nodes, 1, in_channels)``: with the default
    layout of ``torch.nn.MultiheadAttention`` the first axis is the sequence
    (here: the nodes) and the singleton second axis is the batch.

    Args:
        in_channels: Number of input features per node. ``None`` (default)
            uses ``dataset.num_features`` of the notebook-level dataset.
        hidden_channels: Embedding size fed to the attention layer.
        num_heads: Number of attention heads.
        num_classes: Number of output classes. ``None`` (default) uses
            ``dataset.num_classes`` of the notebook-level dataset.
    """
    def __init__(self, in_channels=None, hidden_channels=120, num_heads=6, num_classes=None):
        super(Self_Att, self).__init__()
        # Fall back to the notebook-level dataset so that a bare Self_Att() keeps working
        if in_channels is None:
            in_channels = dataset.num_features
        if num_classes is None:
            num_classes = dataset.num_classes
        torch.manual_seed(12345)  # fixed seed: same initial weights on every instantiation
        self.lin1 = Linear(in_channels, hidden_channels)
        self.attention1 = torch.nn.MultiheadAttention(hidden_channels, num_heads)
        self.lin2 = Linear(hidden_channels, num_classes)

    def forward(self, x, att_mask=None):
        """Return ``(logits, attn_output_weights)`` for the node features ``x``.

        Args:
            x: Tensor of shape ``(num_nodes, 1, in_channels)``.
            att_mask: Attention mask forwarded to the attention layer as
                ``attn_mask``; ``None`` lets every node attend to all nodes.
        """
        x = self.lin1(x)
        x = F.dropout(x, p=0.8, training=self.training)
        x, attn_output_weights = self.attention1(x, x, x, attn_mask=att_mask)
        x = x.relu()
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.lin2(x)
        # Drop only the singleton batch axis; a bare squeeze() would also drop
        # the node or class axis whenever one of them happens to have size 1
        x = x.squeeze(1)
        return x, attn_output_weights


use_cuda = True  # set to False to stay on the CPU even when a GPU is available

# Use the selected device everywhere instead of calling .cuda() unconditionally,
# so the cell also runs on machines without a GPU
device = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')
print('Using device:', device)

model = Self_Att().to(device)

data = data.to(device)
data1 = data1.to(device)
att_mask = att_mask.to(device)

Using device: cuda


In [15]:
#For training, the cross entropy loss combines LogSoftmax and NLLLoss in one single class
#input is expected to contain raw, unnormalized scores for each class
#target a class index in the range for each value of a 1D tensor of size minibatch
#For testing, compares the class with highest probability with the labels and counts the correct predictions
import sys
from IPython.display import Javascript  # Restrict height of output cell.

# The height cap calls the google.colab JavaScript API, so emit it only inside Colab;
# elsewhere the snippet would just produce a JavaScript error in the cell output
if 'google.colab' in sys.modules:
    display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.

def train():
    """Perform one full-batch optimisation step using the graph-structure attention mask.

    Returns:
        The training loss tensor and the attention weights produced by the
        forward pass.
    """
    model.train()
    optimizer.zero_grad()  # start from clean gradients
    logits, attn_output_weights = model(data1, att_mask)
    train_nodes = data.train_mask
    # Only the labelled training nodes contribute to the loss
    loss = criterion(logits[train_nodes], data.y[train_nodes])
    loss.backward()
    optimizer.step()
    return loss, attn_output_weights

@torch.no_grad()  # evaluation needs no autograd graph for the num_nodes x num_nodes attention weights
def test():
    """Evaluate the masked-attention model on the test nodes.

    Returns:
        The fraction of correctly classified test nodes and the attention
        weights of the evaluation forward pass (detached from autograd).
    """
    model.eval()
    out, attn_output_weights = model(data1, att_mask)
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc, attn_output_weights

# Train for 200 epochs (one full-batch step each) and log the loss of every epoch
for epoch in range(1, 201):
    loss,attn_output_weights = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    # Debugging aid, disabled: print(attn_output_weights)

<IPython.core.display.Javascript object>

Epoch: 001, Loss: 1.7941
Epoch: 002, Loss: 1.7919
Epoch: 003, Loss: 1.7885
Epoch: 004, Loss: 1.7829
Epoch: 005, Loss: 1.7822
Epoch: 006, Loss: 1.7675
Epoch: 007, Loss: 1.7458
Epoch: 008, Loss: 1.7022
Epoch: 009, Loss: 1.6561
Epoch: 010, Loss: 1.5725
Epoch: 011, Loss: 1.4412
Epoch: 012, Loss: 1.2772
Epoch: 013, Loss: 1.0938
Epoch: 014, Loss: 0.9235
Epoch: 015, Loss: 0.7221
Epoch: 016, Loss: 0.5210
Epoch: 017, Loss: 0.3721
Epoch: 018, Loss: 0.2689
Epoch: 019, Loss: 0.2426
Epoch: 020, Loss: 0.1628
Epoch: 021, Loss: 0.0889
Epoch: 022, Loss: 0.0819
Epoch: 023, Loss: 0.0691
Epoch: 024, Loss: 0.0509
Epoch: 025, Loss: 0.0445
Epoch: 026, Loss: 0.0269
Epoch: 027, Loss: 0.0571
Epoch: 028, Loss: 0.0443
Epoch: 029, Loss: 0.0875
Epoch: 030, Loss: 0.0286
Epoch: 031, Loss: 0.0082
Epoch: 032, Loss: 0.0628
Epoch: 033, Loss: 0.0840
Epoch: 034, Loss: 0.0192
Epoch: 035, Loss: 0.1089
Epoch: 036, Loss: 0.0270
Epoch: 037, Loss: 0.0133
Epoch: 038, Loss: 0.0566
Epoch: 039, Loss: 0.0465
Epoch: 040, Loss: 0.0136


In [16]:
test_acc,attn_output_weights = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.6540


In [17]:
data.edge_index[:,0:10]

tensor([[   0,    1,    1,    1,    1,    1,    2,    3,    3,    4],
        [ 628,  158,  486, 1097, 2919, 2933, 3285, 1431, 3219,  467]],
       device='cuda:0')

In [18]:
print(attn_output_weights)

tensor([[[0.5000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.1667, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.5000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.2500, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.5000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.5000]]],
       device='cuda:0', grad_fn=<DivBackward0>)


In [19]:
def model_summary(model):
    """Print a table of the model's named parameters followed by parameter totals.

    Each row shows the parameter name, its tensor shape and its element count.
    Column widths grow with the longest entry, so long names such as
    ``attention1.in_proj_weight`` no longer push the other columns out of
    alignment; short tables keep the original 20/25/15 layout.

    Args:
        model: A ``torch.nn.Module`` whose parameters are summarised.
    """
    header = ("Layer.Parameter", "Param Tensor Shape", "Param #")
    rows = [(name, str(list(param.size())), str(param.numel()))
            for name, param in model.named_parameters()]

    # Widen each column to its longest cell, never below the original minimum width
    widths = [max(minimum, *(len(cells[col]) for cells in [header] + rows))
              for col, minimum in enumerate((20, 25, 15))]
    row_format = "{:>%d}  {:>%d} {:>%d}" % tuple(widths)

    base_rule = "----------------------------------------------------------------"
    rule = "-" * max(len(base_rule), len(row_format.format(*header)))

    print(rule)
    print(row_format.format(*header))
    print(rule)
    for row in rows:
        print(row_format.format(*row))
    print(rule)
    total_params = sum(param.numel() for param in model.parameters())
    print("Total params:", total_params)
    num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Trainable params:", num_trainable_params)
    print("Non-trainable params:", total_params - num_trainable_params)

model_summary(model)

----------------------------------------------------------------
     Layer.Parameter         Param Tensor Shape         Param #
----------------------------------------------------------------
         lin1.weight                [120, 3703]          444360
           lin1.bias                      [120]             120
attention1.in_proj_weight                 [360, 120]           43200
attention1.in_proj_bias                      [360]             360
attention1.out_proj.weight                 [120, 120]           14400
attention1.out_proj.bias                      [120]             120
         lin2.weight                   [6, 120]             720
           lin2.bias                        [6]               6
----------------------------------------------------------------
Total params: 503286
Trainable params: 503286
Non-trainable params: 0


Self-attention to the neighboring nodes, accuracy 65.40% (training loss 0.01/0.02/0.03).
Comparing this model with the GCN (see below), we obtain lower accuracy and a lower training loss (approx. 0.02 vs 0.4) with far more parameters (503286 vs 59366), probably because of overfitting. Using a dropout probability of 0.8 on attn_output_weights, we get 63.1% accuracy and a training loss of 0.1/0.2.

**The basic model, run on CUDA for comparison**

In [None]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv



class GCN(torch.nn.Module):
    """Two-layer GCNConv classifier used as the comparison baseline.

    Args:
        hidden_channels: Output size of the first convolution (input size of
            the second). Input and output sizes come from the notebook-level
            ``dataset`` (``num_features`` and ``num_classes``).
    """
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(1234567)  # fixed seed for reproducible initial weights
        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        """Return the class logits for node features ``x`` on graph ``edge_index``."""
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is only active in training mode
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv2(hidden, edge_index)

# Use the GPU when one is present and fall back to the CPU otherwise,
# instead of failing inside .cuda() on machines without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GCN(hidden_channels=16).to(device)
data = data.to(device)


In [None]:
import sys
from IPython.display import Javascript  # Restrict height of output cell.

# The height cap calls the google.colab JavaScript API, so emit it only inside Colab;
# elsewhere the snippet would just produce a JavaScript error in the cell output
if 'google.colab' in sys.modules:
    display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Perform one full-batch optimisation step of the GCN.

    Returns:
        The training loss tensor.
    """
    model.train()
    optimizer.zero_grad()  # start from clean gradients
    logits = model(data.x, data.edge_index)
    train_nodes = data.train_mask
    # Only the labelled training nodes contribute to the loss
    loss = criterion(logits[train_nodes], data.y[train_nodes])
    loss.backward()
    optimizer.step()
    return loss

@torch.no_grad()  # evaluation needs no autograd graph
def test():
    """Evaluate the GCN on the test nodes.

    Returns:
        The fraction of correctly classified test nodes.
    """
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc


# Train the GCN for 200 epochs (one full-batch step each) and log the loss of every epoch
for epoch in range(1, 201):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Loss: 1.7915
Epoch: 002, Loss: 1.7875
Epoch: 003, Loss: 1.7823
Epoch: 004, Loss: 1.7752
Epoch: 005, Loss: 1.7683
Epoch: 006, Loss: 1.7605
Epoch: 007, Loss: 1.7525
Epoch: 008, Loss: 1.7423
Epoch: 009, Loss: 1.7387
Epoch: 010, Loss: 1.7282
Epoch: 011, Loss: 1.7168
Epoch: 012, Loss: 1.7133
Epoch: 013, Loss: 1.6932
Epoch: 014, Loss: 1.6868
Epoch: 015, Loss: 1.6759
Epoch: 016, Loss: 1.6629
Epoch: 017, Loss: 1.6535
Epoch: 018, Loss: 1.6446
Epoch: 019, Loss: 1.6305
Epoch: 020, Loss: 1.6317
Epoch: 021, Loss: 1.6121
Epoch: 022, Loss: 1.6036
Epoch: 023, Loss: 1.5931
Epoch: 024, Loss: 1.5798
Epoch: 025, Loss: 1.5575
Epoch: 026, Loss: 1.5395
Epoch: 027, Loss: 1.5365
Epoch: 028, Loss: 1.5098
Epoch: 029, Loss: 1.4833
Epoch: 030, Loss: 1.4816
Epoch: 031, Loss: 1.4661
Epoch: 032, Loss: 1.4568
Epoch: 033, Loss: 1.4259
Epoch: 034, Loss: 1.4174
Epoch: 035, Loss: 1.4302
Epoch: 036, Loss: 1.4050
Epoch: 037, Loss: 1.3624
Epoch: 038, Loss: 1.3600
Epoch: 039, Loss: 1.3464
Epoch: 040, Loss: 1.2972


In [None]:
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.7090


In [None]:
def model_summary(model):
    """Print a table of the model's named parameters followed by parameter totals.

    Each row shows the parameter name, its tensor shape and its element count.
    Column widths grow with the longest entry so that long parameter names do
    not push the other columns out of alignment; short tables keep the
    original 20/25/15 layout.

    Args:
        model: A ``torch.nn.Module`` whose parameters are summarised.
    """
    header = ("Layer.Parameter", "Param Tensor Shape", "Param #")
    rows = [(name, str(list(param.size())), str(param.numel()))
            for name, param in model.named_parameters()]

    # Widen each column to its longest cell, never below the original minimum width
    widths = [max(minimum, *(len(cells[col]) for cells in [header] + rows))
              for col, minimum in enumerate((20, 25, 15))]
    row_format = "{:>%d}  {:>%d} {:>%d}" % tuple(widths)

    base_rule = "----------------------------------------------------------------"
    rule = "-" * max(len(base_rule), len(row_format.format(*header)))

    print(rule)
    print(row_format.format(*header))
    print(rule)
    for row in rows:
        print(row_format.format(*row))
    print(rule)
    total_params = sum(param.numel() for param in model.parameters())
    print("Total params:", total_params)
    num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Trainable params:", num_trainable_params)
    print("Non-trainable params:", total_params - num_trainable_params)

model_summary(model)

----------------------------------------------------------------
     Layer.Parameter         Param Tensor Shape         Param #
----------------------------------------------------------------
          conv1.bias                       [16]              16
    conv1.lin.weight                 [16, 3703]           59248
          conv2.bias                        [6]               6
    conv2.lin.weight                    [6, 16]              96
----------------------------------------------------------------
Total params: 59366
Trainable params: 59366
Non-trainable params: 0
