In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

# Fetch the Cora dataset into data/Cora (files are downloaded on first use)
dataset = Planetoid(name='Cora', root='data/Cora')

# Take the first entry of the dataset as the graph used below
data = dataset[0]

# Baseline model: two graph convolutions
class GCN(nn.Module):
    """Two stacked GCNConv layers with a ReLU in between.

    The forward pass returns per-node log-probabilities (log-softmax taken
    over dim 1 of the second convolution's output).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # input_dim -> hidden_dim -> output_dim
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        # `data` must expose the node features as `.x` and the edges as `.edge_index`
        features = data.x
        edges = data.edge_index

        hidden = self.conv1(features, edges).relu()
        scores = self.conv2(hidden, edges)
        return scores.log_softmax(dim=1)

# Seed PyTorch's random number generator before any model is created, so that
# randomly initialised weights (and therefore the losses printed below) are
# the same on every Restart Kernel -> Run All.
torch.manual_seed(0)

# Initialize model, optimizer, and loss function
model = GCN(input_dim=dataset.num_node_features, hidden_dim=16, output_dim=dataset.num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.01)
# NOTE: GCN.forward already returns log-probabilities. CrossEntropyLoss applies
# log-softmax to its input again, which leaves log-probabilities unchanged, so
# the loss value is the same as it would be on raw logits.
criterion = nn.CrossEntropyLoss()

# Training loop: 100 full-graph passes; the loss is computed only on the nodes
# selected by data.train_mask
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Log the training loss every 10th epoch
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

print("Training complete!")


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


Epoch 0, Loss: 1.9427908658981323
Epoch 10, Loss: 0.47523432970046997
Epoch 20, Loss: 0.07317931950092316
Epoch 30, Loss: 0.016904689371585846
Epoch 40, Loss: 0.00686116237193346
Epoch 50, Loss: 0.0040501197800040245
Epoch 60, Loss: 0.0029643250163644552
Epoch 70, Loss: 0.002416849136352539
Epoch 80, Loss: 0.0020750986877828836
Epoch 90, Loss: 0.0018283732933923602
Training complete!


## Explanation:
GCN aggregates features from a node’s neighbors using graph convolutions. This allows the network to learn representations based on both node features and graph structure.
The Cora dataset is used to classify nodes into one of 7 research topics.

## Questions (1 point each):

1. What would happen if we added more GCN layers (e.g., 3 layers instead of 2)? How would this affect over-smoothing?
2. What would happen if we used a larger hidden dimension (e.g., 64 instead of 16)? How would this impact the model's capacity?
3. What would happen if we replaced ReLU activation with a sigmoid function? Would the performance change?

4. What would happen if we trained on only 10% of the nodes and tested on the remaining 90%? How would the performance be affected?
5. What would happen if we used a different optimizer (e.g., RMSprop) instead of Adam? Would it affect the convergence speed?

Extra credit: 
1. What would happen if we used edge weights (non-binary) in the adjacency matrix? How would it affect message passing?
2. What would happen if we removed the log-softmax function in the output layer? Would the loss function still work correctly?

## No points, just for you to think about:
1. What would happen if we applied dropout to the node features during training? How would it affect the model’s generalization?
2. What would happen if we used mean-pooling instead of summing the messages in the GCN layers?
3. What would happen if we pre-trained the node features using a different algorithm, like Node2Vec, before feeding them into the GCN?


**Answers To Questions**

1. If we added more GCN layers, the complexity of the model would increase and the model would be more prone to overfitting, so we would expect a considerable decrease in the training loss. Each additional layer also aggregates features from neighbors one hop further away, so the node representations become increasingly similar to one another; this makes the model more prone to over-smoothing.

In [2]:
# Re-create the Cora dataset object (same root directory and name as before)
dataset = Planetoid(name='Cora', root='data/Cora')

# Take the first entry of the dataset as the graph used below
data = dataset[0]

# Define a 3-layer GCN
class GCN3(nn.Module):
    """Three stacked GCNConv layers with ReLU between them and a log-softmax output.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Output size of the second convolution.
        output_dim: Output size of the last convolution.
        int_dim: Output size of the first (intermediate) convolution. Defaults
            to 20, the value that used to be hard-coded, so existing calls
            behave exactly as before.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, int_dim=20):
        super(GCN3, self).__init__()
        # input_dim -> int_dim -> hidden_dim -> output_dim
        self.conv1 = GCNConv(input_dim, int_dim)
        self.conv2 = GCNConv(int_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return per-node log-probabilities for the graph held in `data`.

        `data` must expose the node features as `.x` and the edges as
        `.edge_index`.
        """
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        x = torch.relu(x)
        x = self.conv3(x, edge_index)
        return torch.log_softmax(x, dim=1)

# Set up the 3-layer model together with its optimizer and loss
model = GCN3(
    input_dim=dataset.num_node_features,
    hidden_dim=16,
    output_dim=dataset.num_classes,
)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Train for 100 epochs on the whole graph; only nodes in train_mask
# contribute to the loss
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    out = model(data)
    train_mask = data.train_mask
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch
    if not epoch % 10:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

print("Training complete!")


Epoch 0, Loss: 1.9482845067977905
Epoch 10, Loss: 0.7804139256477356
Epoch 20, Loss: 0.092952661216259
Epoch 30, Loss: 0.007593801245093346
Epoch 40, Loss: 0.001376834698021412
Epoch 50, Loss: 0.0005857640644535422
Epoch 60, Loss: 0.00037658034125342965
Epoch 70, Loss: 0.00029861353687010705
Epoch 80, Loss: 0.0002593178942333907
Epoch 90, Loss: 0.0002342205261811614
Training complete!


2. Increasing the hidden dimension will increase the model's capacity. Providing the GCN model with more hidden features allows it to learn more patterns; however, it also makes the model more prone to overfitting. It also requires more calculations, increasing the computational and memory cost.

In [3]:
# Same 2-layer GCN as the baseline, but with a hidden width of 64
model = GCN(
    input_dim=dataset.num_node_features,
    hidden_dim=64,
    output_dim=dataset.num_classes,
)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Train for 100 epochs; the loss is restricted to the nodes in train_mask
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    out = model(data)
    train_mask = data.train_mask
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()

    # Print the training loss on epochs 0, 10, 20, ...
    if not epoch % 10:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

print("Training complete!")


Epoch 0, Loss: 1.9578262567520142
Epoch 10, Loss: 0.07604345679283142
Epoch 20, Loss: 0.0030139554291963577
Epoch 30, Loss: 0.000520452274940908
Epoch 40, Loss: 0.00023175829846877605
Epoch 50, Loss: 0.0001611936022527516
Epoch 60, Loss: 0.00013470533303916454
Epoch 70, Loss: 0.00012161435006419197
Epoch 80, Loss: 0.0001135545753641054
Epoch 90, Loss: 0.00010755476978374645
Training complete!


3. In the code below, the sigmoid activation function is used between the GCN layers. Compared with the model using the ReLU activation function, the model's performance does change: the training loss is higher at every logged epoch.

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

# Re-create the Cora dataset object (same root directory and name as before)
dataset = Planetoid(name='Cora', root='data/Cora')

# Take the first entry of the dataset as the graph used below
data = dataset[0]

# 2-layer GCN that uses a sigmoid (instead of ReLU) between the layers
class GCNSig(nn.Module):
    """Two stacked GCNConv layers with a sigmoid activation in between.

    The forward pass returns per-node log-probabilities (log-softmax taken
    over dim 1 of the second convolution's output).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # input_dim -> hidden_dim -> output_dim
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        # `data` must expose the node features as `.x` and the edges as `.edge_index`
        features = data.x
        edges = data.edge_index

        hidden = self.conv1(features, edges).sigmoid()
        scores = self.conv2(hidden, edges)
        return scores.log_softmax(dim=1)

# Set up the sigmoid variant together with its optimizer and loss
model = GCNSig(
    input_dim=dataset.num_node_features,
    hidden_dim=16,
    output_dim=dataset.num_classes,
)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Train for 100 epochs; the loss is restricted to the nodes in train_mask
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    out = model(data)
    train_mask = data.train_mask
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()

    # Print the training loss on epochs 0, 10, 20, ...
    if not epoch % 10:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

print("Training complete!")


Epoch 0, Loss: 2.0807044506073
Epoch 10, Loss: 1.443735957145691
Epoch 20, Loss: 0.9945052266120911
Epoch 30, Loss: 0.6488593816757202
Epoch 40, Loss: 0.4072495400905609
Epoch 50, Loss: 0.2584472596645355
Epoch 60, Loss: 0.17290417850017548
Epoch 70, Loss: 0.1236301138997078
Epoch 80, Loss: 0.0938212051987648
Epoch 90, Loss: 0.07458357512950897
Training complete!


4. If we trained on only 10% of the nodes and tested on the remaining 90%, we should expect a decrease in performance and a possibility of underfitting. As the model will not have much labeled data to work with, it will not be able to generalize well and will be unable to handle complex scenarios. This will result in an increase in the test loss.

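5. Based on the results of the code below, using a different optimizer such as RMSprop instead of Adam does not substantially change the final result: the training loss decreases consistently and reaches a similar value by epoch 90 (about 0.0016 with RMSprop versus 0.0018 with Adam), indicating that the optimum is being approached in both cases. In this run, however, RMSprop reduces the training loss faster in the early epochs (about 0.048 versus 0.475 at epoch 10).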

In [5]:
# Baseline 2-layer GCN again, this time optimised with RMSprop instead of Adam
model = GCN(
    input_dim=dataset.num_node_features,
    hidden_dim=16,
    output_dim=dataset.num_classes,
)
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Train for 100 epochs; the loss is restricted to the nodes in train_mask
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    out = model(data)
    train_mask = data.train_mask
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()

    # Print the training loss on epochs 0, 10, 20, ...
    if not epoch % 10:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

print("Training complete!")

Epoch 0, Loss: 1.9432449340820312
Epoch 10, Loss: 0.04805777594447136
Epoch 20, Loss: 0.018372977152466774
Epoch 30, Loss: 0.009083529934287071
Epoch 40, Loss: 0.00571472616866231
Epoch 50, Loss: 0.003999243024736643
Epoch 60, Loss: 0.0029982649721205235
Epoch 70, Loss: 0.002368733985349536
Epoch 80, Loss: 0.0019366717897355556
Epoch 90, Loss: 0.0016226336592808366
Training complete!


**Answers to Extra Credit**

1. Using (non-binary) edge weights in the adjacency matrix would change the strength of each connection between nodes, and therefore how much each neighbor's message contributes during message passing. This is especially relevant in a social network, where some people have more influence on others, so their connections could hold a higher edge weight than those of their neighboring nodes.

2. In the code below, when we remove the log-softmax function in the output layer, the loss function still seems to work correctly, as we can see the training loss decrease in each iteration. This is expected because `nn.CrossEntropyLoss` applies log-softmax to its input internally, so it accepts raw (unnormalized) scores. One difference is that the loss value decreases faster, indicating that the model is at risk of overfitting.

In [6]:
# 2-layer GCN WITHOUT the final log-softmax.
# The class is defined before the model is instantiated: previously the model
# was created first, so it was still an instance of the earlier log-softmax
# `GCN` and the experiment did not test what it claimed to. The class also gets
# its own name (like GCN3 / GCNSig) so the baseline `GCN` is not shadowed.
class GCNLogits(nn.Module):
    """Two stacked GCNConv layers with a ReLU in between, returning raw scores.

    Unlike `GCN`, the forward pass returns the second convolution's output
    directly, with no log-softmax applied.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCNLogits, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

# Initialize model, optimizer, and loss function
model = GCNLogits(input_dim=dataset.num_node_features, hidden_dim=16, output_dim=dataset.num_classes)
# RMSprop with lr=0.01, i.e. the same optimizer settings as the RMSprop experiment
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
# CrossEntropyLoss applies log-softmax to its input itself, so it is fed the
# model's raw scores here
criterion = nn.CrossEntropyLoss()

# Training loop: 100 full-graph passes; the loss is computed only on the nodes
# selected by data.train_mask
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Log the training loss every 10th epoch
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

print("Training complete!")

Epoch 0, Loss: 1.936279535293579
Epoch 10, Loss: 0.03836328163743019
Epoch 20, Loss: 0.013678977265954018
Epoch 30, Loss: 0.007702929433435202
Epoch 40, Loss: 0.005134463310241699
Epoch 50, Loss: 0.0037412787787616253
Epoch 60, Loss: 0.002883134176954627
Epoch 70, Loss: 0.002308470197021961
Epoch 80, Loss: 0.0018990060780197382
Epoch 90, Loss: 0.0015943074831739068
Training complete!
