In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

# Build the Planetoid dataset named "Cora", rooted at data/Cora.
dataset = Planetoid(name="Cora", root="data/Cora")

# Use the first entry of the dataset for everything that follows.
data = dataset[0]

# Define a 2-layer GCN
class GCN(nn.Module):
    """Two stacked ``GCNConv`` layers with a ReLU in between.

    ``forward`` returns the log-softmax (over dimension 1) of the second
    layer's output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two convolutions: input -> hidden and hidden -> output."""
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Apply both convolutions using ``data.x`` and ``data.edge_index``."""
        edge_index = data.edge_index
        hidden = torch.relu(self.conv1(data.x, edge_index))
        scores = self.conv2(hidden, edge_index)
        return torch.log_softmax(scores, dim=1)

# Set up the 2-layer GCN (sized from the dataset), the Adam optimizer and the loss.
model = GCN(
    input_dim=dataset.num_node_features,
    hidden_dim=16,
    output_dim=dataset.num_classes,
)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Optimise for 100 epochs; each epoch performs exactly one optimizer step.
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # Run the model on `data`, then compute the loss on the training-mask rows only.
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Report the training loss on every tenth epoch (0, 10, ..., 90).
    if not epoch % 10:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

print('Training complete!')


Epoch 0, Loss: 1.9408648014068604
Epoch 10, Loss: 0.5527430176734924
Epoch 20, Loss: 0.09263462573289871
Epoch 30, Loss: 0.020280536264181137
Epoch 40, Loss: 0.007526488043367863
Epoch 50, Loss: 0.004197495058178902
Epoch 60, Loss: 0.002971513429656625
Epoch 70, Loss: 0.0023829475976526737
Epoch 80, Loss: 0.002031512325629592
Epoch 90, Loss: 0.0017838183557614684
Training complete!


## Explanation:
GCN aggregates features from a node’s neighbors using graph convolutions. This allows the network to learn representations based on both node features and graph structure.
The Cora dataset is used to classify nodes into one of 7 research topics.

## Questions (1 point each):

1. What would happen if we added more GCN layers (e.g., 3 layers instead of 2)? How would this affect over-smoothing?
2. What would happen if we used a larger hidden dimension (e.g., 64 instead of 16)? How would this impact the model's capacity?
3. What would happen if we replaced ReLU activation with a sigmoid function? Would the performance change?

4. What would happen if we trained on only 10% of the nodes and tested on the remaining 90%? How would the performance be affected?
5. What would happen if we used a different optimizer (e.g., RMSprop) instead of Adam? Would it affect the convergence speed?

Extra credit: 
1. What would happen if we used edge weights (non-binary) in the adjacency matrix? How would it affect message passing?
2. What would happen if we removed the log-softmax function in the output layer? Would the loss function still work correctly?

## No points, just for you to think about:
1. What would happen if we applied dropout to the node features during training? How would it affect the model’s generalization?
2. What would happen if we used mean-pooling instead of summing the messages in the GCN layers?
3. What would happen if we pre-trained the node features using a different algorithm, like Node2Vec, before feeding them into the GCN?


## **Answers To Questions**

1. If we added more GCN layers, the complexity of the model would increase and the model would be more prone to overfitting, so we would expect a considerable decrease in the training loss. Because each additional layer aggregates features from one more hop of neighbors, a deeper model would also become more prone to over-smoothing, where the representations of different nodes become increasingly similar.

In [7]:
# Build the Planetoid dataset named "Cora", rooted at data/Cora.
dataset = Planetoid(name="Cora", root="data/Cora")

# Use the first entry of the dataset for this experiment.
data = dataset[0]

# Define a 3-layer GCN
class GCN3(nn.Module):
    """Three stacked ``GCNConv`` layers with a ReLU after the first two.

    Args:
        input_dim: Input size of the first convolution.
        hidden_dim: Output size of the second convolution (input of the third).
        output_dim: Output size of the third convolution.
        int_dim: Output size of the first convolution (input of the second).
            Defaults to 20, the value that used to be hard-coded in
            ``__init__``, so existing three-argument calls behave as before.

    ``forward`` returns the log-softmax (over dimension 1) of the third
    layer's output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, int_dim=20):
        super().__init__()
        # Layer widths: input_dim -> int_dim -> hidden_dim -> output_dim.
        self.conv1 = GCNConv(input_dim, int_dim)
        self.conv2 = GCNConv(int_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Apply the three convolutions using ``data.x`` and ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.relu(self.conv2(x, edge_index))
        x = self.conv3(x, edge_index)
        return torch.log_softmax(x, dim=1)

# Set up the 3-layer GCN (sized from the dataset), the Adam optimizer and the loss.
model = GCN3(
    input_dim=dataset.num_node_features,
    hidden_dim=16,
    output_dim=dataset.num_classes,
)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Optimise for 100 epochs; each epoch performs exactly one optimizer step.
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # Run the model on `data`, then compute the loss on the training-mask rows only.
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Report the training loss on every tenth epoch (0, 10, ..., 90).
    if not epoch % 10:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

print('Training complete!')


Epoch 0, Loss: 1.947688102722168
Epoch 10, Loss: 0.5603798031806946
Epoch 20, Loss: 0.03717956691980362
Epoch 30, Loss: 0.004519958049058914
Epoch 40, Loss: 0.0011941419215872884
Epoch 50, Loss: 0.0005936055094935
Epoch 60, Loss: 0.0004178214294370264
Epoch 70, Loss: 0.0003418547858018428
Epoch 80, Loss: 0.0003000610158778727
Epoch 90, Loss: 0.0002725078084040433
Training complete!


2. Increasing the hidden dimension will increase the model's capacity. Providing the GCN model with more hidden features will allow it to learn more patterns; however, it will also become more prone to overfitting. It will also require more calculations, increasing the computational and memory cost.

In [8]:
# Same 2-layer GCN as before, but with a hidden dimension of 64 instead of 16.
model = GCN(
    input_dim=dataset.num_node_features,
    hidden_dim=64,
    output_dim=dataset.num_classes,
)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Optimise for 100 epochs; each epoch performs exactly one optimizer step.
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # Run the model on `data`, then compute the loss on the training-mask rows only.
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Report the training loss on every tenth epoch (0, 10, ..., 90).
    if not epoch % 10:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

print('Training complete!')


Epoch 0, Loss: 1.9554576873779297
Epoch 10, Loss: 0.08689606934785843
Epoch 20, Loss: 0.003936638589948416
Epoch 30, Loss: 0.0007678308174945414
Epoch 40, Loss: 0.0003390878264326602
Epoch 50, Loss: 0.00022610061569139361
Epoch 60, Loss: 0.00018675034516490996
Epoch 70, Loss: 0.00016744981985539198
Epoch 80, Loss: 0.000155505578732118
Epoch 90, Loss: 0.00014664324407931417
Training complete!


3. In the code below, the sigmoid activation function is used between the GCN layers. Compared with the model using the ReLU activation function, the performance does change: the training loss is higher at every logged epoch.

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

# Build the Planetoid dataset named "Cora", rooted at data/Cora.
dataset = Planetoid(name="Cora", root="data/Cora")

# Use the first entry of the dataset for this experiment.
data = dataset[0]

# Define a 2-layer GCN
class GCNSig(nn.Module):
    """Two stacked ``GCNConv`` layers with a sigmoid in between.

    ``forward`` returns the log-softmax (over dimension 1) of the second
    layer's output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two convolutions: input -> hidden and hidden -> output."""
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Apply both convolutions using ``data.x`` and ``data.edge_index``."""
        edge_index = data.edge_index
        hidden = torch.sigmoid(self.conv1(data.x, edge_index))
        scores = self.conv2(hidden, edge_index)
        return torch.log_softmax(scores, dim=1)

# Set up the sigmoid variant of the GCN, the Adam optimizer and the loss.
model = GCNSig(
    input_dim=dataset.num_node_features,
    hidden_dim=16,
    output_dim=dataset.num_classes,
)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Optimise for 100 epochs; each epoch performs exactly one optimizer step.
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # Run the model on `data`, then compute the loss on the training-mask rows only.
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Report the training loss on every tenth epoch (0, 10, ..., 90).
    if not epoch % 10:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

print('Training complete!')


Epoch 0, Loss: 2.048659324645996
Epoch 10, Loss: 1.404330849647522
Epoch 20, Loss: 0.9214096069335938
Epoch 30, Loss: 0.5692194700241089
Epoch 40, Loss: 0.34371981024742126
Epoch 50, Loss: 0.21529003977775574
Epoch 60, Loss: 0.14460553228855133
Epoch 70, Loss: 0.10453751683235168
Epoch 80, Loss: 0.08032123744487762
Epoch 90, Loss: 0.0645800456404686
Training complete!


4. If we trained on only 10% of the nodes and tested on the remaining 90%, we should expect a decrease in performance and a possibility of underfitting. Because the model would have little labeled data to learn from, it would struggle to generalize to unseen nodes and to handle complex cases, which would result in an increase in the test loss.

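5. Based on the result yielded by the code below, using a different optimizer such as RMSprop instead of Adam does not prevent convergence: we still see a consistent decrease in the training loss, indicating that the optimal value is being approached. In this run RMSprop reached a low loss sooner (about 0.044 at epoch 10, versus about 0.553 for Adam), although by epoch 90 both optimizers are at a similar level (about 0.002).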

In [33]:
# Same 2-layer GCN as the baseline, but optimised with RMSprop instead of Adam.
model = GCN(
    input_dim=dataset.num_node_features,
    hidden_dim=16,
    output_dim=dataset.num_classes,
)
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Optimise for 100 epochs; each epoch performs exactly one optimizer step.
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # Run the model on `data`, then compute the loss on the training-mask rows only.
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Report the training loss on every tenth epoch (0, 10, ..., 90).
    if not epoch % 10:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

print('Training complete!')

Epoch 0, Loss: 1.9566665887832642
Epoch 10, Loss: 0.04365529119968414
Epoch 20, Loss: 0.01677924394607544
Epoch 30, Loss: 0.009819929488003254
Epoch 40, Loss: 0.006707995664328337
Epoch 50, Loss: 0.00497392937541008
Epoch 60, Loss: 0.003880862146615982
Epoch 70, Loss: 0.003134372178465128
Epoch 80, Loss: 0.002595312427729368
Epoch 90, Loss: 0.002189916791394353
Training complete!


Extra credit:

1. What would happen if we used edge weights (non-binary) in the adjacency matrix? How would it affect message passing?
2. What would happen if we removed the log-softmax function in the output layer? Would the loss function still work correctly?

**Solve the first question**

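2. In the code below, when we remove the log-softmax function in the output layer, the loss function still works correctly, as we can see the training loss decrease across the logged epochs. This is expected because `nn.CrossEntropyLoss` applies log-softmax internally, so it accepts the raw output scores directly. One difference is that the loss value decreases faster than in the first run, which could indicate that the model is at risk of overfitting; note, however, that this cell uses RMSprop rather than Adam, so the optimizer also differs from the first run.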

In [35]:
# Extra-credit experiment: train a 2-layer GCN whose forward pass returns the
# raw output of the second convolution (no log-softmax).
#
# The class must be defined BEFORE the model is built. Previously `model` was
# created first, so it was still an instance of the earlier `GCN` (with
# log-softmax) and the variant below was never trained.
class GCN(nn.Module):
    """Two stacked ``GCNConv`` layers with a ReLU in between and no log-softmax.

    Note: this rebinds the name ``GCN`` for the rest of the notebook, as the
    original cell did.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two convolutions: input -> hidden and hidden -> output."""
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Apply both convolutions and return the unnormalised output scores."""
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x


# Initialize model, optimizer, and loss function.
# This cell uses RMSprop, so compare its losses with the earlier RMSprop run
# rather than with the Adam baseline.
model = GCN(input_dim=dataset.num_node_features, hidden_dim=16, output_dim=dataset.num_classes)
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # Forward pass; the loss is computed on the training-mask rows only.
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

print("Training complete!")

Epoch 0, Loss: 1.9403178691864014
Epoch 10, Loss: 0.05999862030148506
Epoch 20, Loss: 0.022603914141654968
Epoch 30, Loss: 0.013168002478778362
Epoch 40, Loss: 0.00894375704228878
Epoch 50, Loss: 0.006597674917429686
Epoch 60, Loss: 0.005127208773046732
Epoch 70, Loss: 0.004125987645238638
Epoch 80, Loss: 0.003404885996133089
Epoch 90, Loss: 0.002864376874640584
Training complete!
