# Node Classification

## 1. Import necessary packages

In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.4.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.4.0


In [2]:
!pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.1.2-py3-none-any.whl (776 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.10.0 pytorch_lightning-2.1.2 torchmetrics-1.2.1


In [3]:
import os
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
import torch

import torch.optim as optim
import torch_geometric
import torch_geometric.nn as geom_nn
import torch_geometric.data as geom_data

## 2. Download the Cora Dataset

In [4]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
CHECKPOINT_PATH = "saved_models"
gnn_layer_by_name = {"GCN": geom_nn.GCNConv}

I. Download the dataset

In [5]:
cora_dataset = torch_geometric.datasets.Planetoid(root="data", name="Cora")

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


II. Visualize the dataset

In [6]:
cora_dataset[0]

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

# Model Construction

## 1. GNN Model

Inputs description: c_in: Dimension of input features; c_hidden: Dimension of hidden features; c_out: Dimension of the output features. num_layers: Number of "hidden" graph layers; layer_name: String of the graph layer to use; dp_rate:  Dropout rate to apply throughout the network; kwargs: Additional arguments for the graph layer (e.g. number of heads for GAT)
            
            
            
            

In [7]:
class GNNModel(nn.Module):
    def __init__(self, c_in, c_hidden, c_out, num_layers=2, layer_name="GCN", dp_rate=0.1, **kwargs):
        super(GNNModel, self).__init__()

        self.gnn_layer = gnn_layer_by_name[layer_name]
        self.layers = nn.ModuleList()

        in_channels, out_channels = c_in, c_hidden
        for l_idx in range(num_layers-1):
            self.layers.append(self.gnn_layer(in_channels=in_channels, out_channels=out_channels, **kwargs))
            self.layers.append(nn.ReLU(inplace=True))
            self.layers.append(nn.Dropout(dp_rate))
            in_channels = c_hidden

        self.layers.append(self.gnn_layer(in_channels=in_channels, out_channels=c_out, **kwargs))

    def forward(self, x, edge_index):
        for l in self.layers:
            if isinstance(l, geom_nn.MessagePassing):
                x = l(x, edge_index) # Add the "edge_index" tensor as additional input for graph layers
            else:
                x = l(x)
        return x

## 2. MLP Model

Inputs description: c_in: Dimension of input features; c_hidden: Dimension of hidden features; c_out: Dimension of the output features; num_layers: Number of hidden layers; dp_rate: Dropout rate to apply throughout the network
            
            
            
            
            

In [8]:
class MLPModel(nn.Module):
    def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.1):
        super(MLPModel, self).__init__()

        self.layers = nn.ModuleList()
        in_channels, out_channels = c_in, c_hidden
        for l_idx in range(num_layers-1):
            self.layers.append(nn.Linear(in_channels, out_channels))
            self.layers.append(nn.ReLU(inplace=True))
            self.layers.append(nn.Dropout(dp_rate))
            in_channels = c_hidden

        self.layers.append(nn.Linear(in_channels, c_out))

    def forward(self, x, *args, **kwargs):
        for layer in self.layers:
            x = layer(x) # x: Input features per node
        return x

## 3. Merge the models into a PyTorch Lightning module and train a node classifier

In [9]:
class Node_GNN(pl.LightningModule):
    def __init__(self, model_name, **model_kwargs):
        super(Node_GNN, self).__init__()
        self.save_hyperparameters()

        if model_name == "MLP":
            self.model = MLPModel(**model_kwargs)
        else:
            self.model = GNNModel(**model_kwargs)
        self.loss_module = nn.CrossEntropyLoss()

    def forward(self, data, mode="train"):
        x, edge_index = data.x, data.edge_index
        x = self.model(x, edge_index)

        if mode == "train":
            mask = data.train_mask
        elif mode == "val":
            mask = data.val_mask
        elif mode == "test":
            mask = data.test_mask
        else:
            raise ValueError(f"Unknown forward mode: {mode}")

        loss = self.loss_module(x[mask], data.y[mask])
        acc = (x[mask].argmax(dim=-1) == data.y[mask]).sum().float() / mask.sum()
        return loss, acc

    def configure_optimizers(self):
        # Use Adam or SGD optimizer
        optimizer = optim.Adam(self.parameters(), lr=0.1)
        # optimizer = optim.SGD(self.parameters(), lr=0.1, momentum=0.9, weight_decay=2e-3)
        return optimizer

    def training_step(self, batch, batch_idx):
        loss, acc = self.forward(batch, mode="train")
        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss

    def validation_step(self, batch, batch_idx):
        _, acc = self.forward(batch, mode="val")
        self.log('val_acc', acc)

    def test_step(self, batch, batch_idx):
        _, acc = self.forward(batch, mode="test")
        self.log('test_acc', acc)

In [10]:
def train_node_classifier(model_name, dataset, **model_kwargs):
    pl.seed_everything(6100)
    node_data_loader = geom_data.DataLoader(dataset, batch_size=1)

    # Create a PyTorch Lightning trainer with the generation callback
    root_dir = os.path.join(CHECKPOINT_PATH, "Node" + model_name)
    os.makedirs(root_dir, exist_ok=True)
    # False enable_progress_bar means that epoch size is 1
    trainer = pl.Trainer(default_root_dir=root_dir, callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")], accelerator="gpu" if str(device).startswith("cuda") else "cpu", devices=1, max_epochs=200, enable_progress_bar=False)

    # NO need of optional logging argument
    trainer.logger._default_hp_metric = None

    # If pretrained model exists, then load it and skip the training process
    pretrained_filename = os.path.join(CHECKPOINT_PATH, f"Node{model_name}.ckpt")
    if os.path.isfile(pretrained_filename):
        print("Found pretrained model, loading...")
        model = Node_GNN.load_from_checkpoint(pretrained_filename)
    else:
        pl.seed_everything()
        model = Node_GNN(model_name=model_name, c_in=dataset.num_node_features, c_out=dataset.num_classes, **model_kwargs)
        trainer.fit(model, node_data_loader, node_data_loader)
        model = Node_GNN.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # Find better model for the test set
    test_result = trainer.test(model, node_data_loader, verbose=False)
    batch = next(iter(node_data_loader))
    batch = batch.to(model.device)
    _, train_acc = model.forward(batch, mode="train")
    _, val_acc = model.forward(batch, mode="val")
    result = {"train": train_acc, "val": val_acc, "test": test_result[0]['test_acc']}
    return model, result

## 4. Print the results

In [11]:
# Print the results
def print_results(result_dict):
    if "train" in result_dict:
        train_acc = (100.0 * result_dict['train'])
        print(f"Train accuracy: {train_acc:4.2f}%")
    if "val" in result_dict:
        val_acc = (100.0 * result_dict['val'])
        print(f"Validation accuracy: {val_acc:4.2f}%")
    test_acc = (100.0 * result_dict['test'])
    print(f"Test accuracy: {test_acc:4.2f}%")

In [12]:
node_mlp_model, node_mlp_result = train_node_classifier(model_name="MLP", dataset=cora_dataset, c_hidden=16, num_layers=2, dp_rate=0.1)

print_results(node_mlp_result)

INFO:lightning_fabric.utilities.seed:Seed set to 6100
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:lightning_fabric.utilities.seed:Seed set to 6100
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name        | Type             | Params
-------------------------------------------------
0 | model       | MLPModel         | 23.1 K
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
23.1 K    Trainable params
0         Non-trainable params
23.1 K    Total params
0.092     Total estimated model params size (MB)
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connector

Train accuracy: 98.57%
Validation accuracy: 50.20%
Test accuracy: 52.50%


/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


In [13]:
node_gnn_model, node_gnn_result = train_node_classifier(model_name="GNN", layer_name="GCN", dataset=cora_dataset, c_hidden=16, num_layers=2, dp_rate=0.1)

print_results(node_gnn_result)

INFO:lightning_fabric.utilities.seed:Seed set to 6100
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:lightning_fabric.utilities.seed:Seed set to 6100
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name        | Type             | Params
-------------------------------------------------
0 | model       | GNNModel         | 23.1 K
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
23.1 K    Trainable params
0         Non-trainable params
23.1 K    Total params
0.092     Total estimated model params size (MB)
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epoc

Train accuracy: 100.00%
Validation accuracy: 74.60%
Test accuracy: 76.20%
