# CS224W Final Project: Tutorial on the Augmentation of Graphs in PyG

### Jerry Chan, Jihee Suh, John So

## Installation and Setup

### Install PyG

In [6]:
import torch
torch_version = str(torch.__version__)
if "2.4.0" not in torch_version:
  !pip install torch==2.4.0 -q
print(torch_version)

2.4.0+cu121


In [1]:
scatter_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
sparse_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
!pip install torch-scatter -f $scatter_src -q
!pip install torch-sparse -f $sparse_src -q
!pip install torch-geometric -q
!pip install ogb -q

NameError: name 'torch_version' is not defined

In [1]:
import os
import random

import numpy as np
from tqdm import tqdm

import torch
from torch_geometric.nn.models import GraphSAGE
from torch_geometric.loader import NeighborLoader
import torch_geometric.transforms as T
from torch_geometric.utils import to_undirected
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator


In [21]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Dataset and Tasks



In [30]:
dataset = PygNodePropPredDataset(name='ogbn-products', root='./products/')
split_idx = dataset.get_idx_split()

# sample test set to speed up
split_idx['test'] = split_idx['test'][:10000]
data = dataset[0]
print(data)

  self.data, self.slices = torch.load(self.processed_paths[0])


Data(num_nodes=2449029, edge_index=[2, 123718280], x=[2449029, 100], y=[2449029, 1])


### Training and Evaluation Utilities

In [31]:
# Model settings
input_dim = dataset[0].x.shape[1]
hidden_dim = 128
num_layers = 2

# Training settings
learning_rate = 0.0001
num_epochs = 20

# Dataloader settings
batch_size = 32
fan_out = 10
dataloader_num_workers = 2

In [32]:
class GraphSAGENodeClassification(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, num_classes):
        super(GraphSAGENodeClassification, self).__init__()
        self.graph_sage = GraphSAGE(in_channels = input_dim, hidden_channels = hidden_dim, num_layers=num_layers)
        self.cls_head = torch.nn.Sequential(
            torch.nn.Dropout(0.1),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, num_classes),
        )
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, x, edge_index):
        h = self.graph_sage(x, edge_index)
        return self.cls_head(h)

model = GraphSAGENodeClassification(input_dim, hidden_dim, num_layers, dataset.num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model.to(device)
model

GraphSAGENodeClassification(
  (graph_sage): GraphSAGE(100, 128, num_layers=2)
  (cls_head): Sequential(
    (0): Dropout(p=0.1, inplace=False)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=47, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
)

In [33]:
train_loader = NeighborLoader(
    data,
    input_nodes=split_idx['train'],
    num_neighbors=[fan_out] * num_layers,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
    num_workers=dataloader_num_workers
)
val_loader = NeighborLoader(
    data,
    input_nodes=split_idx['valid'],
    num_neighbors=[fan_out] * num_layers,
    batch_size=batch_size,
    shuffle=True,
    num_workers=dataloader_num_workers,
)
test_loader = NeighborLoader(
    data,
    input_nodes=split_idx['test'],
    num_neighbors=[fan_out] * num_layers,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0
)

print(f"Initialized Data Loaders with {len(split_idx['train'])} training, {len(split_idx['valid'])} validation, and {len(split_idx['test'])} test nodes.")



Initialized Data Loaders with 196615 training, 39323 validation, and 10000 test nodes.


In [48]:
# training process
def train_one_epoch(model, dataloader, optimizer, transform=None):
    model.train()

    # define states
    total_loss = 0
    total_correct = 0
    num_examples = 0

    for batch in tqdm(dataloader):

        # transform batch if needed
        batch_size = batch.batch_size
        batch = batch.to(device)
        if transform is not None:
          batch = transform(batch)

        # forward pass
        optimizer.zero_grad()
        logits = model(batch.x, batch.edge_index)[:batch_size]

        # backward pass
        labels = batch.y[:batch_size].squeeze(-1)
        loss = model.loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # log stats
        total_loss += loss.item() * batch_size
        total_correct += logits.argmax(dim=-1).eq(labels).sum().item()
        num_examples += batch_size

    loss = total_loss / num_examples
    acc = total_correct / num_examples
    return loss, acc

# test process
@torch.no_grad()
def test(model, dataloader, transform=None, apply_transform=True):
    model.eval()

    # define states
    total_loss = 0
    total_correct = 0
    num_examples = 0

    for batch in tqdm(dataloader):

        # transform batch if needed
        batch_size = batch.batch_size
        batch = batch.to(device)
        if apply_transform and (transform is not None):
          batch = transform(batch)

        # forward pass
        logits = model(batch.x, batch.edge_index)[:batch_size]
        labels = batch.y[:batch_size].squeeze(-1)
        loss = model.loss_fn(logits, labels)
        
        # log stats
        total_loss += loss.item() * batch_size
        total_correct += logits.argmax(dim=-1).eq(labels).sum().item()
        num_examples += batch_size

    loss = total_loss / num_examples
    acc = total_correct / num_examples
    return loss, acc

In [None]:
best_val_acc = 0
for epoch in range(1, num_epochs + 1):
    print(f'Epoch: {epoch:02d}')

    # training
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer)
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {100.0 * train_acc:.2f}%')

    # validation
    val_loss, val_acc = test(model, val_loader)
    print(f'Val Loss: {val_loss:.4f}, Val Accuracy: {100.0 * val_acc:.2f}%')

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        print('New best validation accuracy, saving model...')
        torch.save(model.state_dict(), 'best_model.pth')


print(f'Best Validation Accuracy: {100.0 * best_val_acc:.2f}%')

# eval best model
model.load_state_dict(torch.load('best_model.pth'))
test_loss, test_final_acc = test(model, train_loader)
print(f'Test Accuracy: {100.0 * test_final_acc:.2f}%')

Epoch: 01


 13%|█▎        | 788/6145 [00:07<00:49, 107.63it/s]

## Training Data Augmentation

Half-Hop

In [None]:
halfhop = T.HalfHop(alpha=0.5, p=1.0)

In [None]:
def train_halfhop(model, optimizer, dataloader: NeighborLoader, transform=None) -> tuple[torch.Tensor, float]:
    model.train()

    total_loss = total_correct = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        batch = batch.to(device)
        if transform is not None:
          batch = transform(batch)
        out = model(batch.x, batch.edge_index)[~batch.slow_node_mask][:batch.batch_size]
        y = batch.y[:batch.batch_size].squeeze().to(torch.long)
        loss = torch.nn.functional.cross_entropy(out, y)
        loss.backward()
        optimizer.step()

        total_loss += float(loss)
        total_correct += int(out.argmax(dim=-1).eq(y).sum())
    loss = total_loss / len(train_loader)
    approx_acc = total_correct / split_idx['train'].size(0)
    return loss, approx_acc

@torch.no_grad()
def test_halfhop(model, dataloader: NeighborLoader, transform=None) -> float:
    model.eval()

    total_correct = total_examples = 0
    total_loss = 0
    for batch in tqdm(dataloader):
        batch = batch.to(device)
        if transform is not None:
          batch = transform(batch)
        out = model(batch.x, batch.edge_index)[~batch.slow_node_mask]
        pred = out.argmax(dim=-1)
        y = batch.y.view(-1).to(torch.long)

        loss = torch.nn.functional.cross_entropy(out, y)
        total_loss += float(loss)

        total_correct += int((pred == y).sum())
        total_examples += y.size(0)

    return total_loss / len(dataloader), total_correct / total_examples

In [None]:
model.reset_parameters()

times = []
best_val = 0.
num_epochs = 1
for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_halfhop(model, optimizer, train_loader, transform=halfhop)
    val_loss, val_acc = test_halfhop(model, val_loader, transform=halfhop)

    print(f'Epoch {epoch:02d}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc * 100.0:.2f}%',)
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc * 100.0:.2f}%')

    if val_acc > best_val:
        best_val = val_acc

print(f'Best Validation Accuracy: {100.0 * best_val:.2f}%')

print('Testing...')
test_loss, test_final_acc = test_halfhop(model, test_loader, transform=halfhop)
print(f'Test Accuracy: {100.0 * test_final_acc:.2f}%')

100%|██████████| 2842/2842 [00:41<00:00, 68.16it/s]
100%|██████████| 932/932 [00:10<00:00, 86.70it/s] 


Epoch 01, Train Loss: 2.5389, Train Acc: 34.33%
Val Loss: 2.1922, Val Acc: 39.85%
Best Validation Accuracy: 39.85%
Testing...


100%|██████████| 1519/1519 [00:06<00:00, 247.96it/s]

Test Accuracy: 37.04%





### Mask Feature

In [None]:
from torch_geometric.utils import mask_feature

def train_with_mask(model, optimizer, dataloader: NeighborLoader, p = 0.2) -> tuple[torch.Tensor, float]:
    model.train()

    total_loss = total_correct = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        batch = batch.to(device)
        masked_x, feature_mask = mask_feature(batch.x, p)
        out = model(masked_x, batch.edge_index)[:batch.batch_size]
        y = batch.y[:batch.batch_size].squeeze().to(torch.long)
        loss = torch.nn.functional.cross_entropy(out, y)
        loss.backward()
        optimizer.step()

        total_loss += float(loss)
        total_correct += int(out.argmax(dim=-1).eq(y).sum())
    loss = total_loss / len(train_loader)
    approx_acc = total_correct / split_idx['train'].size(0)
    return loss, approx_acc

In [None]:
times = []
best_val = 0.
for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_with_mask(model, optimizer, train_loader)
    val_loss, val_acc = test(model, val_loader)

    print(f'Epoch {epoch:02d}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc * 100.0:.2f}%',)
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc * 100.0:.2f}%')

    if val_acc > best_val:
        best_val = val_acc

print(f'Best Validation Accuracy: {100.0 * best_val:.2f}%')

print('Testing...')
test_loss, test_final_acc = test(test_loader)
print(f'Test Accuracy: {100.0 * test_final_acc:.2f}%')