# CS224W Final Project: Tutorial on the Augmentation of Graphs in PyG

### Jerry Chan, Jihee Suh, John So

## Installation and Setup

### Install PyG

In [1]:
import torch
torch_version = str(torch.__version__)
if "2.4.0" not in torch_version:
  !pip install torch==2.4.0

Collecting torch==2.4.0
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.4.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.4.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.4.0)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-many

In [None]:
print(torch_version)

2.4.0+cu121


In [None]:
scatter_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
sparse_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
!pip install torch-scatter -f $scatter_src
!pip install torch-sparse -f $sparse_src
!pip install torch-geometric
!pip install ogb

Looking in links: https://pytorch-geometric.com/whl/torch-2.4.0+cu121.html
Looking in links: https://pytorch-geometric.com/whl/torch-2.4.0+cu121.html


In [None]:
import os
import random

import numpy as np
from tqdm import tqdm

import torch
from torch_geometric.nn.models import GraphSAGE
from torch_geometric.loader import NeighborLoader
import torch_geometric.transforms as T
from torch_geometric.utils import to_undirected
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator


In [None]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Dataset and Tasks



In [None]:
dataset = PygNodePropPredDataset(name='ogbn-products', root='./products/')
print(dataset, flush=True)
data = dataset[0]
print(data, flush=True)

split_idx = dataset.get_idx_split()
graph = dataset[0]

#evaluator = Evaluator(name='ogbn-products')

  self.data, self.slices = torch.load(self.processed_paths[0])


PygNodePropPredDataset()
Data(num_nodes=2449029, edge_index=[2, 123718280], x=[2449029, 100], y=[2449029, 1])


### Training and Evaluation Utilities

In [None]:
### From https://github.com/pyg-team/pytorch_geometric/blob/master/examples/ogbn_train.py

# training process
def train(model, optimizer, dataloader: NeighborLoader, transform=None) -> tuple[torch.Tensor, float]:
    model.train()

    total_loss = total_correct = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        batch = batch.to(device)
        if transform is not None:
          batch = transform(batch)
        out = model(batch.x, batch.edge_index)[:batch.batch_size]
        y = batch.y[:batch.batch_size].squeeze().to(torch.long)
        loss = torch.nn.functional.cross_entropy(out, y)
        loss.backward()
        optimizer.step()

        total_loss += float(loss)
        total_correct += int(out.argmax(dim=-1).eq(y).sum())
    loss = total_loss / len(train_loader)
    approx_acc = total_correct / split_idx['train'].size(0)
    return loss, approx_acc


@torch.no_grad()
def test(model, dataloader: NeighborLoader, transform=None) -> float:
    model.eval()

    total_correct = total_examples = 0
    total_loss = 0
    for batch in tqdm(dataloader):
        batch = batch.to(device)
        if transform is not None:
          batch = transform(batch)
        out = model(batch.x, batch.edge_index)
        pred = out.argmax(dim=-1)
        y = batch.y.view(-1).to(torch.long)

        loss = torch.nn.functional.cross_entropy(out, y)
        total_loss += float(loss)

        total_correct += int((pred == y).sum())
        total_examples += y.size(0)

    return total_loss / len(dataloader), total_correct / total_examples


In [None]:
input_dim = dataset[0].x.shape[1]
hidden_dim = 128
learning_rate = 0.0001
num_epochs = 20
batch_size = 32
num_layers = 2

fan_out = 10
num_workers = 2

loss_fn = torch.nn.CrossEntropyLoss()
model = GraphSAGE(in_channels=input_dim, hidden_channels=hidden_dim, num_layers=num_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model.to(device)
model

GraphSAGE(100, 128, num_layers=2)

In [None]:
train_loader = NeighborLoader(
    data,
    input_nodes=split_idx['test'],
    num_neighbors=[fan_out] * num_layers,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
    num_workers=num_workers
)
# val_loader = NeighborLoader(
#     data,
#     input_nodes=split_idx['test'],
#     num_neighbors=[fan_out] * num_layers,
#     batch_size=batch_size,
#     shuffle=True,
#     num_workers=num_workers,
# )
# test_loader = NeighborLoader(
#     data,
#     input_nodes=split_idx['test'],
#     num_neighbors=[fan_out] * num_layers,
#     batch_size=batch_size,
#     shuffle=True,
#     num_workers=0
# )



In [None]:
times = []
best_val = 0.
for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train(model, optimizer, train_loader)
    val_loss, val_acc = test(model, train_loader)

    print(f'Epoch {epoch:02d}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc * 100.0:.2f}%',)
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc * 100.0:.2f}%')

    if val_acc > best_val:
        best_val = val_acc

print(f'Best Validation Accuracy: {100.0 * best_val:.2f}%')

print('Testing...')
test_loss, test_final_acc = test(train_loader)
print(f'Test Accuracy: {100.0 * test_final_acc:.2f}%')

  5%|▍         | 3415/69160 [01:04<20:48, 52.65it/s]Exception in thread Thread-13 (_pin_memory_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/pin_memory.py", line 55, in _pin_memory_loop
    do_one_step()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/pin_memory.py", line 32, in do_one_step
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
  File "/usr/local/lib/python3.10/dist-packages/torch/multiprocessing/reductions.py", line 496, in rebuild_storage_fd

    fd = df.detach()
  File "/usr/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_c

KeyboardInterrupt: 

## Training Data Augmentation

Half-Hop

In [None]:
halfhop = T.HalfHop(alpha=0.5, p=1.0)

In [None]:
def train_halfhop(model, optimizer, dataloader: NeighborLoader, transform=None) -> tuple[torch.Tensor, float]:
    model.train()

    total_loss = total_correct = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        batch = batch.to(device)
        if transform is not None:
          batch = transform(batch)
        out = model(batch.x, batch.edge_index)[~batch.slow_node_mask][:batch.batch_size]
        y = batch.y[:batch.batch_size].squeeze().to(torch.long)
        loss = torch.nn.functional.cross_entropy(out, y)
        loss.backward()
        optimizer.step()

        total_loss += float(loss)
        total_correct += int(out.argmax(dim=-1).eq(y).sum())
    loss = total_loss / len(train_loader)
    approx_acc = total_correct / split_idx['train'].size(0)
    return loss, approx_acc

@torch.no_grad()
def test_halfhop(model, dataloader: NeighborLoader, transform=None) -> float:
    model.eval()

    total_correct = total_examples = 0
    total_loss = 0
    for batch in tqdm(dataloader):
        batch = batch.to(device)
        if transform is not None:
          batch = transform(batch)
        out = model(batch.x, batch.edge_index)[~batch.slow_node_mask]
        pred = out.argmax(dim=-1)
        y = batch.y.view(-1).to(torch.long)

        loss = torch.nn.functional.cross_entropy(out, y)
        total_loss += float(loss)

        total_correct += int((pred == y).sum())
        total_examples += y.size(0)

    return total_loss / len(dataloader), total_correct / total_examples

In [None]:
model.reset_parameters()

times = []
best_val = 0.
num_epochs = 1
for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_halfhop(model, optimizer, train_loader, transform=halfhop)
    val_loss, val_acc = test_halfhop(model, val_loader, transform=halfhop)

    print(f'Epoch {epoch:02d}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc * 100.0:.2f}%',)
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc * 100.0:.2f}%')

    if val_acc > best_val:
        best_val = val_acc

print(f'Best Validation Accuracy: {100.0 * best_val:.2f}%')

print('Testing...')
test_loss, test_final_acc = test_halfhop(model, test_loader, transform=halfhop)
print(f'Test Accuracy: {100.0 * test_final_acc:.2f}%')

100%|██████████| 2842/2842 [00:41<00:00, 68.16it/s]
100%|██████████| 932/932 [00:10<00:00, 86.70it/s] 


Epoch 01, Train Loss: 2.5389, Train Acc: 34.33%
Val Loss: 2.1922, Val Acc: 39.85%
Best Validation Accuracy: 39.85%
Testing...


100%|██████████| 1519/1519 [00:06<00:00, 247.96it/s]

Test Accuracy: 37.04%





### Mask Feature

In [None]:
from torch_geometric.utils import mask_feature

def train_with_mask(model, optimizer, dataloader: NeighborLoader, p = 0.2) -> tuple[torch.Tensor, float]:
    model.train()

    total_loss = total_correct = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        batch = batch.to(device)
        masked_x, feature_mask = mask_feature(batch.x, p)
        out = model(masked_x, batch.edge_index)[:batch.batch_size]
        y = batch.y[:batch.batch_size].squeeze().to(torch.long)
        loss = torch.nn.functional.cross_entropy(out, y)
        loss.backward()
        optimizer.step()

        total_loss += float(loss)
        total_correct += int(out.argmax(dim=-1).eq(y).sum())
    loss = total_loss / len(train_loader)
    approx_acc = total_correct / split_idx['train'].size(0)
    return loss, approx_acc

In [None]:
times = []
best_val = 0.
for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_with_mask(model, optimizer, train_loader)
    val_loss, val_acc = test(model, val_loader)

    print(f'Epoch {epoch:02d}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc * 100.0:.2f}%',)
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc * 100.0:.2f}%')

    if val_acc > best_val:
        best_val = val_acc

print(f'Best Validation Accuracy: {100.0 * best_val:.2f}%')

print('Testing...')
test_loss, test_final_acc = test(test_loader)
print(f'Test Accuracy: {100.0 * test_final_acc:.2f}%')