In [4]:
# main.ipynb

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch_geometric.loader import DataLoader, NeighborSampler
from torch_geometric.datasets import Planetoid, Reddit
from gnnsyn.utils.gnn_config_parser import parse_gnn_config
from gnnsyn.train.trainer import (
    train_one_epoch_full, eval_model_full,
    train_one_epoch_ns, eval_model_ns
)
import time

#############################################
# 0) 环境 & Debug 信息
#############################################
print("PyTorch version:", torch.__version__)
print("CUDA available?", torch.cuda.is_available())

if torch.cuda.is_available():
    device = 'cuda'
    print("Current device index:", torch.cuda.current_device())
    # 打开CUDNN benchmark可能加速卷积(对GNN帮助有限，但可尝试)
    cudnn.benchmark = True
else:
    device = 'cpu'
    print("Use CPU only")

#############################################
# 1) 选择数据集 & Config
#############################################
# (a) GCN + Cora => small dataset, GPU加速效果有限
mp_config_file = "/notebooks/gnnsyn/config/mp_configs/GCN.json"
task_config_file = "/notebooks/gnnsyn/config/task_configs/cora_node.json"
dataset_choice = "cora"

# (b) GCN + Reddit => 大数据, 看是否能提升GPU利用
# mp_config_file = "/notebooks/gnnsyn/config/mp_configs/GCN.json"
# task_config_file = "/notebooks/gnnsyn/config/task_configs/reddit_node.json"
# dataset_choice = "reddit"

model, merged_conf = parse_gnn_config(mp_config_file, task_config_file)
learning_conf = merged_conf["learning_config"]
lr = learning_conf["lr"]
epochs = learning_conf["epochs"]
batch_size = learning_conf["batch_size"]

print("use_ns =", merged_conf["use_ns"], "batch_size =", batch_size)

#############################################
# 2) 加载数据
#############################################
start_loading = time.time()
if dataset_choice == "cora":
    dataset = Planetoid(root='./data/cora', name='Cora')
    data = dataset[0]
elif dataset_choice == "reddit":
    dataset = Reddit(root='./data/Reddit')
    data = dataset[0]
else:
    raise ValueError(f"Unknown dataset_choice: {dataset_choice}")
end_loading = time.time()
print(f"Data loading took {end_loading - start_loading:.2f}s")

model.to(device)

#############################################
# 3) Dataloader / Sampler
#############################################
use_ns = merged_conf["use_ns"]
sizes = merged_conf.get("neighbor_sizes", [25, 10])  # if needed

# 为了减少CPU等待 & 提高DataLoader性能:
num_workers = 4      # 视资源可调
pin_memory = True    # 如果主机是CUDA => pin_memory可以加速H2D拷贝

if not use_ns:
    # full-batch
    loader = DataLoader(dataset, 
                        batch_size=batch_size, 
                        shuffle=False,
                        num_workers=num_workers,
                        pin_memory=pin_memory)
else:
    # neighbor sampling
    train_idx = data.train_mask.nonzero(as_tuple=True)[0]
    val_idx = data.val_mask.nonzero(as_tuple=True)[0]

    train_loader = NeighborSampler(
        data.edge_index, 
        node_idx=train_idx,
        sizes=sizes,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )
    val_loader = NeighborSampler(
        data.edge_index,
        node_idx=val_idx,
        sizes=sizes,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )
    x_all = data.x
    y_all = data.y

#############################################
# 4) 优化器 + 训练
#############################################
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = F.cross_entropy

print("=== Start Training ===")
for epoch in range(1, epochs+1):
    start_t = time.time()

    # Train
    if not use_ns:
        train_loss = train_one_epoch_full(model, loader, criterion, optimizer, device)
    else:
        train_loss = train_one_epoch_ns(model, x_all, y_all, train_loader, optimizer, device)

    # 每 5 个 epoch 或 epoch=1 做一次 eval, 减少耗时
    if epoch == 1 or epoch % 5 == 0:
        if not use_ns:
            val_loss, val_acc = eval_model_full(model, loader, criterion, device)
        else:
            val_loss, val_acc = eval_model_ns(model, x_all, y_all, val_loader, device)
        end_t = time.time()
        print(f"[Epoch {epoch}] train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, "
              f"val_acc={val_acc:.4f}, took {end_t - start_t:.2f}s")
    else:
        end_t = time.time()
        print(f"[Epoch {epoch}] train_loss={train_loss:.4f}, took {end_t - start_t:.2f}s")

print("=== Training Done ===")


PyTorch version: 2.1.1+cu121
CUDA available? True
Current device index: 0
use_ns = False batch_size = 1


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...


Data loading took 1.57s
=== Start Training ===


Done!


[Epoch 1] train_loss=2.0395, val_loss=1.9505, val_acc=0.1496, took 2.46s
[Epoch 2] train_loss=1.5342, took 1.21s
[Epoch 3] train_loss=1.1106, took 1.15s
[Epoch 4] train_loss=0.8390, took 1.12s
[Epoch 5] train_loss=0.6627, val_loss=1.6397, val_acc=0.7271, took 2.41s
[Epoch 6] train_loss=0.5394, took 1.17s
[Epoch 7] train_loss=0.4556, took 1.19s
[Epoch 8] train_loss=0.3914, took 1.17s
[Epoch 9] train_loss=0.3388, took 1.24s
[Epoch 10] train_loss=0.3072, val_loss=0.7022, val_acc=0.9073, took 2.67s
[Epoch 11] train_loss=0.2692, took 1.37s
[Epoch 12] train_loss=0.2429, took 1.12s
[Epoch 13] train_loss=0.2297, took 1.29s
[Epoch 14] train_loss=0.2058, took 1.19s
[Epoch 15] train_loss=0.2037, val_loss=0.2492, val_acc=0.9398, took 2.30s
[Epoch 16] train_loss=0.1815, took 1.12s
[Epoch 17] train_loss=0.1643, took 1.20s
[Epoch 18] train_loss=0.1623, took 1.19s
[Epoch 19] train_loss=0.1592, took 1.30s
[Epoch 20] train_loss=0.1443, val_loss=0.1351, val_acc=0.9597, took 2.16s
[Epoch 21] train_loss=0.

In [None]:
import torch
print("PyTorch Version:", torch.__version__)
print("CUDA Version:", torch.version.cuda)


/notebooks/gnnsyn
