In [11]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import roc_auc_score, average_precision_score

from models_sage import HeteroGraphSAGE
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE


'cpu'

In [12]:
# Reproducibility control
import torch, random, numpy as np

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [13]:
SEEDS = [42, 43, 44, 45, 46]


In [14]:
# Notebook-safe paths
ROOT = Path.cwd().parent

DATA_DIR = ROOT / "data" / "data_cleaned"
GRAPH_PATH = ROOT / "outputs" / "data.pt"
OUT_DIR = ROOT / "outputs"
OUT_DIR.mkdir(exist_ok=True)

print("ROOT:", ROOT)
print("DATA_DIR:", DATA_DIR)
print("GRAPH_PATH:", GRAPH_PATH)


ROOT: C:\Users\ayish\OneDrive\Documents\circRNA-disease-gnn
DATA_DIR: C:\Users\ayish\OneDrive\Documents\circRNA-disease-gnn\data\data_cleaned
GRAPH_PATH: C:\Users\ayish\OneDrive\Documents\circRNA-disease-gnn\outputs\data.pt


### Load node preserving labelled splits

In [15]:
def load_split(name, le_circ, le_dis):
    df = pd.read_csv(DATA_DIR / name)

    # Encode node names → integer IDs
    circ_ids = le_circ.transform(df["circRNA"].astype(str))
    dis_ids  = le_dis.transform(df["disease"].astype(str))

    # Build edge index: [2, num_edges]
    edges = torch.from_numpy(np.vstack([circ_ids, dis_ids])).long()

    # Labels (already numeric)
    labels = torch.tensor(df["label"].values, dtype=torch.float)

    return edges, labels


In [16]:
encoders = torch.load(
    OUT_DIR / "label_encoders.pt",
    weights_only=False
)

le_circ = encoders["circRNA"]
le_dis  = encoders["disease"]


train_edges, train_labels = load_split("circRNA_disease_train.csv", le_circ, le_dis)
val_edges, val_labels     = load_split("circRNA_disease_val.csv", le_circ, le_dis)
test_edges, test_labels   = load_split("circRNA_disease_test.csv", le_circ, le_dis)

train_edges.shape, train_labels.shape


(torch.Size([2, 929]), torch.Size([929]))

### Move Splits to Device

In [17]:
train_edges, train_labels = train_edges.to(DEVICE), train_labels.to(DEVICE)
val_edges, val_labels     = val_edges.to(DEVICE), val_labels.to(DEVICE)
test_edges, test_labels   = test_edges.to(DEVICE), test_labels.to(DEVICE)


### Load HeteroGraph

In [18]:
print("Loading heterogeneous graph...")
data = torch.load(
    GRAPH_PATH,
    map_location=DEVICE,
    weights_only=False
)

data


Loading heterogeneous graph...


HeteroData(
  circRNA={ x=[828, 6] },
  miRNA={ x=[521, 6] },
  disease={ x=[122, 6] },
  (circRNA, interacts, miRNA)={ edge_index=[2, 896] },
  (miRNA, interacts, disease)={ edge_index=[2, 828] },
  (circRNA, associated, disease)={ edge_index=[2, 985] },
  (circRNA, gip_sim, circRNA)={
    edge_index=[2, 685584],
    edge_weight=[685584],
  },
  (miRNA, gip_sim, miRNA)={
    edge_index=[2, 271441],
    edge_weight=[271441],
  },
  (miRNA, rev_interacts, circRNA)={ edge_index=[2, 896] },
  (disease, rev_interacts, miRNA)={ edge_index=[2, 828] },
  (disease, rev_associated, circRNA)={ edge_index=[2, 985] }
)

### Initialize GraphSAGE model

In [19]:
results = []

for seed in SEEDS:
    print(f"\n==============================")
    print(f"Running experiment with SEED = {seed}")
    print(f"==============================")

    set_seed(seed)

    # ---- model must be re-created AFTER seed ----
    model = HeteroGraphSAGE(
        in_channels=data["circRNA"].x.size(1),
        hidden_channels=64,
        out_channels=64,
        dropout=0.2
    ).to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.BCEWithLogitsLoss()

    best_val_aupr = 0.0
    best_val_loss = None


    # -------- Training loop --------
    for epoch in range(1, 51):
        model.train()
        optimizer.zero_grad()

        emb = model(data.x_dict, data.edge_index_dict)
        circ_emb, dis_emb = emb["circRNA"], emb["disease"]

        logits = (circ_emb[train_edges[0]] * dis_emb[train_edges[1]]).sum(dim=1)
        loss = loss_fn(logits, train_labels)

        loss.backward()
        optimizer.step()

        # -------- Validation --------
        model.eval()
        with torch.no_grad():
            emb = model(data.x_dict, data.edge_index_dict)
            circ_emb, dis_emb = emb["circRNA"], emb["disease"]

            val_logits = (circ_emb[val_edges[0]] * dis_emb[val_edges[1]]).sum(dim=1)
            val_scores = torch.sigmoid(val_logits).cpu().numpy()
            val_true   = val_labels.cpu().numpy()

            auc = roc_auc_score(val_true, val_scores)
            aupr = average_precision_score(val_true, val_scores)
        print(
        f"Epoch {epoch:03d} | "
        f"Loss {loss.item():.4f} | "
        f"Val AUC {auc:.4f} | "
        f"Val AUPR {aupr:.4f}"
        )

        if aupr > best_val_aupr:
            best_val_aupr = aupr
            best_val_loss = loss.item()
            torch.save(
                model.state_dict(),
                OUT_DIR / f"sage_best_model_seed{seed}.pth"
            )
            print("   → Saved best model")

    # -------- Test evaluation --------
    model.load_state_dict(
        torch.load(OUT_DIR / f"sage_best_model_seed{seed}.pth", map_location=DEVICE)
    )
    model.eval()

    with torch.no_grad():
        emb = model(data.x_dict, data.edge_index_dict)
        circ_emb, dis_emb = emb["circRNA"], emb["disease"]

        test_logits = (circ_emb[test_edges[0]] * dis_emb[test_edges[1]]).sum(dim=1)
        test_scores = torch.sigmoid(test_logits).cpu().numpy()
        test_true   = test_labels.cpu().numpy()

        test_auc  = roc_auc_score(test_true, test_scores)
        test_aupr = average_precision_score(test_true, test_scores)

    print(f"SEED {seed} | Test AUC {test_auc:.4f} | Test AUPR {test_aupr:.4f}")

    results.append((seed, best_val_loss, test_auc, test_aupr))



Running experiment with SEED = 42
3Layer
3Layer
Epoch 001 | Loss 0.6651 | Val AUC 0.6636 | Val AUPR 0.1530
   → Saved best model
3Layer
3Layer
Epoch 002 | Loss 0.5648 | Val AUC 0.7998 | Val AUPR 0.2544
   → Saved best model
3Layer
3Layer
Epoch 003 | Loss 0.5086 | Val AUC 0.8381 | Val AUPR 0.3092
   → Saved best model
3Layer
3Layer
Epoch 004 | Loss 0.4651 | Val AUC 0.8661 | Val AUPR 0.3736
   → Saved best model
3Layer
3Layer
Epoch 005 | Loss 0.4416 | Val AUC 0.8860 | Val AUPR 0.4725
   → Saved best model
3Layer
3Layer
Epoch 006 | Loss 0.4283 | Val AUC 0.9002 | Val AUPR 0.5380
   → Saved best model
3Layer
3Layer
Epoch 007 | Loss 0.4220 | Val AUC 0.9064 | Val AUPR 0.5623
   → Saved best model
3Layer
3Layer
Epoch 008 | Loss 0.4169 | Val AUC 0.9075 | Val AUPR 0.5406
3Layer
3Layer
Epoch 009 | Loss 0.4144 | Val AUC 0.9073 | Val AUPR 0.5254
3Layer
3Layer
Epoch 010 | Loss 0.4115 | Val AUC 0.9072 | Val AUPR 0.5192
3Layer
3Layer
Epoch 011 | Loss 0.4088 | Val AUC 0.9066 | Val AUPR 0.5157
3Layer
3

In [20]:
results_df = pd.DataFrame(
    results,
    columns=["seed", "best_val_loss", "auc", "aupr"]
)

print("\n===== FINAL RESULTS =====")
print(results_df)
print("\nMean ± Std")
print(
    results_df[["best_val_loss", "auc", "aupr"]]
    .agg(["mean", "std"])
)



===== FINAL RESULTS =====
   seed  best_val_loss       auc      aupr
0    42       0.385279  0.921917  0.570025
1    43       0.396583  0.924483  0.616280
2    44       0.401772  0.929152  0.641099
3    45       0.385056  0.933215  0.596122
4    46       0.386855  0.916981  0.575985

Mean ± Std
      best_val_loss       auc      aupr
mean       0.391109  0.925150  0.599902
std        0.007622  0.006298  0.029344
