# Build Graph

In [1]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
import numpy as np

In [2]:
# -----------------------------
# CONFIG – adjust paths if needed
# -----------------------------
# The project root is taken to be the parent of the current working
# directory (the notebook is run from <project>/src).
ROOT = Path.cwd().parent
DATA_DIR = ROOT.joinpath("data", "data_cleaned")
OUT_DIR = ROOT.joinpath("outputs")
OUT_DIR.mkdir(exist_ok=True)  # no error if the folder is already there

# Cleaned edge lists, one CSV per relation.
CIRC_MIR = DATA_DIR.joinpath("circRNA_miRNA_edges.csv")
MIR_DIS = DATA_DIR.joinpath("miRNA_disease_edges.csv")
CIRC_DIS = DATA_DIR.joinpath("circRNA_disease_edges.csv")

# Echo the resolved locations so a wrong working directory is easy to spot.
print("Notebook running from:", Path.cwd())
print("ROOT:", ROOT)
print("DATA_DIR:", DATA_DIR)
print("Files:")
for _edge_csv in (CIRC_MIR, MIR_DIS, CIRC_DIS):
    print(" -", _edge_csv)

Notebook running from: C:\Users\ayish\OneDrive\Documents\circRNA-disease-gnn\src
ROOT: C:\Users\ayish\OneDrive\Documents\circRNA-disease-gnn
DATA_DIR: C:\Users\ayish\OneDrive\Documents\circRNA-disease-gnn\data\data_cleaned
Files:
 - C:\Users\ayish\OneDrive\Documents\circRNA-disease-gnn\data\data_cleaned\circRNA_miRNA_edges.csv
 - C:\Users\ayish\OneDrive\Documents\circRNA-disease-gnn\data\data_cleaned\miRNA_disease_edges.csv
 - C:\Users\ayish\OneDrive\Documents\circRNA-disease-gnn\data\data_cleaned\circRNA_disease_edges.csv


In [3]:
# Peek at the first five rows of the circRNA–miRNA edge list.
pd.read_csv(CIRC_MIR).iloc[:5]


Unnamed: 0,circRNA,miRNA
0,bcrc-3,mir-182-5p
1,cbl.11,mir-6778-5p
2,cdr1as,mir-1270
3,cdr1as,mir-1299
4,cdr1as,mir-7


In [4]:
# -----------------------------
# Load datasets
# -----------------------------
def load_edges(circ_mir_path=None, mir_dis_path=None, circ_dis_path=None):
    """Read the three cleaned edge lists and return them with canonical headers.

    Args:
        circ_mir_path: CSV with circRNA–miRNA edges; defaults to ``CIRC_MIR``.
        mir_dis_path: CSV with miRNA–disease edges; defaults to ``MIR_DIS``.
        circ_dis_path: CSV with circRNA–disease edges; defaults to ``CIRC_DIS``.

    Returns:
        Tuple ``(cm, md, cd)`` of DataFrames whose columns are exactly
        ``["circRNA", "miRNA"]``, ``["miRNA", "disease"]`` and
        ``["circRNA", "disease"]``.

    Raises:
        ValueError: if a file neither contains the expected column names nor
            has exactly two columns that can be renamed by position.
    """

    def _read_edge_table(path, expected):
        """Load one CSV and reduce it to the two ``expected`` columns."""
        table = pd.read_csv(path)
        # Prefer selecting by name: this survives extra columns (e.g. a saved
        # index) and a different column order without mislabelling anything.
        if set(expected).issubset(table.columns):
            return table.loc[:, expected].copy()
        # Otherwise fall back to renaming by position, but only when the
        # column count makes that unambiguous.
        if table.shape[1] == len(expected):
            table.columns = expected
            return table
        raise ValueError(
            f"{path}: expected columns {expected} or exactly {len(expected)} "
            f"columns, found {list(table.columns)}"
        )

    cm = _read_edge_table(
        CIRC_MIR if circ_mir_path is None else circ_mir_path, ["circRNA", "miRNA"]
    )
    md = _read_edge_table(
        MIR_DIS if mir_dis_path is None else mir_dis_path, ["miRNA", "disease"]
    )
    cd = _read_edge_table(
        CIRC_DIS if circ_dis_path is None else circ_dis_path, ["circRNA", "disease"]
    )

    return cm, md, cd

In [5]:
# Read the three edge tables once and preview the first rows of each.
cm, md, cd = load_edges()
tuple(edge_table.head() for edge_table in (cm, md, cd))


(  circRNA        miRNA
 0  bcrc-3   mir-182-5p
 1  cbl.11  mir-6778-5p
 2  cdr1as     mir-1270
 3  cdr1as     mir-1299
 4  cdr1as        mir-7,
         miRNA                   disease
 0     mir-181  leukemia, myeloid, acute
 1     mir-496  leukemia, myeloid, acute
 2  mir-20a-5p  leukemia, myeloid, acute
 3     mir-203  leukemia, myeloid, acute
 4  mir-135-5p  leukemia, myeloid, acute,
             circRNA                   disease
 0       circ-anapc7  leukemia, myeloid, acute
 1  hsa_circ_0000488  leukemia, myeloid, acute
 2  hsa_circ_0009910  leukemia, myeloid, acute
 3   hsa_circ_100290  leukemia, myeloid, acute
 4          circpan3  leukemia, myeloid, acute)

## Build Graph: Label Encoders

In [6]:
from sklearn.preprocessing import LabelEncoder

# One encoder per node type, fitted on the union of the identifiers found in
# both edge tables that mention that type (identifiers are compared as str).
le_circ = LabelEncoder().fit(
    pd.concat([cm["circRNA"], cd["circRNA"]], ignore_index=True).astype(str)
)
le_mir = LabelEncoder().fit(
    pd.concat([cm["miRNA"], md["miRNA"]], ignore_index=True).astype(str)
)
le_dis = LabelEncoder().fit(
    pd.concat([md["disease"], cd["disease"]], ignore_index=True).astype(str)
)

# Number of distinct identifiers (= nodes) per type.
num_circ, num_mir, num_dis = (
    len(encoder.classes_) for encoder in (le_circ, le_mir, le_dis)
)

num_circ, num_mir, num_dis

(828, 521, 122)

## Encode edges

In [7]:
def _encode_ids(encoder, id_column):
    """Translate a column of identifiers into an int64 tensor of encoder indices."""
    return torch.tensor(encoder.transform(id_column.astype(str)), dtype=torch.long)


# circRNA -> miRNA
cm_src = _encode_ids(le_circ, cm["circRNA"])
cm_dst = _encode_ids(le_mir, cm["miRNA"])

# miRNA -> disease
md_src = _encode_ids(le_mir, md["miRNA"])
md_dst = _encode_ids(le_dis, md["disease"])

# circRNA -> disease
cd_src = _encode_ids(le_circ, cd["circRNA"])
cd_dst = _encode_ids(le_dis, cd["disease"])

cm_src[:10], cm_dst[:10]

(tensor([ 0,  1,  3,  3,  3,  3, 74, 75, 79, 80]),
 tensor([164, 471,  90, 102, 478, 479, 192, 102, 139, 191]))

In [8]:
# Create the heterogeneous graph container.
data = HeteroData()

# Add the biological relations first. Each edge_index stacks the source ids
# (row 0) on top of the target ids (row 1).
_biological_relations = {
    ("circRNA", "interacts", "miRNA"): (cm_src, cm_dst),
    ("miRNA", "interacts", "disease"): (md_src, md_dst),
    ("circRNA", "associated", "disease"): (cd_src, cd_dst),
}
for _edge_type, _endpoints in _biological_relations.items():
    data[_edge_type].edge_index = torch.stack(_endpoints)

import networkx as nx

# -----------------------------
# Build NetworkX graph for betweenness
# -----------------------------
# All node types share one id space in this homogeneous graph: circRNAs come
# first, then miRNAs, then diseases. The offsets keep the per-type indices
# from colliding.
circ_offset = 0
mir_offset = circ_offset + num_circ
dis_offset = mir_offset + num_mir

G = nx.Graph()
G.add_nodes_from(range(dis_offset + num_dis))  # register every node id up front

# circRNA–miRNA
G.add_edges_from(
    (circ_offset + u, mir_offset + v)
    for u, v in zip(cm_src.tolist(), cm_dst.tolist())
)

# miRNA–disease
G.add_edges_from(
    (mir_offset + u, dis_offset + v)
    for u, v in zip(md_src.tolist(), md_dst.tolist())
)

# circRNA–disease
G.add_edges_from(
    (circ_offset + u, dis_offset + v)
    for u, v in zip(cd_src.tolist(), cd_dst.tolist())
)
# -----------------------------
# Betweenness centrality
# -----------------------------
# Computed once on the combined graph, then split back per node type.
bc = nx.betweenness_centrality(G, normalized=True)


def _centrality_column(offset, count):
    """Return bc[offset], ..., bc[offset + count - 1] as a (count, 1) float32 tensor."""
    scores = [bc[offset + local_id] for local_id in range(count)]
    return torch.tensor(scores, dtype=torch.float32).unsqueeze(1)


circ_bc = _centrality_column(circ_offset, num_circ)
mir_bc = _centrality_column(mir_offset, num_mir)
dis_bc = _centrality_column(dis_offset, num_dis)

def normalize(x):
    """Standardise a tensor to zero mean and unit standard deviation.

    Args:
        x: Floating-point tensor. Mean and standard deviation are taken over
            all of its elements.

    Returns:
        Tensor with the same shape as ``x``. A tensor with fewer than two
        elements is returned as all zeros.
    """
    # torch.std defaults to the unbiased estimator, which is NaN for fewer
    # than two elements and would turn the whole feature column into NaN.
    if x.numel() < 2:
        return torch.zeros_like(x)
    # The small epsilon avoids division by zero for constant inputs.
    return (x - x.mean()) / (x.std() + 1e-8)

# Z-score the betweenness values separately within each node type.
circ_bc, mir_bc, dis_bc = (
    normalize(scores) for scores in (circ_bc, mir_bc, dis_bc)
)



# Node features, part 1: degree of every node.
# (The feature matrix assembled further down also contains log-degree,
# betweenness and a node-type one-hot.)
# NOTE(review): the degrees include circRNA–disease edges. If those
# associations are later held out as prediction targets, the counts should be
# recomputed on the training split only — confirm against the training code.
def _degree_column(index_tensors, num_nodes):
    """Count how often each node id occurs across ``index_tensors``.

    Args:
        index_tensors: iterable of 1-D int64 tensors of node ids.
        num_nodes: number of nodes of this type (length of the result).

    Returns:
        ``(num_nodes, 1)`` float tensor. An id that appears several times is
        counted once per occurrence.
    """
    all_ids = torch.cat(list(index_tensors))
    # bincount does the per-id tally in one vectorised call instead of a
    # Python-level loop over every edge.
    return torch.bincount(all_ids, minlength=num_nodes).float().unsqueeze(1)


# circRNA degrees: circRNA–miRNA plus circRNA–disease edges
circ_deg = _degree_column((cm_src, cd_src), num_circ)

# miRNA degrees: circRNA–miRNA plus miRNA–disease edges
mir_deg = _degree_column((cm_dst, md_src), num_mir)

# disease degrees: miRNA–disease plus circRNA–disease edges
dis_deg = _degree_column((md_dst, cd_dst), num_dis)

# -----------------------------
# Node-type one-hot encodings
# -----------------------------
def _type_indicator(slot, count):
    """Return a (count, 3) int64 matrix with ones in column ``slot``."""
    rows = torch.zeros((count, 3), dtype=torch.long)
    rows[:, slot] = 1
    return rows


# Column order: circRNA, miRNA, disease.
circ_type = _type_indicator(0, num_circ)
mir_type = _type_indicator(1, num_mir)
dis_type = _type_indicator(2, num_dis)


# -----------------------------
# Final node features
# [degree | log-degree | betweenness | node-type]
# -----------------------------
def _assemble_features(degree, centrality, type_onehot):
    """Concatenate degree, log1p(degree), centrality and type one-hot column-wise."""
    columns = (degree, torch.log1p(degree), centrality, type_onehot)
    return torch.cat(columns, dim=1).float()


data["circRNA"].x = _assemble_features(circ_deg, circ_bc, circ_type)
data["miRNA"].x = _assemble_features(mir_deg, mir_bc, mir_type)
data["disease"].x = _assemble_features(dis_deg, dis_bc, dis_type)

# Report the resulting (num_nodes, num_features) shape per node type.
print("circRNA feature dim:", data["circRNA"].x.shape)
print("miRNA feature dim:", data["miRNA"].x.shape)
print("disease feature dim:", data["disease"].x.shape)


# Build the binary interaction matrices explicitly: entry (i, j) is 1 when
# entity i has a known edge to disease j, 0 otherwise.
_one = torch.tensor(1.0)

# circRNA × disease
A = torch.zeros(num_circ, num_dis)
A.index_put_((cd_src, cd_dst), _one)

# miRNA × disease
B = torch.zeros(num_mir, num_dis)
B.index_put_((md_src, md_dst), _one)

# Add GIP (Gaussian interaction profile) similarity as EDGES (not features)
GIP_THRESHOLD = 0.5  # a pair becomes an edge when its kernel value exceeds this


def _gip_similarity(profiles, threshold=GIP_THRESHOLD):
    """Compute the GIP kernel of ``profiles`` and its thresholded edge list.

    The bandwidth follows the usual GIP definition,
    ``gamma = 1 / mean_i(||profile_i||^2)``. Dividing by the number of columns
    instead makes the kernel close to 1 for almost every pair of sparse binary
    profiles, so a 0.5 threshold would connect nearly all nodes.

    Args:
        profiles: ``(n, d)`` 0/1 matrix with one interaction profile per row.
        threshold: minimum kernel value (exclusive) for keeping a pair.

    Returns:
        ``(gamma, kernel, edge_index, edge_weight)``: the bandwidth as a float,
        the dense ``(n, n)`` kernel, a ``(2, m)`` index tensor with
        ``src < dst`` and the matching kernel values.
    """
    sq_norms = profiles.pow(2).sum(dim=1)
    mean_sq_norm = sq_norms.mean().item() if profiles.shape[0] > 0 else 0.0
    # Guard against an all-zero matrix, where the bandwidth is undefined.
    gamma = 1.0 / mean_sq_norm if mean_sq_norm > 0 else 0.0
    kernel = torch.exp(-gamma * torch.cdist(profiles, profiles) ** 2)

    # Two empty profiles are identical (kernel == 1) only because nothing is
    # known about either entity; such pairs carry no similarity signal.
    informative = sq_norms > 0
    keep = (kernel > threshold) & informative.unsqueeze(0) & informative.unsqueeze(1)

    src, dst = torch.where(keep)
    # Keep each unordered pair once and drop trivial self-pairs; the
    # ToUndirected transform applied afterwards adds the reverse direction.
    once = src < dst
    src, dst = src[once], dst[once]
    return gamma, kernel, torch.stack([src, dst]), kernel[src, dst]


# circRNA–circRNA similarity from the circRNA × disease profiles
gamma, K_circ, _circ_sim_index, _circ_sim_weight = _gip_similarity(A)
data["circRNA", "gip_sim", "circRNA"].edge_index = _circ_sim_index
data["circRNA", "gip_sim", "circRNA"].edge_weight = _circ_sim_weight

# miRNA–miRNA similarity from the miRNA × disease profiles
gamma_m, K_mir, _mir_sim_index, _mir_sim_weight = _gip_similarity(B)
data["miRNA", "gip_sim", "miRNA"].edge_index = _mir_sim_index
data["miRNA", "gip_sim", "miRNA"].edge_weight = _mir_sim_weight

# Make graph undirected (ONLY NOW)
from torch_geometric.transforms import ToUndirected
# The gip_sim relations connect a node type to itself, so ToUndirected merges
# every edge with its reverse and reduces their edge attributes. With the
# default reduce="add", an edge that is already stored in both directions
# would get its edge_weight summed (i.e. doubled); "mean" keeps the original
# similarity value in either case.
data = ToUndirected(reduce="mean")(data)

# Save the graph and, alongside it, the fitted encoders so node indices can be
# mapped back to the original identifiers.
# NOTE(review): both files are pickles of Python objects rather than plain
# tensors; recent PyTorch versions presumably need
# torch.load(..., weights_only=False) to read them — confirm in the loader.
graph_path = OUT_DIR / "data.pt"
encoders_path = OUT_DIR / "label_encoders.pt"

torch.save(data, graph_path)
torch.save(dict(circRNA=le_circ, miRNA=le_mir, disease=le_dis), encoders_path)
print("✅ Graph saved to:", graph_path)


circRNA feature dim: torch.Size([828, 6])
miRNA feature dim: torch.Size([521, 6])
disease feature dim: torch.Size([122, 6])
✅ Graph saved to: C:\Users\ayish\OneDrive\Documents\circRNA-disease-gnn\outputs\data.pt


## Create HeteroData object based on features

### Adding Edge Indices to HeteroData object with nodes already added 