In [3]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.7 MB 2.1 MB/s eta 0:00:04
   --- ------------------------------------ 0.8/8.7 MB 1.8 MB/s eta 0:00:05
   ---- ----------------------------------- 1.0/8.7 MB 1.7 MB/s eta 0:00:05
   ------ --------------------------------- 1.3/8.7 MB 1.5 MB/s eta 0:00:06
   ------- -------------------------------- 1.6/8.7 MB 1.3 MB/s eta 0:00:06
   ------- -------------------------------- 1.6/8.7 MB 1.3 MB/s eta 0:00:06
   -------- -

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load positive and negative edges
pos_edges = pd.read_csv("data_cleaned/circRNA_miRNA_edges.csv")
neg_edges = pd.read_csv("data_cleaned/circRNA_miRNA_neg_edges.csv")

# Label them: 1 for positive, 0 for negative
pos_edges['label'] = 1
neg_edges['label'] = 0

# Combine
all_edges = pd.concat([pos_edges, neg_edges], ignore_index=True)

# Shuffle
all_edges = all_edges.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train (80%), val (10%), test (10%)
train_edges, temp_edges = train_test_split(all_edges, test_size=0.2, random_state=42, stratify=all_edges['label'])
val_edges, test_edges = train_test_split(temp_edges, test_size=0.5, random_state=42, stratify=temp_edges['label'])

# Quick check
print("Train edges:", train_edges.shape[0])
print("Validation edges:", val_edges.shape[0])
print("Test edges:", test_edges.shape[0])

# Save splits
train_edges.to_csv("data_cleaned/circRNA_miRNA_train.csv", index=False)
val_edges.to_csv("data_cleaned/circRNA_miRNA_val.csv", index=False)
test_edges.to_csv("data_cleaned/circRNA_miRNA_test.csv", index=False)


Train edges: 1433
Validation edges: 179
Test edges: 180


In [8]:
train_nodes = set(train_edges['circRNA']).union(set(train_edges['miRNA']))
val_test_nodes = set(val_edges['circRNA']).union(set(val_edges['miRNA'])).union(
                 set(test_edges['circRNA'])).union(set(test_edges['miRNA']))

missing_nodes = val_test_nodes - train_nodes
print("Nodes in val/test not in training:", missing_nodes)


Nodes in val/test not in training: {'mir-204-5p', 'mir-101a-3p', 'hsa-circrna11783-2', 'hsa_circ_0012919', 'mir-196a-5p', 'mir-5095', 'hsa_circ_000839', 'hsa_circ_0081162', 'circrna_5692', 'hsa_circrna_008068', 'mir-888', 'mir-875-3p', 'hsa_circ_0000175', 'mir-3186-3p', 'hsa_circ_0058092', 'hsa_circ_0072391', 'circrna_0005941', 'mir-4269', 'mir-196b', 'hsa_circ_0082081', 'circrna-0044073', 'mir-362-3p', 'hsa_circ_0000594', 'circeif4g2', 'hsa_circ_0000515', 'mir-92', 'mir-873-3p', 'hsa_circ_0000519', 'hsa_circ_0005986', 'hsa_circ_0042666', 'mir-34c', 'circrbms3', 'hsa_circrna_101764', 'cmras', 'hsa_circ_0000378', 'mir-1178-3p', 'circyap1', 'mir-6845-5p', 'mir-4713-5p', 'circ-fam53b', 'mir-466i-5p', 'hsa_circ_0032891', 'circcct3', 'hsa_circrna_103047', 'hsa_circ_0001564', 'circtlk1', 'hsa_circrna_009024', 'circrna_0084043', 'circ-ptprz1', 'circmyo10', 'hsa_circ_0054345', 'mir-6079', 'hsa_circ_0041946', 'hsa_circrna_404457', 'hsa_circ_0008797', 'mir-29c-3p', 'circtada2a-e6', 'hsa_circrna_

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load positive and negative edges
pos_edges = pd.read_csv("data_cleaned/miRNA_disease_edges.csv")
neg_edges = pd.read_csv("data_cleaned/miRNA_disease_neg_edges.csv")

# Label them: 1 for positive, 0 for negative
pos_edges['label'] = 1
neg_edges['label'] = 0

# Combine
all_edges = pd.concat([pos_edges, neg_edges], ignore_index=True)

# Shuffle
all_edges = all_edges.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train (80%), val (10%), test (10%)
train_edges, temp_edges = train_test_split(all_edges, test_size=0.2, random_state=42, stratify=all_edges['label'])
val_edges, test_edges = train_test_split(temp_edges, test_size=0.5, random_state=42, stratify=temp_edges['label'])

# Quick check
print("Train edges:", train_edges.shape[0])
print("Validation edges:", val_edges.shape[0])
print("Test edges:", test_edges.shape[0])

# Save splits
train_edges.to_csv("data_cleaned/miRNA_disease_train.csv", index=False)
val_edges.to_csv("data_cleaned/miRNA_disease_val.csv", index=False)
test_edges.to_csv("data_cleaned/miRNA_disease_test.csv", index=False)


Train edges: 1324
Validation edges: 166
Test edges: 166


In [10]:
train_nodes = set(train_edges['miRNA']).union(set(train_edges['disease']))
val_test_nodes = set(val_edges['miRNA']).union(set(val_edges['disease'])).union(
                 set(test_edges['miRNA'])).union(set(test_edges['disease']))

missing_nodes = val_test_nodes - train_nodes
print("Nodes in val/test not in training:", missing_nodes)


Nodes in val/test not in training: {'mir-1252', 'mir-5692', 'mir-4524a-5p', 'mir-497-5p', 'mir-200c-3p', 'mir-4753', 'mir-875-3p', 'mir-34a-3p', 'mir-29b-1-5p', 'mir-612', 'mir-8075', 'mir-647', 'mir-18a', 'mir-2113', 'mir-181d', 'mir-188-5p', 'mir-411-5p', 'mir-208b-5p', 'mir-193a-5p', 'mir-411', 'mir-325', 'mir-224-5p', 'mir-629', 'mir-496', 'mir-205', 'mir-103a', 'mir-3666', 'mir-370', 'mir-526c', 'mir-20a-5p', 'mir-765', 'mir-216a-3p', 'mir-873-3p', 'mir-1229', 'mir-106a-3p', 'mir-148b-3p', 'mir-378a'}
