In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

def node_preserving_split(edges, label_col='label', train_frac=0.8, val_frac=0.1, test_frac=0.1):
    # Ensure all nodes appear in training
    all_nodes = set(edges['circRNA']).union(set(edges['disease']))
    
    # Group by nodes
    edges['used'] = False
    train_edges = []

    # Add one edge per node to training
    for node in all_nodes:
        idx = edges[((edges['circRNA'] == node) | (edges['disease'] == node)) & (~edges['used'])].index
        if len(idx) > 0:
            edges.loc[idx[0], 'used'] = True
            train_edges.append(edges.loc[idx[0]])

    train_edges = pd.DataFrame(train_edges)
    
    # Remaining edges
    remaining_edges = edges[~edges['used']].drop(columns='used')
    
    # Split remaining edges
    val_edges, test_edges = train_test_split(
        remaining_edges, test_size=test_frac/(val_frac+test_frac), random_state=42, stratify=remaining_edges[label_col]
    )
    
    train_edges = pd.concat([train_edges, remaining_edges.drop(val_edges.index).drop(test_edges.index)])
    
    return train_edges, val_edges, test_edges

# Example usage for circRNA-disease
pos_edges = pd.read_csv("data_cleaned/circRNA_disease_edges.csv")
neg_edges = pd.read_csv("data_cleaned/circRNA_disease_neg_edges.csv")

pos_edges['label'] = 1
neg_edges['label'] = 0

all_edges = pd.concat([pos_edges, neg_edges], ignore_index=True)
train_edges, val_edges, test_edges = node_preserving_split(all_edges)

print("Train edges:", train_edges.shape[0])
print("Validation edges:", val_edges.shape[0])
print("Test edges:", test_edges.shape[0])

train_edges.to_csv("data_cleaned/circRNA_disease_train.csv", index=False)
val_edges.to_csv("data_cleaned/circRNA_disease_val.csv", index=False)
test_edges.to_csv("data_cleaned/circRNA_disease_test.csv", index=False)


Train edges: 929
Validation edges: 520
Test edges: 521


In [13]:
train_nodes = set(train_edges['circRNA']).union(set(train_edges['disease']))
val_test_nodes = set(val_edges['circRNA']).union(set(val_edges['disease'])).union(
                 set(test_edges['circRNA'])).union(set(test_edges['disease']))

missing_nodes = val_test_nodes - train_nodes
print("Nodes in val/test not in training:", missing_nodes)


Nodes in val/test not in training: set()


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

def node_preserving_split(edges, label_col='label', train_frac=0.8, val_frac=0.1, test_frac=0.1):
    # Ensure all nodes appear in training
    all_nodes = set(edges['miRNA']).union(set(edges['disease']))
    
    # Group by nodes
    edges['used'] = False
    train_edges = []

    # Add one edge per node to training
    for node in all_nodes:
        idx = edges[((edges['miRNA'] == node) | (edges['disease'] == node)) & (~edges['used'])].index
        if len(idx) > 0:
            edges.loc[idx[0], 'used'] = True
            train_edges.append(edges.loc[idx[0]])

    train_edges = pd.DataFrame(train_edges)
    
    # Remaining edges
    remaining_edges = edges[~edges['used']].drop(columns='used')
    
    # Split remaining edges
    val_edges, test_edges = train_test_split(
        remaining_edges, test_size=test_frac/(val_frac+test_frac), random_state=42, stratify=remaining_edges[label_col]
    )
    
    train_edges = pd.concat([train_edges, remaining_edges.drop(val_edges.index).drop(test_edges.index)])
    
    return train_edges, val_edges, test_edges

# Example usage for circRNA-disease
pos_edges = pd.read_csv("data_cleaned/miRNA_disease_edges.csv")
neg_edges = pd.read_csv("data_cleaned/miRNA_disease_neg_edges.csv")

pos_edges['label'] = 1
neg_edges['label'] = 0

all_edges = pd.concat([pos_edges, neg_edges], ignore_index=True)
train_edges, val_edges, test_edges = node_preserving_split(all_edges)

print("Train edges:", train_edges.shape[0])
print("Validation edges:", val_edges.shape[0])
print("Test edges:", test_edges.shape[0])

train_edges.to_csv("data_cleaned/miRNA_disease_train.csv", index=False)
val_edges.to_csv("data_cleaned/miRNA_disease_val.csv", index=False)
test_edges.to_csv("data_cleaned/miRNA_disease_test.csv", index=False)


Train edges: 744
Validation edges: 456
Test edges: 456


In [15]:
train_nodes = set(train_edges['miRNA']).union(set(train_edges['disease']))
val_test_nodes = set(val_edges['miRNA']).union(set(val_edges['disease'])).union(
                 set(test_edges['miRNA'])).union(set(test_edges['disease']))

missing_nodes = val_test_nodes - train_nodes
print("Nodes in val/test not in training:", missing_nodes)


Nodes in val/test not in training: set()


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split

def node_preserving_split(edges, label_col='label', train_frac=0.8, val_frac=0.1, test_frac=0.1):
    # Ensure all nodes appear in training
    all_nodes = set(edges['circRNA']).union(set(edges['miRNA']))
    
    # Group by nodes
    edges['used'] = False
    train_edges = []

    # Add one edge per node to training
    for node in all_nodes:
        idx = edges[((edges['circRNA'] == node) | (edges['miRNA'] == node)) & (~edges['used'])].index

        if len(idx) > 0:
            edges.loc[idx[0], 'used'] = True
            train_edges.append(edges.loc[idx[0]])

    train_edges = pd.DataFrame(train_edges)
    
    # Remaining edges
    remaining_edges = edges[~edges['used']].drop(columns='used')
    
    # Split remaining edges
    val_edges, test_edges = train_test_split(
        remaining_edges, test_size=test_frac/(val_frac+test_frac), random_state=42, stratify=remaining_edges[label_col]
    )
    
    train_edges = pd.concat([train_edges, remaining_edges.drop(val_edges.index).drop(test_edges.index)])
    
    return train_edges, val_edges, test_edges

# Example usage for circRNA-disease
pos_edges = pd.read_csv("data_cleaned/circRNA_miRNA_edges.csv")
neg_edges = pd.read_csv("data_cleaned/circRNA_miRNA_neg_edges.csv")

pos_edges['label'] = 1
neg_edges['label'] = 0

all_edges = pd.concat([pos_edges, neg_edges], ignore_index=True)
train_edges, val_edges, test_edges = node_preserving_split(all_edges)

print("Train edges:", train_edges.shape[0])
print("Validation edges:", val_edges.shape[0])
print("Test edges:", test_edges.shape[0])

train_edges.to_csv("data_cleaned/circRNA_miRNA_train.csv", index=False)
val_edges.to_csv("data_cleaned/circRNA_miRNA_val.csv", index=False)
test_edges.to_csv("data_cleaned/circRNA_miRNA_test.csv", index=False)


Train edges: 1275
Validation edges: 258
Test edges: 259


In [17]:
train_nodes = set(train_edges['circRNA']).union(set(train_edges['miRNA']))
val_test_nodes = set(val_edges['circRNA']).union(set(val_edges['miRNA'])).union(
                 set(test_edges['circRNA'])).union(set(test_edges['miRNA']))

missing_nodes = val_test_nodes - train_nodes
print("Nodes in val/test not in training:", missing_nodes)


Nodes in val/test not in training: set()
