In [1]:
import pandas as pd
import numpy as np

# Read the known (positive) association edges, one table per relation type.
# The three reads are independent; they are unpacked in the order listed.
circ_dis_edges, mir_dis_edges, circ_mir_edges = map(
    pd.read_csv,
    (
        "data_cleaned/circRNA_disease_edges.csv",
        "data_cleaned/miRNA_disease_edges.csv",
        "data_cleaned/circRNA_miRNA_edges.csv",
    ),
)

# Function to create negative samples
def negative_sampling(edges_df, source_nodes, target_nodes, source_col, target_col, seed=42):
    """
    Generate negative samples for link prediction.

    Random (source, target) pairs are drawn by rejection sampling until there
    is one negative edge per positive edge. Every returned pair is absent from
    ``edges_df`` and occurs only once in the result.

    Args:
        edges_df: DataFrame of positive edges
        source_nodes: list of all possible source nodes
        target_nodes: list of all possible target nodes
        source_col: name of source column
        target_col: name of target column
        seed: random seed for reproducibility

    Returns:
        DataFrame of negative edges with columns [source_col, target_col] and
        the same number of rows as ``edges_df``.

    Raises:
        ValueError: if the node lists do not admit enough distinct
            non-positive pairs, in which case sampling could never finish.
    """
    n_needed = len(edges_df)
    if n_needed == 0:
        # Nothing to balance against: return an empty, correctly-shaped frame.
        return pd.DataFrame([], columns=[source_col, target_col])

    # Convert positive edges to set for fast lookup
    pos_set = set(zip(edges_df[source_col], edges_df[target_col]))

    # Convert the candidate pools once instead of on every draw.
    src_pool = np.asarray(source_nodes)
    tgt_pool = np.asarray(target_nodes)

    # Count how many distinct negative pairs exist at all, so that an
    # impossible request fails loudly instead of looping forever.
    src_unique = set(src_pool.tolist())
    tgt_unique = set(tgt_pool.tolist())
    blocked = sum(1 for s, t in pos_set if s in src_unique and t in tgt_unique)
    capacity = len(src_unique) * len(tgt_unique) - blocked
    if capacity < n_needed:
        raise ValueError(
            f"Cannot draw {n_needed} distinct negative edges: only {capacity} "
            f"non-positive ({source_col}, {target_col}) pairs are available."
        )

    # Private generator: seeding it does not disturb the global np.random
    # state that other code may rely on. RandomState follows the same draw
    # sequence as the legacy np.random.seed(seed) / np.random.choice calls.
    rng = np.random.RandomState(seed)

    seen = set()
    neg_edges = []
    while len(neg_edges) < n_needed:
        pair = (rng.choice(src_pool), rng.choice(tgt_pool))
        # Reject known positives and pairs that were already sampled.
        if pair in pos_set or pair in seen:
            continue
        seen.add(pair)
        neg_edges.append(pair)

    return pd.DataFrame(neg_edges, columns=[source_col, target_col])

# Candidate node pools: one list per entity type, read from the cleaned node tables
circ_nodes = pd.read_csv("data_cleaned/circRNA_nodes_clean.csv")["circRNA"].to_list()
mir_nodes = pd.read_csv("data_cleaned/miRNA_nodes_clean.csv")["miRNA"].to_list()
dis_nodes = pd.read_csv("data_cleaned/disease_nodes_clean.csv")["disease"].to_list()

# One negative edge set per relation type, sized to match its positive edge set
circ_dis_neg = negative_sampling(
    edges_df=circ_dis_edges,
    source_nodes=circ_nodes,
    target_nodes=dis_nodes,
    source_col="circRNA",
    target_col="disease",
)
mir_dis_neg = negative_sampling(
    edges_df=mir_dis_edges,
    source_nodes=mir_nodes,
    target_nodes=dis_nodes,
    source_col="miRNA",
    target_col="disease",
)
circ_mir_neg = negative_sampling(
    edges_df=circ_mir_edges,
    source_nodes=circ_nodes,
    target_nodes=mir_nodes,
    source_col="circRNA",
    target_col="miRNA",
)

# Write each negative edge table next to the positive edge files (no index column)
for neg_df, out_path in (
    (circ_dis_neg, "data_cleaned/circRNA_disease_neg_edges.csv"),
    (mir_dis_neg, "data_cleaned/miRNA_disease_neg_edges.csv"),
    (circ_mir_neg, "data_cleaned/circRNA_miRNA_neg_edges.csv"),
):
    neg_df.to_csv(out_path, index=False)

# Report how many negative edges were produced per relation type
print("Negative sampling done:")
for label, neg_df in (
    ("circRNA-disease:", circ_dis_neg),
    ("miRNA-disease:", mir_dis_neg),
    ("circRNA-miRNA:", circ_mir_neg),
):
    print(label, len(neg_df), "edges")


Negative sampling done:
circRNA-disease: 985 edges
miRNA-disease: 828 edges
circRNA-miRNA: 896 edges


In [2]:
# Reload the miRNA-disease positives and the sampled negatives from disk
pos_edges = pd.read_csv("data_cleaned/miRNA_disease_edges.csv")
neg_edges = pd.read_csv("data_cleaned/miRNA_disease_neg_edges.csv")

# Any (miRNA, disease) pair present in both tables is a mislabelled negative
overlap = pos_edges.merge(neg_edges, how='inner', on=['miRNA', 'disease'])

print("Number of negative edges that are actually positive:", len(overlap))
if overlap.empty:
    print("No overlaps found ✅")
else:
    print("Overlapping edges:\n", overlap)


Number of negative edges that are actually positive: 0
No overlaps found ✅
