验证集mapping.csv的PrimeKG匹配整理

In [20]:
import pandas as pd
import json
from tqdm import tqdm
import numpy as np

补充药物的图节点id和节点名，筛选缺失值

In [17]:
mapping_df = pd.read_csv("../data/benchmark/Kaggle_drug_repositioning/mapping.csv")
node_df = pd.read_csv("../data/benchmark/PrimeKG/nodes.csv")

drug_nodes = node_df[node_df["node_type"] == "drug"].copy()
drugid_to_index    = drug_nodes.set_index("node_id")["node_index"].to_dict()
drugid_to_nodename = drug_nodes.set_index("node_id")["node_name"].to_dict()

df1 = mapping_df.copy()
df1["x_index"] = df1["DrugID"].map(drugid_to_index)
df1["x_name"]  = df1["DrugID"].map(drugid_to_nodename)
df1["x_index"] = df1["x_index"].astype('Int64')

all_unique_drugids = df1["DrugID"].unique()
total_unique_drugids = len(all_unique_drugids)
unmatched_drugids = df1[df1["x_index"].isnull()]["DrugID"].unique()
unmatched_unique_drugs = len(unmatched_drugids)
unmatched_unique_drug_ratio = unmatched_unique_drugs / total_unique_drugids

print(f"总 DrugID 唯一编号数：{total_unique_drugids}")
print(f"未匹配 DrugID 唯一编号数：{unmatched_unique_drugs}")
print(f"未匹配 DrugID 唯一比例：{unmatched_unique_drug_ratio:.4f}")
print(f"未匹配 DrugID 唯一编号如下：\n{unmatched_drugids}")

print(df1.head())

总 DrugID 唯一编号数：1410
未匹配 DrugID 唯一编号数：2
未匹配 DrugID 唯一比例：0.0014
未匹配 DrugID 唯一编号如下：
['DB08845' 'DB04862']
    DrugID     DiseaseID  x_index            x_name
0  DB09140  MESH:D000013    14013            Oxygen
1  DB00730  MESH:D000013    15307     Thiabendazole
2  DB00898  MESH:D000013    14587           Ethanol
3  DB01168  MESH:D000013    14914      Procarbazine
4  DB00550  MESH:D000013    14640  Propylthiouracil


补充疾病的图节点和图id，筛选缺失值

In [18]:
with open("../data/benchmark/mondo.json", "r", encoding="utf-8") as f:
    mondo_json = json.load(f)
mondo_nodes = mondo_json["graphs"][0]["nodes"]

mesh2mondo_num = {}
for entry in mondo_nodes:
    meta = entry.get("meta", {})
    xrefs = meta.get("xrefs", [])
    mesh_ids = [xref.get("val", "") for xref in xrefs if xref.get("val", "").startswith("MESH:")]
    mesh_ids_ref = [xref.get("val", "").replace("MESH:", "") for xref in xrefs if xref.get("val", "").startswith("MESH:")]
    # 兼容 identifiers.org url 格式，以及 basicPropertyValues
    for bp in meta.get("basicPropertyValues", []):
        if bp.get("pred", "").endswith("exactMatch") and isinstance(bp.get("val", ""), str) and "mesh/D" in bp["val"]:
            mesh_id = bp["val"].split("/")[-1]  # 如 D000013
            mesh_ids_ref.append(mesh_id)
    # 提取MONDO编号（去掉前导0，保留数字）
    mondo_url = entry.get("id", "")
    if mondo_url.startswith("http://purl.obolibrary.org/obo/MONDO_"):
        mondo_num = mondo_url.split("MONDO_")[-1].lstrip("0")
    else:
        continue
    for mesh in mesh_ids + mesh_ids_ref:
        mesh_num = mesh.replace("MESH:", "").replace("D", "").lstrip("0") if mesh.startswith("D") else mesh.lstrip("0")
        mesh2mondo_num["D" + mesh_num.zfill(6)] = mondo_num
        mesh2mondo_num[mesh_num.zfill(6)] = mondo_num
        mesh2mondo_num[mesh_num] = mondo_num

disease_nodes = node_df[node_df["node_type"] == "disease"]
mondoid_to_index = disease_nodes.set_index("node_id")["node_index"].to_dict()
mondoid_to_name  = disease_nodes.set_index("node_id")["node_name"].to_dict()

def get_yinfo(disease_id):
    mesh_base = disease_id.replace("MESH:", "").replace("D", "").lstrip("0")
    mesh_key = "D" + mesh_base.zfill(6)
    mondo_num = None
    for key in [mesh_base.zfill(6), mesh_base, mesh_key]:
        if key in mesh2mondo_num:
            mondo_num = mesh2mondo_num[key]
            break
    if mondo_num is None:
        return pd.NA, pd.NA

    # node_id与mondo_num匹配（注意节点表里的node_id是去除前缀和前导0的数字字符串）
    if mondo_num in mondoid_to_index:
        return mondoid_to_index[mondo_num], mondoid_to_name[mondo_num]
    return pd.NA, pd.NA

y_indices, y_names = [], []
for disease_id in tqdm(df1["DiseaseID"], desc="匹配MONDO节点"):
    y_idx, y_nm = get_yinfo(disease_id)
    y_indices.append(y_idx)
    y_names.append(y_nm)

df1["y_index"] = y_indices
df1["y_name"] = y_names
df1["y_index"] = df1["y_index"].astype('Int64')

all_unique_diseaseids = df1["DiseaseID"].unique()
total_unique_diseaseids = len(all_unique_diseaseids)

unmatched_diseaseids = df1[df1["y_index"].isnull()]["DiseaseID"].unique()
unmatched_unique_disease = len(unmatched_diseaseids)
unmatched_unique_ratio = unmatched_unique_disease / total_unique_diseaseids

print(f"总 DiseaseID 唯一编号数：{total_unique_diseaseids}")
print(f"未匹配 DiseaseID 唯一编号数：{unmatched_unique_disease}")
print(f"未匹配 DiseaseID 唯一比例：{unmatched_unique_ratio:.4f}")

print(f"未匹配 DiseaseID 唯一编号如下：\n{unmatched_diseaseids}")

匹配MONDO节点: 100%|██████████| 42200/42200 [00:00<00:00, 1188379.56it/s]

总 DiseaseID 唯一编号数：1573
未匹配 DiseaseID 唯一编号数：727
未匹配 DiseaseID 唯一比例：0.4622
未匹配 DiseaseID 唯一编号如下：
['MESH:D000015' 'MESH:D000022' 'MESH:D000067877' 'MESH:D000068079'
 'MESH:D000070642' 'MESH:D000072660' 'MESH:D000075222' 'MESH:D000077192'
 'MESH:D000077195' 'MESH:D000077216' 'MESH:D000077273' 'MESH:D000138'
 'MESH:D000141' 'MESH:D000152' 'MESH:D000210' 'MESH:D000224'
 'MESH:D000312' 'MESH:D000361' 'MESH:D000382' 'MESH:D000402'
 'MESH:D000419' 'MESH:D000435' 'MESH:D000437' 'MESH:D000471'
 'MESH:D000506' 'MESH:D000544' 'MESH:D000647' 'MESH:D000690'
 'MESH:D000744' 'MESH:D000749' 'MESH:D000756' 'MESH:D000782'
 'MESH:D000783' 'MESH:D000784' 'MESH:D000787' 'MESH:D000853'
 'MESH:D000855' 'MESH:D000856' 'MESH:D000860' 'MESH:D001008'
 'MESH:D001010' 'MESH:D001019' 'MESH:D001049' 'MESH:D001068'
 'MESH:D001139' 'MESH:D001145' 'MESH:D001157' 'MESH:D001165'
 'MESH:D001169' 'MESH:D001171' 'MESH:D001201' 'MESH:D001228'
 'MESH:D001237' 'MESH:D001247' 'MESH:D001259' 'MESH:D001281'
 'MESH:D001284' 'MESH:D0




In [19]:
total_count = len(df1)
both_notna = df1.dropna(subset=["x_index", "y_index"])
both_count = len(both_notna)
both_ratio = both_count / total_count

print(f"x_index 和 y_index 均不缺失的样本数：{both_count} / 总数：{total_count}")
print(f"x_index 和 y_index 均不缺失的比例：{both_ratio:.4f}")

print(both_notna.head())

x_index 和 y_index 均不缺失的样本数：22146 / 总数：42200
x_index 和 y_index 均不缺失的比例：0.5248
    DrugID     DiseaseID  x_index            x_name  y_index  \
0  DB09140  MESH:D000013    14013            Oxygen    35598   
1  DB00730  MESH:D000013    15307     Thiabendazole    35598   
2  DB00898  MESH:D000013    14587           Ethanol    35598   
3  DB01168  MESH:D000013    14914      Procarbazine    35598   
4  DB00550  MESH:D000013    14640  Propylthiouracil    35598   

                   y_name  
0  congenital abnormality  
1  congenital abnormality  
2  congenital abnormality  
3  congenital abnormality  
4  congenital abnormality  


In [22]:
df1 = df1[["DrugID", "x_index", "x_name", "DiseaseID", "y_index", "y_name"]]
df2 = df1.dropna(subset=["x_index", "y_index"]).reset_index(drop=True)
print(df2.head())

    DrugID  x_index                 x_name     DiseaseID  y_index  \
0  DB00252    14141              Phenytoin  MESH:D006053    31268   
1  DB02709    14072            Resveratrol  MESH:D006323    84058   
2  DB01068    14294             Clonazepam  MESH:D008288    33128   
3  DB00900    14256             Didanosine  MESH:D010623    94787   
4  DB00688    14964  Mycophenolate mofetil  MESH:D011565    94709   

                              y_name  label  
0  oculo-auriculo-vertebral spectrum      0  
1                     cardiac arrest      0  
2                            malaria      0  
3                phencyclidine abuse      0  
4   pustulosis palmaris et plantaris      1  


  positives = df2_grouped.apply(lambda x: x.sample(min(len(x), per_group), random_state=SEED))


In [23]:
df1.to_csv("../data/benchmark/Kaggle_drug_repositioning/full_mapping.csv", index=False)
df2.to_csv("../data/benchmark/Kaggle_drug_repositioning/full_mapping_without_na.csv", index=False)