In [10]:
import pandas as pd
import mygene
import requests
import time

In [6]:

# Step 1: read the raw table from CSV and show its size plus a short preview.
df_drug_target = pd.read_csv('drug2protein_primekg.csv')
print(f"原始数据行数: {len(df_drug_target)}")
# Caption and the first five rows, emitted on separate lines by one print call.
print("原始数据前5行:", df_drug_target.head(), sep="\n")


原始数据行数: 117893
原始数据前5行:
  diseaseFromSourceMappedId      diseaseId  ...  clinicalStatus   drugId
0               EFO_0000588    EFO_0000588  ...       Completed  DB01254
1               EFO_0000222    EFO_0000222  ...       Completed  DB01254
2             MONDO_0021063  MONDO_0021063  ...       Completed  DB01254
3               EFO_0000389    EFO_0000389  ...       Completed  DB01254
4               EFO_0000220    EFO_0000220  ...  Unknown status  DB01254

[5 rows x 7 columns]


In [2]:
# Step 2: gather every distinct disease ID found in either the
# diseaseFromSourceMappedId column or the diseaseId column.
# Missing values are dropped from each column before the two are unioned.
all_disease_ids = set(df_drug_target['diseaseFromSourceMappedId'].dropna()).union(
    df_drug_target['diseaseId'].dropna()
)
print(f"\n总共需要映射的唯一疾病ID数量: {len(all_disease_ids)}")



总共需要映射的唯一疾病ID数量: 2205


In [13]:
# Location of the mapping table; replace with your actual file path.
mapping_file = 'mondo.sssom.tsv'
# Tab-separated read (read_table defaults to sep='\t'); lines that start with
# '#' are skipped entirely.
# NOTE(review): comment='#' also discards the remainder of any data line after
# a later '#' character -- confirm no field in this file contains one.
mapping_df = pd.read_table(mapping_file, comment='#')


In [15]:
# Keep only the mapping rows whose target is a MeSH identifier.
# The MeSH ID lives in the object_id column, e.g. 'mesh:D004194'.
# Rows with a missing object_id are excluded (na=False); .copy() detaches the
# result from mapping_df so columns can be added to it afterwards.
mesh_mapping_df = mapping_df.loc[
    lambda frame: frame['object_id'].str.startswith('mesh:', na=False)
].copy()
mesh_mapping_df

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification
6,MONDO:0000001,disease,skos:exactMatch,mesh:D004194,,semapv:UnspecifiedMatching
14,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,mesh:D000309,,semapv:UnspecifiedMatching
27,MONDO:0000022,nocturnal enuresis,skos:exactMatch,mesh:D053206,,semapv:UnspecifiedMatching
56,MONDO:0000070,"Mycobacterium tuberculosis, susceptibility",skos:exactMatch,mesh:C536092,,semapv:UnspecifiedMatching
67,MONDO:0000082,pelvic organ prolapse,skos:exactMatch,mesh:D056887,,semapv:UnspecifiedMatching
...,...,...,...,...,...,...
108096,MONDO:8000011,"visceral neuropathy, familial, 1, autosomal re...",skos:exactMatch,mesh:C537394,,semapv:UnspecifiedMatching
108105,MONDO:8000014,familial antiphospholipid syndrome,skos:exactMatch,mesh:C531622,,semapv:UnspecifiedMatching
108111,MONDO:8000015,"46,XY sex reversal 11",skos:exactMatch,mesh:C537770,,semapv:UnspecifiedMatching
108117,MONDO:8000018,benign paroxysmal positional vertigo,skos:exactMatch,mesh:D065635,,semapv:UnspecifiedMatching


In [16]:
# Step 3: normalise the ID formats.
#   'MONDO:0000001' -> 'MONDO_0000001'  (every ':' becomes '_')
#   'mesh:D004194'  -> 'D004194'        (the 'mesh:' text is removed)
# regex=False is passed explicitly so both patterns are treated as literal
# text on every pandas version: the default for `regex` differs between
# pandas 1.x and 2.x, and 1.x emits a FutureWarning for the one-character
# pattern ':' when the argument is omitted.
mesh_mapping_df['subject_id_clean'] = mesh_mapping_df['subject_id'].str.replace(':', '_', regex=False)
mesh_mapping_df['object_id_clean'] = mesh_mapping_df['object_id'].str.replace('mesh:', '', regex=False)


In [17]:
# Step 4: build the lookup dictionary {cleaned subject ID -> cleaned MeSH ID}.
# One subject ID may map to several object IDs; only the first one in each
# group is kept.
disease_to_mesh = (
    mesh_mapping_df
    .groupby('subject_id_clean')['object_id_clean']
    .agg('first')
    .to_dict()
)


In [18]:
# Step 5: spot-check the lookup with a few sample IDs.
# IDs that are absent from disease_to_mesh are reported as 'Not Found'.
sample_ids = ['MONDO_0000001', 'MONDO_0003060', 'EFO_0000760']
print(
    *[f"{sid} -> {disease_to_mesh.get(sid, 'Not Found')}" for sid in sample_ids],
    sep="\n",
)


MONDO_0000001 -> D004194
MONDO_0003060 -> Not Found
EFO_0000760 -> Not Found


In [19]:
# Step 6: re-read the raw table and attach the mapped IDs as two new columns.
# Series.map with a dict leaves NaN wherever the ID is not a key of the dict.
# NOTE(review): the new columns carry a `_DrugBank` suffix but are filled from
# disease_to_mesh -- confirm the naming is intended.
df_drug_target = pd.read_csv('drug2protein_primekg.csv').assign(
    diseaseFromSourceMappedId_DrugBank=lambda frame: frame['diseaseFromSourceMappedId'].map(disease_to_mesh),
    diseaseId_DrugBank=lambda frame: frame['diseaseId'].map(disease_to_mesh),
)

# Step 7: write the augmented table to CSV (row index omitted).
output_file = 'drug2protein_primekg_aligned_to_drugbank_via_mondo_sssom.csv'
df_drug_target.to_csv(output_file, index=False)

print(f"\n对齐完成！结果已保存至: {output_file}")
# The summary counts non-null values of diseaseFromSourceMappedId_DrugBank only;
# the diseaseId_DrugBank column is not included in this figure.
print(f"成功映射的行数: {df_drug_target['diseaseFromSourceMappedId_DrugBank'].notnull().sum()} / {len(df_drug_target)}")


对齐完成！结果已保存至: drug2protein_primekg_aligned_to_drugbank_via_mondo_sssom.csv
成功映射的行数: 18930 / 117893
