In [1]:
import pandas as pd
import mygene

# Initialize the mygene client (shared by the conversion helper defined below)
mg = mygene.MyGeneInfo()

In [4]:
# Load the drug-to-protein table; the file is tab-separated despite its .csv extension
df_drug2prot = pd.read_csv('drug2protein.csv', delimiter='\t')

# Show a header line followed by the first five rows
print("原始数据前5行:", df_drug2prot.head(), sep="\n")

原始数据前5行:
  diseaseFromSourceMappedId      diseaseId  ...  clinicalStatus   drugId
0               EFO_0000588    EFO_0000588  ...       Completed  DB01254
1               EFO_0000222    EFO_0000222  ...       Completed  DB01254
2             MONDO_0021063  MONDO_0021063  ...       Completed  DB01254
3               EFO_0000389    EFO_0000389  ...       Completed  DB01254
4               EFO_0000220    EFO_0000220  ...  Unknown status  DB01254

[5 rows x 7 columns]


In [5]:
# Collect every distinct, non-null targetId (Ensembl gene IDs, ENSG...) as a plain list
ensembl_ids = (
    df_drug2prot['targetId']
    .dropna()
    .unique()
    .tolist()
)

# Report how many unique IDs will be sent for conversion
print(f"需要转换的唯一 Ensembl Gene ID 数量: {len(ensembl_ids)}")

需要转换的唯一 Ensembl Gene ID 数量: 1381


In [6]:
def convert_ensg_to_entrez(ensg_list, client=None):
    """Batch-convert a list of Ensembl gene IDs to Entrez gene IDs.

    Args:
        ensg_list: List of Ensembl gene IDs to look up.
        client: Optional object exposing a mygene-compatible ``querymany``
            method. When omitted, the notebook-level ``mg`` client is used.

    Returns:
        dict mapping each queried ID present in the response to its Entrez
        gene ID, or to ``None`` when no hit for that ID carried an
        ``entrezgene`` field. An empty dict is returned for empty input.

    Side effects:
        Prints how many IDs actually received an Entrez ID, plus the count
        and the first five IDs of those that did not.
    """
    if not ensg_list:
        return {}
    if client is None:
        client = mg
    results = client.querymany(ensg_list, scopes='ensembl.gene', species='human', fields='entrezgene', returnall=True)

    mapping = {}
    for hit in results['out']:
        query = hit['query']
        entrez_id = hit.get('entrezgene')
        # A query may appear in several hits. Only fill the slot while it is
        # still empty, so a later hit lacking 'entrezgene' can never replace
        # an ID that was already found with None.
        if mapping.get(query) is None:
            mapping[query] = entrez_id

    # Make sure IDs reported as missing are represented in the mapping too.
    for query in results.get('missing', []):
        mapping.setdefault(query, None)

    # "Unmapped" means "no Entrez ID", which covers both IDs the service did
    # not find and IDs that were found but have no 'entrezgene' field.
    unmapped = [query for query, entrez_id in mapping.items() if entrez_id is None]
    print(f"成功映射: {len(mapping) - len(unmapped)} / {len(ensg_list)}")
    if unmapped:
        print(f"未成功映射的数量: {len(unmapped)}")
        print(f"前5个未映射的ID: {unmapped[:5]}")

    return mapping

# Run the conversion for all unique Ensembl IDs collected above
ensg_to_entrez = convert_ensg_to_entrez(ensembl_ids)

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed


成功映射: 1381 / 1381


In [7]:
# Add a 'targetId_entrez' column holding the Entrez ID looked up for each row's targetId
df_drug2prot['targetId_entrez'] = df_drug2prot['targetId'].map(ensg_to_entrez)

# Inspect the conversion result
print("\n转换后数据前5行:", df_drug2prot[['targetId', 'targetId_entrez']].head(), sep="\n")

# Rows without an Entrez ID are kept as-is. To drop them instead (optional):
# df_drug2prot = df_drug2prot.dropna(subset=['targetId_entrez']).copy()

# Build the output frame: 'targetId' now carries the Entrez ID and the helper
# column is removed. df_drug2prot itself keeps both columns for later use.
df_drug2prot_final = (
    df_drug2prot
    .assign(targetId=df_drug2prot['targetId_entrez'])
    .drop(columns=['targetId_entrez'])
)

# Write the result to a new CSV file
df_drug2prot_final.to_csv('drug2protein_primekg.csv', index=False)

print(f"\n最终文件 'drug2protein_primekg.csv' 已保存，共 {len(df_drug2prot_final)} 行。")


转换后数据前5行:
          targetId targetId_entrez
0  ENSG00000000938            2268
1  ENSG00000000938            2268
2  ENSG00000000938            2268
3  ENSG00000000938            2268
4  ENSG00000000938            2268

最终文件 'drug2protein_primekg.csv' 已保存，共 117893 行。
