# 根据基因符号查询 Entrez Gene 数据库并找出物种

In [11]:
from Bio import Entrez

def get_species_name(gene_symbol):
    """Look up the species (taxon name) of a gene through NCBI Entrez.

    The NCBI ``gene`` database is searched with ``gene_symbol`` as the query
    term; the record of the first hit is then fetched as XML and its
    organism name is read from the ``Org-ref_taxname`` field.

    Args:
        gene_symbol: Query term passed to ``Entrez.esearch``.

    Returns:
        The taxon name of the first hit, or the string ``"error"`` when the
        search returns no records or any step raises. A message is printed
        in both failure cases; no exception propagates to the caller.
    """
    # Entrez requests need a contact email; replace with your own address.
    Entrez.email = "1525477249@qq.com"

    try:
        # Step 1: resolve the query term to Entrez Gene IDs.
        handle = Entrez.esearch(db="gene", term=gene_symbol)
        try:
            search_result = Entrez.read(handle)
        finally:
            # Close the handle even if parsing fails, so it is not leaked.
            handle.close()

        if int(search_result["Count"]) <= 0:
            print(f"未找到基因 {gene_symbol} 的记录。")
            return "error"

        # Only the first hit is used.
        gene_id = search_result["IdList"][0]

        # Step 2: fetch the full gene record for that ID.
        handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml")
        try:
            gene_records = Entrez.read(handle)
        finally:
            handle.close()

        # Step 3: walk the parsed XML down to the organism's taxon name.
        org_ref = gene_records[0]['Entrezgene_source']['BioSource']['BioSource_org']['Org-ref']
        return org_ref['Org-ref_taxname']
    except Exception as e:
        # Best-effort lookup: report the problem and return the sentinel
        # instead of aborting the caller.
        print(f"获取{gene_symbol}物种名称时出错:", str(e))
        return "error"

# Usage example
gene_symbol = "9530036O11Rik"
species_name = get_species_name(gene_symbol)
# get_species_name signals failure by returning the string "error", which is
# truthy, so compare against that sentinel instead of relying on truthiness.
if species_name and species_name != "error":
    print(f"{gene_symbol} 的物种名称是: {species_name}")
else:
    print(f"无法获取 {gene_symbol} 的物种名称。")


9530036O11Rik 的物种名称是: Mus musculus


# 下载原始数据获取 species

In [24]:
import os
import csv
import pandas as pd
 
# Build ./benchmark/benchmark.csv by attaching a species to every record of
# ./benchmark/data.txt. Each data line is whitespace-separated as
# "<name> <seq> <label> ..."; the species is the 'Organism' value of the first
# spreadsheet row whose Gene_Name and Subcellular_Localization equal the
# record's name and label.
df = pd.read_excel('./benchmark/RNALocate.xlsx', usecols=['Gene_Name', 'Organism','Subcellular_Localization'])

# Index the annotation table once instead of scanning the whole DataFrame for
# every input line. setdefault keeps the FIRST matching row.
species_by_key = {}
for gene, localization, organism in zip(
    df['Gene_Name'], df['Subcellular_Localization'], df['Organism']
):
    species_by_key.setdefault((gene, localization), organism)

# Open the output a single time (header + all rows) rather than reopening it
# in append mode for each record.
with open('./benchmark/benchmark.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['name','species', 'label','seq'])

    with open("./benchmark/data.txt", mode='r', encoding='utf-8') as data_file:
        for line_no, line in enumerate(data_file, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            if len(fields) < 3:
                raise ValueError(
                    f"data.txt line {line_no}: expected at least 3 fields "
                    f"(name, seq, label), got {len(fields)}"
                )
            # Extra trailing fields are ignored.
            name, seq, label = fields[:3]

            # Raise explicitly instead of using assert, which is stripped
            # when Python runs with -O.
            try:
                species = species_by_key[(name, label)]
            except KeyError:
                raise ValueError(
                    f"没有找到匹配的行: name={name!r}, label={label!r} "
                    f"(data.txt line {line_no})"
                ) from None

            writer.writerow([name, species, label, seq])