In [36]:
import pandas as pd

file_path = "/Users/azddza/Electromics-project/data/gtdb_taxonomy.tsv"
data = pd.read_csv(file_path, sep='\t')
unique_families = data['family'].dropna().unique().tolist()
unique_families

['Methanotrichaceae',
 'Methanospirillaceae',
 'Methanomethylophilaceae',
 'Methanomicrobiaceae',
 'Methanoculleaceae',
 'Methanobacteriaceae',
 'Sporomusaceae',
 'Dysgonomonadaceae',
 'DTU072',
 'Xanthomonadaceae',
 'JAAYUI01',
 'Hyphomicrobiaceae',
 'Moraxellaceae',
 'Rhodocyclaceae',
 'TTA-H9',
 'Tannerellaceae',
 'Cloacimonadaceae',
 'VadinHA17',
 'Anaerolineaceae',
 'Lenti-01',
 'Smithellaceae',
 'PJMF01',
 'Acutalibacteraceae',
 'UBA932',
 'Butyricicoccaceae',
 'Peptostreptococcaceae',
 'Rectinemataceae',
 'Sedimentibacteraceae',
 'Prolixibacteraceae',
 'DSUL01',
 'UBA10185',
 'UBA2241',
 'Anaerovoracaceae',
 'CAG-917',
 'Burkholderiaceae',
 'Syntrophorhabdaceae',
 'Synergistaceae',
 'CAG-698',
 'Syntrophomonadaceae',
 'DTU015',
 'UBA2242',
 'Dethiosulfovibrionaceae',
 'Vallitaleaceae',
 'Sulfurimonadaceae',
 'Desulfovibrionaceae',
 'DSM-18226',
 'Aminithiophilaceae',
 '46-47',
 'ML635J-15',
 'UBA5603',
 'Sphaerochaetaceae',
 'JAFGMB01',
 'Alkalibacteraceae',
 'Anaerosomataceae',

In [37]:
# read metadata（GTDB）
archaea_df = pd.read_csv("/Users/azddza/Electromics-project/data/ar53_metadata_r226.tsv", sep='\t')
bacteria_df = pd.read_csv("/Users/azddza/Electromics-project/data/bac120_metadata_r226.tsv", sep='\t')

gtdb_metadata = pd.concat([archaea_df, bacteria_df], ignore_index=True)

print(gtdb_metadata.columns.tolist())

['accession', 'ambiguous_bases', 'checkm2_completeness', 'checkm2_contamination', 'checkm2_model', 'checkm_completeness', 'checkm_contamination', 'checkm_marker_count', 'checkm_marker_lineage', 'checkm_marker_set_count', 'checkm_strain_heterogeneity', 'coding_bases', 'coding_density', 'contig_count', 'gc_count', 'gc_percentage', 'genome_size', 'gtdb_genome_representative', 'gtdb_representative', 'gtdb_taxonomy', 'gtdb_type_designation_ncbi_taxa', 'gtdb_type_designation_ncbi_taxa_sources', 'gtdb_type_species_of_genus', 'l50_contigs', 'l50_scaffolds', 'longest_contig', 'longest_scaffold', 'lsu_23s_contig_len', 'lsu_23s_count', 'lsu_23s_length', 'lsu_23s_query_id', 'lsu_5s_contig_len', 'lsu_5s_count', 'lsu_5s_length', 'lsu_5s_query_id', 'lsu_silva_23s_blast_align_len', 'lsu_silva_23s_blast_bitscore', 'lsu_silva_23s_blast_evalue', 'lsu_silva_23s_blast_perc_identity', 'lsu_silva_23s_blast_subject_id', 'lsu_silva_23s_taxonomy', 'mean_contig_length', 'mean_scaffold_length', 'mimag_high_qualit

In [38]:
# 合并 Archaea 和 Bacteria 数据
metadata = pd.concat([archaea_df, bacteria_df], ignore_index=True)

taxonomy_split = metadata['gtdb_taxonomy'].str.split(';', expand=True)

metadata['gtdb_family'] = taxonomy_split[4].str.replace('f__', '', regex=False)
metadata['gtdb_genus'] = taxonomy_split[5].str.replace('g__', '', regex=False)
metadata = metadata.dropna(subset=['gtdb_family', 'gtdb_genus'])

#  family → genus 
family_genus_dict = metadata.groupby("gtdb_family")["gtdb_genus"].unique().to_dict()


In [25]:
selected_family_genus = {fam: family_genus_dict.get(fam, []) for fam in unique_families if fam in family_genus_dict}
selected_family_genus

{'Methanotrichaceae': array(['Methanothrix_B', 'Methanothrix', 'Methanocrinis', 'JAFGMR01',
        'UBA204', 'MVQI01', 'JALHSR01', 'UBA114', 'Fen-7', 'JABLXE01',
        'DATMZB01', 'JAAEEW01'], dtype=object),
 'Methanospirillaceae': array(['Methanoregula', 'UBA288', 'Methanolinea_A', 'UBA9949',
        'Methanospirillum', 'CAIYHQ01', 'CAIKOD01', 'SD8', 'MVRE01',
        'JAFGOM01', 'JAPKXQ01', 'Methanolinea_B', 'Methanofilum',
        'Methanosphaerula', 'JBDKRP01', 'UBA467', 'JAJRBP01', 'KLAP105'],
       dtype=object),
 'Methanomethylophilaceae': array(['Methanoplasma', 'Methanomethylophilus', 'JBAZDV01',
        'Methanoprimaticola', 'VadinCA11', 'Methanomicula', 'JAKSHX01',
        'RumEn-M2', 'JAQUUA01', 'PWHV01', 'JALNYB01', 'Methanogranum',
        'Methanarcanum', 'SIG5', 'JAAYZC01', 'UBA9915', 'JAAYNL01'],
       dtype=object),
 'Methanomicrobiaceae': array(['Methanomicrobium', 'Methanogenium', 'Methanovulcanius', 'DSDF01',
        'DAOVRB01', 'Methanolacinia', 'Methanoplanu

In [39]:
# 扁平化所有属名
from itertools import chain
all_genus = list(set(chain.from_iterable(selected_family_genus.values())))

# 写入 accession_list.txt
with open("../results/accession_list.txt", "w") as f:
    for genus in all_genus:
        f.write(genus + "\n")

print(f"✅ 写入完成，共 {len(all_genus)} 个名称写入 accession_list.txt")

✅ 写入完成，共 3702 个名称写入 accession_list.txt


In [40]:
# 展平 genus 列表
target_genus = list(set(chain.from_iterable(selected_family_genus.values())))

# 筛选 metadata 中的 accession
target_metadata = metadata[metadata['gtdb_genus'].isin(target_genus)]

# 获取 accession 列表
accessions = target_metadata['accession'].tolist()

accessions

['GB_GCA_036773885.1',
 'GB_GCA_002499025.1',
 'GB_GCA_001316325.1',
 'GB_GCA_009778275.1',
 'GB_GCA_001560915.1',
 'RS_GCF_025992335.1',
 'GB_GCA_004332335.1',
 'GB_GCA_028700035.1',
 'GB_GCA_035392185.1',
 'RS_GCF_901111125.1',
 'GB_GCA_963589625.1',
 'RS_GCF_029972695.1',
 'GB_GCA_024167805.1',
 'RS_GCF_945868105.1',
 'GB_GCA_041668725.1',
 'RS_GCF_900316895.1',
 'GB_GCA_016839045.1',
 'GB_GCA_020855015.1',
 'GB_GCA_002495685.1',
 'GB_GCA_033810515.1',
 'GB_GCA_002504495.1',
 'GB_GCA_015063285.1',
 'GB_GCA_028700135.1',
 'GB_GCA_028700095.1',
 'RS_GCF_002067035.1',
 'GB_GCA_002502405.1',
 'RS_GCF_959607185.1',
 'GB_GCA_009780575.1',
 'GB_GCA_002497105.1',
 'RS_GCF_034480035.2',
 'RS_GCF_000151245.1',
 'GB_GCA_902496515.1',
 'GB_GCA_011391755.1',
 'GB_GCA_024399275.1',
 'GB_GCA_944320205.1',
 'GB_GCA_028685545.1',
 'GB_GCA_030156165.1',
 'GB_GCA_028679325.1',
 'RS_GCF_000762265.1',
 'GB_GCA_041676245.1',
 'GB_GCA_002505895.1',
 'GB_GCA_003157375.1',
 'RS_GCF_002208625.1',
 'GB_GCA_00

In [41]:
with open("../results/accession_list.txt", "w") as f:
    for acc in accessions:
        f.write(f"{acc}\n")