# Generate a metadata table for genomes

### Dependencies

In [1]:
import pandas as pd

### Input files

External maps

In [2]:
# IMG IDs (two-column, tab-separated map keyed by genome ID; parsed into a dict below)
img_fp = '../catalogs/img/gid2img.txt'

# GTDB IDs (two-column, tab-separated map keyed by genome ID; parsed into a dict below)
gtdb_fp = '../catalogs/GTDB/r86.1/wol2gtdb.txt'

95,769 genomes in total

In [3]:
# RepoPhlAn report (table read with its first column as the genome ID index)
repophlan_fp = 'metrics/repophlan.tsv.xz'

# NCBI taxonomy by rank (one column per rank; the 'kingdom' column is dropped on load)
taxonomy_fp = '../taxonomy/maps/all/rank_names.tsv.xz'

# QUAST report (assembly statistics; only a subset of its columns is kept)
quast_fp = 'metrics/quast.tsv.xz'

86,200 prokaryotic genomes with DNA

In [4]:
# prokaryotic genome list (one genome ID per line; used to subset the table)
prok_list = 'glists/fna_prok.txt'

# Prodigal report (columns: prots, aa, bp)
prodigal_fp = 'metrics/prodigal.tsv.xz'

# CheckM report (completeness, contamination and strain heterogeneity are used)
checkm_fp = 'metrics/checkm.tsv.xz'

# RNAmmer report (per-genome counts of 5S, 16S and 23S rRNA genes)
rnammer_fp = 'metrics/rnammer.tsv.xz'

# Aragorn report (per-genome counts of tRNA genes by amino acid)
aragorn_fp = 'metrics/aragorn.tsv.xz'

# Major groups (genome ID -> group name, at two levels)
lv1_fp = '../taxonomy/major/lv1.txt'
lv2_fp = '../taxonomy/major/lv2.txt'

# Marker gene profile (summed per genome to give the '# markers' column)
markers_fp = '../markers/phylophlan/phyletics.tsv.bz2'

# Scope (genome ID -> scope term)
scope_fp = 'metrics/scope.txt'

10,575 genomes included in phylogeny

In [5]:
# included genome list (one genome ID per line; these genomes are flagged as 'selected')
in_list = 'glists/in.txt'

# neighbors for the extra genomes (two-column, tab-separated map keyed by genome ID)
neighbors_fp = '../trees/expansion/neighbors.txt'

### Identifier

#### RepoPhlAn

Genomes were downloaded from NCBI RefSeq and GenBank using RepoPhlAn.

Read RepoPhlAn report

In [6]:
# load the RepoPhlAn report with its first column as the index,
# then order the rows by that index
meta = pd.read_table(repophlan_fp, index_col=0)
meta = meta.sort_index()
meta.shape

(95769, 36)

Select and reorder fields

In [7]:
# list every available column on one line, to choose which ones to keep
', '.join(meta.columns)

'all_coding_data, all_data, asm_name, ass_id, assembly_accession, assembly_level, bioproject, biosample, dwlf, faa_lname, ffn_lname, fna_lname, frn_lname, ftp_path, gbrs_paired_asm, genome, genome_rep, infraspecific_name, isolate, organism_name, outdir, paired_asm_comp, refseq_category, release_type, score_faa, score_fna, score_rrna, score_trna, seq_rel_date, species_taxid, submitter, taxid, taxonomy, version_status, wgs_master, excluded_from_refseq'

In [8]:
# Columns to keep, grouped by category. field_order() concatenates the groups
# in the order identity, category, taxonomy, statistics, extra; later cells
# append to these lists as new columns are added to the table.
fields = {
    'identity': ['asm_name', 'assembly_accession', 'bioproject', 'biosample', 'wgs_master', 'seq_rel_date',
                 'submitter', 'ftp_path'],
    'category': ['assembly_level', 'genome_rep', 'refseq_category', 'release_type'],
    'taxonomy': ['taxid', 'species_taxid', 'organism_name', 'infraspecific_name', 'isolate'],
    'statistics': ['score_faa', 'score_fna', 'score_rrna', 'score_trna'],
    'extra': []
}

In [9]:
def field_order():
    """Return a new list of all registered column names, group by group.

    The groups of the module-level ``fields`` dict are concatenated in the
    order identity, category, taxonomy, statistics, extra.
    """
    ordered = []
    for group in ('identity', 'category', 'taxonomy', 'statistics', 'extra'):
        ordered.extend(fields[group])
    return ordered

In [10]:
# keep only the registered columns, in group order
meta = meta[field_order()]
meta.head(3)

Unnamed: 0_level_0,asm_name,assembly_accession,bioproject,biosample,wgs_master,seq_rel_date,submitter,ftp_path,assembly_level,genome_rep,...,release_type,taxid,species_taxid,organism_name,infraspecific_name,isolate,score_faa,score_fna,score_rrna,score_trna
#genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
G000001985,JCVI-PMFA1-2.0,GCF_000001985.1,PRJNA32665,SAMN02953685,ABAR00000000.1,2008/10/29,J. Craig Venter Institute,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,Scaffold,Full,...,Major,441960,37727,Talaromyces marneffei ATCC 18224,strain=ATCC 18224,,1.0,0.829,0.1,0.9
G000002415,ASM241v2,GCF_000002415.2,PRJNA20431,SAMN02953638,AAKM00000000.1,2009/05/06,TIGR,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,Chromosome,Full,...,Major,5855,5855,Plasmodium vivax,,Salvador I,0.98,0.494,0.1,1.0
G000002495,MG8,GCF_000002495.2,PRJNA1433,SAMN02953596,AACU00000000.3,2011/10/14,International Rice Blast Genome Consortium,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,Chromosome,Full,...,Major,242507,318829,Magnaporthe oryzae 70-15,strain=70-15,,1.0,0.95,0.4,1.0


#### IMG

Add IMG identifiers

In [11]:
# parse the two-column, tab-delimited file into a dict keyed by genome ID
with open(img_fp, 'r') as fh:
    pairs = (line.split('\t') for line in fh.read().splitlines())
    img = dict(pairs)
len(img)

57859

In [12]:
# look up each genome ID (the index) in the IMG map; IDs missing from the map yield NaN
meta['img_id'] = meta.index.to_series().map(img)
# register the new column so that field_order() includes it
fields['identity'].append('img_id')

#### GTDB

Add GTDB r86.1 identifiers

In [13]:
# parse the two-column, tab-delimited file into a dict keyed by genome ID
with open(gtdb_fp, 'r') as fh:
    pairs = (line.split('\t') for line in fh.read().splitlines())
    gtdb = dict(pairs)
len(gtdb)

124185

In [14]:
# look up each genome ID (the index) in the GTDB map; IDs missing from the map yield NaN
meta['gtdb_id'] = meta.index.to_series().map(gtdb)
# register the new column so that field_order() includes it
fields['identity'].append('gtdb_id')

In [15]:
# reorder so the newly added identifier columns sit with the identity group
meta = meta[field_order()]

#### Scope

In [16]:
# first column becomes the index; the value column is named 'scope'
# (passing `names` presumably means the file has no header row; confirm)
scope = pd.read_table(scope_fp, index_col=0, names=['scope'])
# tally genomes per scope term
scope['scope'].value_counts()

Monoisolate     58508
Multiisolate    12908
Multispecies     8720
Environment      5753
SingleCell        215
Other              70
Synthetic           4
Name: scope, dtype: int64

In [17]:
# register the column before the concat below, which reindexes by field_order()
fields['identity'].append('scope')

In [18]:
# column-wise concat aligns rows on the index (genome ID); genomes without a
# scope entry get NaN. Columns are then put back into group order.
meta = pd.concat([meta, scope], axis=1, sort=False)[field_order()]
meta[fields['identity']].head(3)

Unnamed: 0,asm_name,assembly_accession,bioproject,biosample,wgs_master,seq_rel_date,submitter,ftp_path,img_id,gtdb_id,scope
G000001985,JCVI-PMFA1-2.0,GCF_000001985.1,PRJNA32665,SAMN02953685,ABAR00000000.1,2008/10/29,J. Craig Venter Institute,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,,
G000002415,ASM241v2,GCF_000002415.2,PRJNA20431,SAMN02953638,AAKM00000000.1,2009/05/06,TIGR,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,,
G000002495,MG8,GCF_000002495.2,PRJNA1433,SAMN02953596,AACU00000000.3,2011/10/14,International Rice Blast Genome Consortium,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,,,


### Taxonomy

#### NCBI taxonomy

Add NCBI taxonomy

In [19]:
# load taxon names per rank (first column as index) and discard the 'kingdom' column
rank_names = pd.read_table(taxonomy_fp, index_col=0).drop('kingdom', axis=1)
rank_names.head(3)

Unnamed: 0_level_0,superkingdom,phylum,class,order,family,genus,species
genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G000001985,Eukaryota,Ascomycota,Eurotiomycetes,Eurotiales,Trichocomaceae,Talaromyces,Talaromyces marneffei
G000002415,Eukaryota,Apicomplexa,Aconoidasida,Haemosporida,Plasmodiidae,Plasmodium,Plasmodium vivax
G000002495,Eukaryota,Ascomycota,Sordariomycetes,Magnaporthales,Magnaporthaceae,Magnaporthe,Magnaporthe oryzae


In [20]:
# register every rank column so that field_order() includes them
fields['taxonomy'] += rank_names.columns.tolist()

In [21]:
# attach the rank columns (rows aligned on genome ID) and restore group order
meta = pd.concat([meta, rank_names], axis=1, sort=False)[field_order()]
meta[fields['taxonomy']].head()

Unnamed: 0_level_0,taxid,species_taxid,organism_name,infraspecific_name,isolate,superkingdom,phylum,class,order,family,genus,species
genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
G000001985,441960,37727,Talaromyces marneffei ATCC 18224,strain=ATCC 18224,,Eukaryota,Ascomycota,Eurotiomycetes,Eurotiales,Trichocomaceae,Talaromyces,Talaromyces marneffei
G000002415,5855,5855,Plasmodium vivax,,Salvador I,Eukaryota,Apicomplexa,Aconoidasida,Haemosporida,Plasmodiidae,Plasmodium,Plasmodium vivax
G000002495,242507,318829,Magnaporthe oryzae 70-15,strain=70-15,,Eukaryota,Ascomycota,Sordariomycetes,Magnaporthales,Magnaporthaceae,Magnaporthe,Magnaporthe oryzae
G000002515,28985,28985,Kluyveromyces lactis,strain=NRRL Y-1140,,Eukaryota,Ascomycota,Saccharomycetes,Saccharomycetales,Saccharomycetaceae,Kluyveromyces,Kluyveromyces lactis
G000002525,284591,4952,Yarrowia lipolytica CLIB122,strain=CLIB122,,Eukaryota,Ascomycota,Saccharomycetes,Saccharomycetales,Dipodascaceae,Yarrowia,Yarrowia lipolytica


#### Unclassified taxa

"Unclassified taxa" are defined as those without phylum to genus classifications.

In [22]:
# IDs of genomes that have a value at one or more of the ranks phylum..genus
mid_ranks = ['phylum', 'class', 'order', 'family', 'genus']
classified = meta[mid_ranks].dropna(how='all').index.tolist()

In [23]:
# boolean flag: True if the genome ID is in the `classified` list
meta['classified'] = meta.index.isin(classified)
meta['classified'].value_counts()

True     93571
False     2198
Name: classified, dtype: int64

In [24]:
# register the new flag column so that field_order() includes it
fields['taxonomy'].append('classified')

#### Unique name

The following columns matter:

In [25]:
# preview the four columns from which the unique name is composed
meta[['organism_name', 'infraspecific_name', 'isolate', 'asm_name']].head()

Unnamed: 0_level_0,organism_name,infraspecific_name,isolate,asm_name
genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G000001985,Talaromyces marneffei ATCC 18224,strain=ATCC 18224,,JCVI-PMFA1-2.0
G000002415,Plasmodium vivax,,Salvador I,ASM241v2
G000002495,Magnaporthe oryzae 70-15,strain=70-15,,MG8
G000002515,Kluyveromyces lactis,strain=NRRL Y-1140,,ASM251v1
G000002525,Yarrowia lipolytica CLIB122,strain=CLIB122,,ASM252v1


Generate a full name for each genome

In [26]:
def assign_full_name(row):
    """Compose a descriptive name for one genome from its assembly fields.

    Starting from ``organism_name``, the following are appended in order:

    1. the infraspecific designation (value after ``strain=``, ``cultivar=``
       or ``ecotype=``), unless the name already ends with it;
    2. the isolate, prefixed with ``isolate``, unless the name already ends
       with it;
    3. the assembly name, only if the name is still identical to the bare
       species name and the assembly name is present and differs from the
       species, organism and isolate names.

    Parameters
    ----------
    row : object with attributes ``organism_name``, ``infraspecific_name``,
        ``isolate``, ``species`` and ``asm_name`` (e.g. a DataFrame row).

    Returns
    -------
    str
        The composed name.

    Raises
    ------
    ValueError
        If ``infraspecific_name`` is present but is not of the form
        ``kind=value`` with kind in (strain, cultivar, ecotype).
    """
    name = row.organism_name

    # infraspecific name
    infra = row.infraspecific_name
    if pd.notnull(infra):
        if '=' not in infra:
            raise ValueError('Invalid infraspecific name: %s.' % infra)
        kind, spp = infra.split('=', 1)
        if kind not in ('strain', 'cultivar', 'ecotype'):
            raise ValueError('Invalid infraspecific name: %s.' % infra)
        # keep only the primary designation: "X (=Y)" -> "X", "X;Y" -> "X"
        spp = spp.split(' (=')[0].split(';')[0]
        if not name.endswith(' %s' % spp):
            name += ' %s' % spp

    # isolate
    isolate = row.isolate
    if pd.notnull(isolate):
        if not name.endswith(' %s' % isolate):
            name += ' isolate %s' % isolate

    # asm name: only when nothing else distinguishes the genome from its
    # species; a missing assembly name must not be appended as "nan"
    asm_name = row.asm_name
    if (name == row.species and pd.notnull(asm_name)
            and asm_name not in (row.species, row.organism_name, row.isolate)):
        name += ' %s' % asm_name

    return name

In [27]:
# compose a name for every genome, one row at a time
meta['unique_name'] = meta.apply(assign_full_name, axis=1)
meta['unique_name'].head()

genome
G000001985       Talaromyces marneffei ATCC 18224
G000002415    Plasmodium vivax isolate Salvador I
G000002495               Magnaporthe oryzae 70-15
G000002515       Kluyveromyces lactis NRRL Y-1140
G000002525            Yarrowia lipolytica CLIB122
Name: unique_name, dtype: object

But there are still duplicates

In [28]:
# keep=False flags every member of a group of identical names, not only the repeats
meta['name_is_dup'] = meta['unique_name'].duplicated(keep=False)
meta['name_is_dup'].value_counts()

False    92684
True      3085
Name: name_is_dup, dtype: int64

Append ASM name to duplicates

In [29]:
def append_asm_name(row):
    """Append the assembly name to a name that is flagged as duplicated.

    Parameters
    ----------
    row : object with attributes ``unique_name``, ``name_is_dup``,
        ``asm_name``, ``species``, ``organism_name`` and ``isolate``.

    Returns
    -------
    str
        ``unique_name`` followed by the assembly name if the row is flagged
        as a duplicate and the assembly name is present and differs from the
        species, organism and isolate names; otherwise ``unique_name`` as is.
    """
    name = row.unique_name
    asm_name = row.asm_name
    # test truthiness rather than identity: a numpy.bool_ flag is never `is True`
    if (row.name_is_dup and pd.notnull(asm_name)
            and asm_name not in (row.species, row.organism_name, row.isolate)):
        name += ' %s' % asm_name
    return name

In [30]:
# disambiguate flagged names with the assembly name, then re-flag what is still duplicated
meta['unique_name'] = meta.apply(append_asm_name, axis=1)
meta['name_is_dup'] = meta['unique_name'].duplicated(keep=False)
meta['name_is_dup'].value_counts()

False    95710
True        59
Name: name_is_dup, dtype: int64

Finally, append genome ID to duplicates

In [31]:
def append_genome_id(row):
    """Append the genome ID, in parentheses, to a name flagged as duplicated.

    Parameters
    ----------
    row : object with attributes ``unique_name`` and ``name_is_dup``, and
        whose ``name`` attribute is the genome ID (for a DataFrame row this
        is the row's index label).

    Returns
    -------
    str
        ``unique_name`` followed by `` (<genome ID>)`` if the row is flagged
        as a duplicate; otherwise ``unique_name`` unchanged.
    """
    name = row.unique_name
    # test truthiness rather than identity: a numpy.bool_ flag is never `is True`
    if row.name_is_dup:
        name += ' (%s)' % row.name
    return name

In [32]:
# last resort: genome IDs are index labels, so appending them separates the remaining duplicates
meta['unique_name'] = meta.apply(append_genome_id, axis=1)
meta['name_is_dup'] = meta['unique_name'].duplicated(keep=False)
meta['name_is_dup'].value_counts()

False    95769
Name: name_is_dup, dtype: int64

In [33]:
# the helper flag has served its purpose; only 'unique_name' is kept and registered
meta.drop('name_is_dup', axis=1, inplace=True)
fields['taxonomy'].append('unique_name')

In [34]:
# save the genome ID -> unique name mapping as a tab-separated file
# NOTE(review): whether a header row is written depends on the installed pandas
# version's default for Series.to_csv; confirm against the consumers of this file
meta['unique_name'].to_csv('naming.tsv', sep='\t')

#### Major groups

In [35]:
# level-1 group per genome: first column becomes the index, value column named 'lv1_group'
lv1 = pd.read_table(lv1_fp, index_col=0, names=['lv1_group'])
lv1['lv1_group'].value_counts()

Eubacteria    82720
CPR            2356
Archaea        1124
Name: lv1_group, dtype: int64

In [36]:
# level-2 group per genome: first column becomes the index, value column named 'lv2_group'
lv2 = pd.read_table(lv2_fp, index_col=0, names=['lv2_group'])
lv2['lv2_group'].value_counts()

Proteobacteria    37109
Firmicutes        28867
Actinobacteria    10854
Bacteroidetes      2091
Parcubacteria      1356
Bacteria            963
Spirochaetes        743
Euryarchaeota       708
Microgenomates      705
Terrabacteria       519
Cyanobacteria       479
PVC                 305
CPR                 295
FCB                 291
Chlamydiae          289
Chloroflexi         210
Crenarchaeota       188
TACK                129
DPANN                57
Archaea              32
Asgard               10
Name: lv2_group, dtype: int64

In [37]:
# register both group columns before the concat below reindexes by field_order()
fields['taxonomy'] += ['lv1_group', 'lv2_group']

In [38]:
# attach both group columns (rows aligned on genome ID); genomes absent from
# the group files get NaN
meta = pd.concat([meta, lv1, lv2], axis=1, sort=False)[field_order()]
meta[fields['taxonomy']].head(3)

Unnamed: 0,taxid,species_taxid,organism_name,infraspecific_name,isolate,superkingdom,phylum,class,order,family,genus,species,classified,unique_name,lv1_group,lv2_group
G000001985,441960,37727,Talaromyces marneffei ATCC 18224,strain=ATCC 18224,,Eukaryota,Ascomycota,Eurotiomycetes,Eurotiales,Trichocomaceae,Talaromyces,Talaromyces marneffei,True,Talaromyces marneffei ATCC 18224,,
G000002415,5855,5855,Plasmodium vivax,,Salvador I,Eukaryota,Apicomplexa,Aconoidasida,Haemosporida,Plasmodiidae,Plasmodium,Plasmodium vivax,True,Plasmodium vivax isolate Salvador I,,
G000002495,242507,318829,Magnaporthe oryzae 70-15,strain=70-15,,Eukaryota,Ascomycota,Sordariomycetes,Magnaporthales,Magnaporthaceae,Magnaporthe,Magnaporthe oryzae,True,Magnaporthe oryzae 70-15,,


### Quality

#### QUAST

Assembly statistics computed by QUAST

In [39]:
# load the QUAST report with its first column as the index
quast = pd.read_table(quast_fp, index_col=0)
quast.shape

(95639, 21)

In [40]:
# list every QUAST column on one line, to choose which ones to keep
', '.join(quast.columns)

"# contigs (>= 0 bp), # contigs (>= 1000 bp), # contigs (>= 5000 bp), # contigs (>= 10000 bp), # contigs (>= 25000 bp), # contigs (>= 50000 bp), Total length (>= 0 bp), Total length (>= 1000 bp), Total length (>= 5000 bp), Total length (>= 10000 bp), Total length (>= 25000 bp), Total length (>= 50000 bp), # contigs, Largest contig, Total length, GC (%), N50, N75, L50, L75, # N's per 100 kbp"

In [41]:
# keep five summary statistics; the per-threshold breakdowns are discarded
quast = quast[['Total length', '# contigs', 'GC (%)', 'N50', 'L50']]

In [42]:
# register the retained QUAST columns so that field_order() includes them
fields['statistics'] += quast.columns.tolist()

In [43]:
# attach the QUAST columns (rows aligned on genome ID); genomes without a
# QUAST record get NaN, which is why the integer columns display as floats
meta = pd.concat([meta, quast], axis=1, sort=False)[field_order()]
meta[fields['statistics']].head()

Unnamed: 0,score_faa,score_fna,score_rrna,score_trna,Total length,# contigs,GC (%),N50,L50
G000001985,1.0,0.829,0.1,0.9,28643865.0,452.0,46.67,3339384.0,4.0
G000002415,0.98,0.494,0.1,1.0,27013691.0,2748.0,42.28,1678596.0,6.0
G000002495,1.0,0.95,0.4,1.0,40979121.0,53.0,51.61,6606598.0,3.0
G000002515,0.99,0.994,0.3,1.0,10729447.0,7.0,38.72,1753957.0,3.0
G000002525,1.0,0.991,1.0,1.0,20550897.0,7.0,48.99,3633272.0,3.0


#### Prokaryotes (86k)

Subset to the 86K prokaryotic genomes

In [44]:
# read the prokaryotic genome IDs, one per line
with open(prok_list, 'r') as f:
    gs = f.read().splitlines()

In [45]:
# keep only the prokaryotic genomes; take an explicit copy so that the column
# assignments in later cells modify an independent frame instead of a
# selection of the full table (which triggers SettingWithCopyWarning)
meta = meta[meta.index.isin(gs)].copy()
meta.shape

(86200, 40)

In [46]:
# These QUAST statistics are whole numbers but were stored as floats while the
# table still contained genomes without a QUAST record. Cast them in a single
# astype call that rebinds `meta`, instead of assigning column by column into
# a filtered frame. As before, the cast raises if any value is missing.
int_fields = ['Total length', '# contigs', 'N50', 'L50']
meta = meta.astype({x: int for x in int_fields})

#### Prodigal

ORFs identified by Prodigal

In [47]:
# load the Prodigal report with its first column as the index
prodigal = pd.read_table(prodigal_fp, index_col=0)
prodigal.shape

(86200, 3)

In [48]:
# show the raw column names before they are renamed
', '.join(prodigal.columns)

'prots, aa, bp'

In [49]:
# positional rename: prots -> '# proteins', aa -> 'Protein length', bp -> 'Coding length'
prodigal.columns = ['# proteins', 'Protein length', 'Coding length']

In [50]:
# attach the Prodigal columns (rows aligned on genome ID); column order is
# restored by a later field_order() reindex
meta = pd.concat([meta, prodigal], axis=1, sort=False)

Calculate coding density

In [51]:
# coding length (Prodigal 'bp') as a percentage of the total assembly length (QUAST)
meta['Coding density'] = meta['Coding length'] / meta['Total length'] * 100

In [52]:
# 'Coding length' was only needed to derive the density; register the columns that remain
meta.drop('Coding length', axis=1, inplace=True)
fields['statistics'] += ['# proteins', 'Protein length', 'Coding density']

#### CheckM

Bin quality statistics measured by CheckM

In [53]:
# load the CheckM report with its first column as the index
checkm = pd.read_table(checkm_fp, index_col=0)
checkm.shape

  interactivity=interactivity, compiler=compiler, result=result)


(86200, 29)

In [54]:
# list every CheckM column on one line, to choose which ones to keep
', '.join(checkm.columns)

'Marker lineage, # genomes, # markers, # marker sets, Completeness, Contamination, Strain heterogeneity, Genome size (bp), # ambiguous bases, # scaffolds, # contigs, N50 (scaffolds), N50 (contigs), Mean scaffold length (bp), Mean contig length (bp), Longest scaffold (bp), Longest contig (bp), GC, GC std (scaffolds > 1kbp), Coding density, Translation table, # predicted genes, 0, 1, 2, 3, 4, 5+, Unnamed: 29'

In [55]:
# keep the three quality metrics; the other CheckM columns are discarded
checkm = checkm[['Completeness', 'Contamination', 'Strain heterogeneity']]

In [56]:
# register the retained CheckM columns so that field_order() includes them
fields['statistics'] += checkm.columns.tolist()

In [57]:
# attach the CheckM columns (rows aligned on genome ID) and restore group order,
# which also places the Prodigal columns added earlier
meta = pd.concat([meta, checkm], axis=1, sort=False)[field_order()]
meta[fields['statistics']].head()

Unnamed: 0,score_faa,score_fna,score_rrna,score_trna,Total length,# contigs,GC (%),N50,L50,# proteins,Protein length,Coding density,Completeness,Contamination,Strain heterogeneity
G000003135,1.0,0.631,1.0,1.0,2396359,114,60.25,323391,3,2002,714009,87.798698,100.0,0.0,0.0
G000003215,0.96,0.93,0.9,0.7,4127750,15,28.52,4094363,1,3688,1171790,83.639586,99.51,0.37,0.0
G000003645,1.0,0.685,0.4,1.0,5269725,1,35.34,5269725,1,5382,1522268,85.09019,99.07,0.02,0.0
G000003925,1.0,0.735,1.0,1.0,5561906,1,35.21,5561906,1,5690,1566356,82.952589,99.32,0.02,0.0
G000003955,1.0,0.583,1.0,1.0,5790501,1,35.24,5790501,1,6005,1647986,83.829327,97.17,0.76,0.0


#### Marker profile

Single-copy marker genes identified by PhyloPhlAn

In [58]:
# row-wise sum of the marker profile: one total per genome (a Series keyed by the first column)
markers = pd.read_table(markers_fp, index_col=0).sum(axis=1)
len(markers)

86200

In [59]:
# look up each genome ID in the per-genome totals and register the new column
meta['# markers'] = meta.index.to_series().map(markers)
fields['statistics'].append('# markers')

#### RNAmmer

rRNA genes identified by RNAmmer

In [60]:
# load per-genome rRNA gene counts with the first column as the index
rnammer = pd.read_table(rnammer_fp, index_col=0)
rnammer.head()

Unnamed: 0_level_0,5s_rRNA,16s_rRNA,23s_rRNA
#genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G000003135,1,1,1
G000003215,3,5,3
G000003645,3,1,1
G000003925,5,1,1
G000003955,5,1,1


In [61]:
# reduce the counts to presence/absence ('yes' if the count is non-zero, else 'no')
rrna_presence = rnammer.astype(bool).replace({True: 'yes', False: 'no'})
fields['statistics'] += rnammer.columns.tolist()
meta = pd.concat([meta, rrna_presence], axis=1, sort=False)
meta = meta[field_order()]

#### Aragorn

tRNA genes identified by Aragorn

In [62]:
# load per-genome tRNA gene counts with the first column as the index
aragorn = pd.read_table(aragorn_fp, index_col=0)
aragorn.head()

Unnamed: 0_level_0,tRNA-???,tRNA-Ala,tRNA-Arg,tRNA-Asn,tRNA-Asp,tRNA-Cys,tRNA-Gln,tRNA-Glu,tRNA-Gly,tRNA-His,...,tRNA-Pyl,tRNA-SeC,tRNA-Ser,tRNA-Stop,tRNA-Thr,tRNA-Trp,tRNA-Tyr,tRNA-Val,tmRNA,tmRNA*
#genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
G000003135,0,4,5,3,2,1,2,2,5,1,...,0,0,4,0,5,1,1,5,1,0
G000003215,0,0,1,0,0,2,1,1,3,2,...,2,1,4,0,2,1,1,2,1,0
G000003645,0,3,2,4,4,1,4,7,7,1,...,0,0,4,0,3,2,3,6,1,0
G000003925,0,4,4,5,4,1,4,7,7,1,...,0,0,5,0,5,2,3,6,1,0
G000003955,0,4,4,5,6,1,4,7,8,2,...,0,0,7,0,6,2,4,5,1,0


In [63]:
# three-letter codes of the 20 amino acids whose tRNA columns are kept;
# all other columns (e.g. tRNA-???, tRNA-Pyl, tRNA-SeC, tmRNA) are dropped
aa20 = ('Ala Arg Asn Asp Cys Gln Glu Gly His Ile '
        'Leu Lys Met Phe Pro Ser Thr Trp Tyr Val').split()
trna_columns = ['tRNA-' + aa for aa in aa20]
aragorn = aragorn[trna_columns]

In [64]:
# per genome, count how many of the retained amino acids have at least one tRNA gene.
# NOTE: this rebinds `aragorn` from a DataFrame to a dict, so the cell fails if
# it is re-run without re-running the cells that load and subset the table.
aragorn = aragorn.astype(bool).sum(axis=1).to_dict()

In [65]:
# register the column, then look up each genome ID in the per-genome tRNA counts
fields['statistics'].append('# tRNAs')
meta['# tRNAs'] = meta.index.to_series().map(aragorn)

#### Draft quality

Assign each genome a term that describes its quality as a draft genome: "high", "medium", "low" or "unmet". The criteria follow the **MISAG and MIMAG standards** established in [Bowers et al. (2017)](https://www.nature.com/articles/nbt.3893). Specifically:

  - *high*: completeness > 90%, contamination < 5%, presence of 23S, 16S, 5S rRNAs and >= 18 tRNAs.
  - *medium*: completeness >= 50%, contamination < 10%
  - *low*: completeness < 50%, contamination < 10%
  - *unmet*: contamination >= 10%

Note that in addition to these criteria, the original MISAG and MIMAG standard requires reference-guided review of metagenome assemblies and bins, which does not apply here. Therefore, please treat this information with caution.

Also note that we do not attempt to judge whether a genome is "finished". Please refer to column "assembly_level" for this information.

In [66]:
def draft_quality(row):
    """Classify a genome as 'high', 'medium', 'low' or 'unmet' draft quality.

    ``row`` is indexed by column name and must provide '5s_rRNA', '16s_rRNA',
    '23s_rRNA' ('yes'/'no'), '# tRNAs', 'Completeness' and 'Contamination'.

    - 'high': all three rRNAs present, >= 18 tRNAs, completeness > 90 and
      contamination < 5;
    - otherwise 'unmet' unless contamination < 10;
    - otherwise 'medium' if completeness >= 50, else 'low'.
    """
    rrnas_present = all(row[gene] == 'yes'
                        for gene in ('5s_rRNA', '16s_rRNA', '23s_rRNA'))
    if (rrnas_present and row['# tRNAs'] >= 18
            and row['Completeness'] > 90 and row['Contamination'] < 5):
        return 'high'
    if not row['Contamination'] < 10:
        return 'unmet'
    if row['Completeness'] >= 50:
        return 'medium'
    return 'low'

In [67]:
# classify every genome, one row at a time, and register the new column
meta['draft_quality'] = meta.apply(draft_quality, axis=1)
fields['statistics'].append('draft_quality')

### Phylogeny

#### in-tree (11k)

In [68]:
# read the IDs of the genomes included in the phylogeny, one per line.
# NOTE: this rebinds `gs`, which previously held the prokaryotic genome list.
with open(in_list, 'r') as f:
    gs = f.read().splitlines()

In [69]:
# flag genomes included in the phylogeny; Index.isin is a vectorised
# membership test, replacing one linear scan of the `gs` list per genome
meta['selected'] = meta.index.isin(gs)

In [70]:
# register the flag in the 'extra' group, which comes last in field_order()
fields['extra'].append('selected')

#### Neighbor

For each of the 75,626 genomes that were not selected for phylogenetic reconstruction, a closest neighbor was identified based on MinHash signature, so that they can be mapped to leaves in the tree.

Note that 24 genomes have no neighbor because they are too distant from every other genome (distance = 1.0 to any genome). All other genomes have distance < 0.3.

In [71]:
# parse the two-column, tab-delimited neighbor map keyed by genome ID
with open(neighbors_fp, 'r') as fh:
    neighbors = dict(line.split('\t') for line in fh.read().splitlines())
# genomes in `gs` are marked 'self', overriding any entry read from the file
neighbors.update(dict.fromkeys(gs, 'self'))

In [72]:
# look up each genome ID in the neighbor map (IDs missing from the map yield NaN)
# and register the new column
meta['neighbor'] = meta.index.to_series().map(neighbors)
fields['extra'].append('neighbor')

### Export metadata

In [73]:
# final column order
meta = meta[field_order()]
# name the index '#genome' for the exported table
meta.index.names = ['#genome']
# normalise column names: lower-case, remove '# ', replace spaces with underscores
d = {x: x.lower().replace('# ', '').replace(' ', '_') for x in meta.columns}
# special case: the generic rule would turn 'GC (%)' into 'gc_(%)'
d['GC (%)'] = 'gc'
meta.rename(columns=d, inplace=True)
meta.head()

Unnamed: 0_level_0,asm_name,assembly_accession,bioproject,biosample,wgs_master,seq_rel_date,submitter,ftp_path,img_id,gtdb_id,...,contamination,strain_heterogeneity,markers,5s_rrna,16s_rrna,23s_rrna,trnas,draft_quality,selected,neighbor
#genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
G000003135,ASM313v1,GCF_000003135.1,PRJNA224116,SAMN00001475,ACHI00000000.1,2009/05/05,Baylor College of Medicine,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,643886140,RS_GCF_000003135.1,...,0.0,0.0,295,yes,yes,yes,20,high,False,G001025175
G000003215,ASM321v1,GCF_000003215.1,PRJNA224116,SAMN02470531,ABFD00000000.2,2009/01/30,McGill University,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,645058803,RS_GCF_000003215.1,...,0.37,0.0,344,yes,yes,yes,17,medium,False,G000009205
G000003645,ASM364v1,GCF_000003645.1,PRJNA224116,SAMN00717290,ACLS00000000.1,2009/05/29,Naval Medical Research Center,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,643886042,RS_GCF_000003645.1,...,0.02,0.0,381,yes,yes,yes,20,high,False,G000008165
G000003925,ASM392v1,GCF_000003925.1,PRJNA224116,SAMN00727646,ACMU00000000.1,2009/05/29,Naval Medical Research Center,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,643886175,RS_GCF_000003925.1,...,0.02,0.0,381,yes,yes,yes,20,high,False,G000291295
G000003955,ASM395v1,GCF_000003955.1,PRJNA224116,SAMN00727677,ACMT00000000.1,2009/05/29,Naval Medical Research Center,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,643886176,RS_GCF_000003955.1,...,0.76,0.0,380,yes,yes,yes,20,high,False,G000291295


In [74]:
# extended table: every row currently in `meta`, including the 'selected' and 'neighbor' columns
meta.to_csv('metadata.ext.tsv', sep='\t')

In [75]:
# core table: only the genomes listed in `gs` (last loaded from `in_list`), without
# the two columns that are only meaningful in the extended table.
# NOTE: relies on `gs` still holding the in-tree list; `meta` is overwritten here.
meta = meta[meta.index.isin(gs)].drop(['selected', 'neighbor'], axis=1)
meta.to_csv('metadata.tsv', sep='\t')