In [1]:
from collections import defaultdict, Counter

from workflow.config import PCT_VALUES
from workflow.fastani.remove_gunc_failed_contigs_by_contamination_sp_cluster import RemoveGuncFailedContigsByContaminationSpCluster
from workflow.gunc.aggregate_max_css_level_merged import AggregateMaxCssLevelMerged
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from workflow.external.gtdb_metadata import GtdbMetadataR207

In [2]:
DF_META = GtdbMetadataR207().output().read_cached()
DF_META.head()

Unnamed: 0_level_0,ambiguous_bases,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,checkm_strain_heterogeneity,coding_bases,coding_density,contig_count,...,gtdb_type_species_of_genus,ncbi_taxonomy,ncbi_taxonomy_unfiltered,domain,phylum,class,order,family,genus,species
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_000006155.2,1916,93.12,0.0,1171,g__Bacillus (UID902),324,0.0,4305660,80.178992,426,...,f,d__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...,d__Bacteria;x__Terrabacteria group;p__Firmicut...,d__Bacteria,p__Firmicutes,c__Bacilli,o__Bacillales,f__Bacillaceae_G,g__Bacillus_A,s__Bacillus_A anthracis
GCA_000007325.1,1,99.95,0.0,149,k__Bacteria (UID2329),89,0.0,1973459,90.75461,1,...,t,d__Bacteria;p__Fusobacteria;c__Fusobacteriia;o...,d__Bacteria;p__Fusobacteria;c__Fusobacteriia;o...,d__Bacteria,p__Fusobacteriota,c__Fusobacteriia,o__Fusobacteriales,f__Fusobacteriaceae,g__Fusobacterium,s__Fusobacterium nucleatum
GCA_000007385.1,0,99.82,0.0,481,c__Gammaproteobacteria (UID4202),276,0.0,4190634,84.805944,1,...,f,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Xanthomonadales,f__Xanthomonadaceae,g__Xanthomonas,s__Xanthomonas oryzae
GCA_000008085.1,0,73.13,0.0,149,k__Archaea (UID2),107,0.0,462902,94.29948,1,...,t,d__Archaea;p__Nanoarchaeota;c__;o__Nanoarchaea...,d__Archaea;x__DPANN group;p__Nanoarchaeota;o__...,d__Archaea,p__Nanoarchaeota,c__Nanoarchaeia,o__Nanoarchaeales,f__Nanoarchaeaceae,g__Nanoarchaeum,s__Nanoarchaeum equitans
GCA_000008885.1,0,100.0,0.0,134,k__Bacteria (UID2495),80,0.0,617456,87.831079,2,...,f,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales_A,f__Enterobacteriaceae_A,g__Wigglesworthia,s__Wigglesworthia glossinidia_A


In [3]:
DF = RemoveGuncFailedContigsByContaminationSpCluster().output().read_cached()
print(DF.shape)

UNQ_GIDS = set(DF.index.get_level_values(0))

print(f'{len(UNQ_GIDS):,} failed gids')
DF.head()

(578539, 5)
35,723 failed gids


Unnamed: 0_level_0,Unnamed: 1_level_0,new_sp_rep,ani,af,type,same
gid,pct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GCA_000143435.1,1,GCF_001435955.1,97.327,0.875591,sp_rep,True
GCA_000143435.1,5,GCF_001435955.1,97.327,0.875591,sp_rep,True
GCA_000143435.1,15,GCF_001435955.1,97.6349,0.829228,sp_rep,True
GCA_000153745.1,5,,,,no_ani,True
GCA_000155005.1,1,GCF_003697165.2,96.5203,0.827668,sp_rep,True


In [22]:
def gen_data():

    # Some were originally run with more pct values, but just reduce to the analysis set
    df_changed = DF[DF['same'] == False]
    df_changed = df_changed.reset_index()
    df_changed = df_changed[df_changed['pct'].isin(PCT_VALUES)]

    d_pct_to_changes = defaultdict(list)

    for cur_pct in PCT_VALUES:
        df_subset = df_changed[df_changed['pct'] == cur_pct]
        df_subset = df_subset.sort_values(by=['gid', 'pct'], ascending=[True, False])

        for _, row in df_subset.iterrows():
            gid = row['gid']

            meta_row = DF_META.loc[gid]
            is_rep = meta_row['gtdb_representative'] == 't'

            if is_rep:
                change_type = 'rep_is_in_new_sp_cluster'
            else:
                if row['type'] == 'no_af' or row['type'] == 'no_ani':
                    change_type = 'non_rep_formed_new_cluster'
                else:
                    change_type = 'non_rep_changed_sp_cluster'

            d_pct_to_changes[cur_pct].append((change_type, gid))

    return d_pct_to_changes

data = gen_data()

print(data)

defaultdict(<class 'list'>, {1: [('rep_is_in_new_sp_cluster', 'GCA_000761495.1'), ('rep_is_in_new_sp_cluster', 'GCA_007713455.1'), ('rep_is_in_new_sp_cluster', 'GCA_018363915.1'), ('rep_is_in_new_sp_cluster', 'GCA_900548495.1'), ('rep_is_in_new_sp_cluster', 'GCA_900555225.1'), ('rep_is_in_new_sp_cluster', 'GCA_900757715.1'), ('rep_is_in_new_sp_cluster', 'GCA_900759445.1'), ('rep_is_in_new_sp_cluster', 'GCF_000438455.1'), ('rep_is_in_new_sp_cluster', 'GCF_000935215.1'), ('rep_is_in_new_sp_cluster', 'GCF_001544135.1'), ('rep_is_in_new_sp_cluster', 'GCF_001571065.1'), ('rep_is_in_new_sp_cluster', 'GCF_002156525.1'), ('rep_is_in_new_sp_cluster', 'GCF_002216875.1'), ('rep_is_in_new_sp_cluster', 'GCF_002989035.1'), ('rep_is_in_new_sp_cluster', 'GCF_003064105.1'), ('rep_is_in_new_sp_cluster', 'GCF_003935375.1'), ('rep_is_in_new_sp_cluster', 'GCF_011045835.1'), ('rep_is_in_new_sp_cluster', 'GCF_014764705.1'), ('rep_is_in_new_sp_cluster', 'GCF_016860525.1')], 5: [('rep_is_in_new_sp_cluster', 'G

In [32]:
def make_plot():


    rows = list()

    print(len(DF_META))
    d_sp_to_count = defaultdict(lambda: 0)
    for sp in DF_META['species']:
        d_sp_to_count[sp] += 1

    set_placeholder_sp = set()
    set_latin_sp = set()


    d_sp_to_rows = defaultdict(lambda: defaultdict(list))
    d_gid_to_d_pct_to_rows = defaultdict(lambda: defaultdict(list))
    for pct, lst_changes in data.items():
        for change_type, gid in lst_changes:
            meta_row = DF_META.loc[gid]
            species = meta_row['species']
            taxonomy = meta_row['gtdb_taxonomy']

            d_gid_to_d_pct_to_rows[gid][pct]= {
                'gid': gid,
                'type': change_type,
                'tax': taxonomy
            }

    out = list()

    d_pct_to_latin = defaultdict(set)
    d_pct_to_placeholder = defaultdict(set)

    for gid, d_pct in d_gid_to_d_pct_to_rows.items():
        cur_sp = DF_META.loc[gid, 'species']

        if ' sp' in cur_sp:
            set_placeholder_sp.add(cur_sp)
        else:
            set_latin_sp.add(cur_sp)

        cur_row = {
            'gid': gid,
            'species': cur_sp,
            'n_in_species': d_sp_to_count[cur_sp]
        }
        for pct, row in d_pct.items():
            cur_row[f'pct_{pct}'] = row['type']

            if ' sp' in cur_sp:
                d_pct_to_placeholder[pct].add(cur_sp)
            else:
                d_pct_to_latin[pct].add(cur_sp)

        out.append(cur_row)




    df = pd.DataFrame(out)
    print(df.head())
    df.to_csv('/tmp/sp.tsv', sep='\t', index=False)

    n_latin = len(set_latin_sp)
    n_place = len(set_placeholder_sp)
    n_species = len(set_latin_sp.union(set_placeholder_sp))

    print(f'n species: {n_species}')
    print(f'n placeholder: {n_place}/{n_species} ({n_place/n_species:.2%})')

    for pct in PCT_VALUES:
        n_latin = len(d_pct_to_latin[pct])
        n_place = len(d_pct_to_placeholder[pct])
        n_total = len(d_pct_to_latin[pct].union(d_pct_to_placeholder[pct]))
        print(f'{pct} n_place = {n_place}/{n_total} {n_place/n_total:.2%}')

    return

make_plot()

317542
               gid                       species  n_in_species  \
0  GCA_000761495.1  s__Acinetobacter idrijaensis             6   
1  GCA_007713455.1  s__Pseudomonas_A sp007713455             1   
2  GCA_018363915.1   s__Anaerococcus vaginalis_C             2   
3  GCA_900548495.1    s__Collinsella sp900548495             1   
4  GCA_900555225.1    s__Collinsella sp900555225             1   

                      pct_1                     pct_5  \
0  rep_is_in_new_sp_cluster  rep_is_in_new_sp_cluster   
1  rep_is_in_new_sp_cluster  rep_is_in_new_sp_cluster   
2  rep_is_in_new_sp_cluster  rep_is_in_new_sp_cluster   
3  rep_is_in_new_sp_cluster  rep_is_in_new_sp_cluster   
4  rep_is_in_new_sp_cluster                       NaN   

                     pct_10                    pct_15  \
0  rep_is_in_new_sp_cluster  rep_is_in_new_sp_cluster   
1  rep_is_in_new_sp_cluster  rep_is_in_new_sp_cluster   
2  rep_is_in_new_sp_cluster  rep_is_in_new_sp_cluster   
3  rep_is_in_new_sp_clust