In [1]:
import pandas as pd
from collections import defaultdict

from workflow.external.gtdb_metadata import GtdbMetadataR207
from workflow.external.gtdb_metadata import GtdbMetadataR207Full
from workflow.gunc.aggregate_max_css_level_gunc import AggregateMaxCssLevelGtdbR95

import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from workflow.gunc.aggregate_max_css_level_merged import AggregateMaxCssLevelMerged
from workflow.gunc.aggregate_max_css_level_progenomes import AggregateMaxCssLevelProGenomes


In [2]:
def get_df():
    """Assemble the per-genome table used by the rest of the notebook.

    Reads, via ``.output().read_cached()``, the results of
    ``AggregateMaxCssLevelProGenomes``, ``AggregateMaxCssLevelGtdbR95`` and
    ``GtdbMetadataR207Full``. The two aggregate tables are inner-joined on
    their index, with overlapping column names suffixed ``_pro`` and
    ``_gtdb`` respectively; that result is then inner-joined on index with
    the metadata table.

    NOTE(review): the GTDB aggregate is the R95 task while the metadata is
    R207 -- confirm that mixing releases is intended.

    Returns:
        The merged table, restricted to index values found in all three
        inputs.
    """
    progenomes = AggregateMaxCssLevelProGenomes().output().read_cached()
    gtdb_r95 = AggregateMaxCssLevelGtdbR95().output().read_cached()
    metadata = GtdbMetadataR207Full().output().read_cached()

    # Both joins are index-on-index inner joins; share the keyword set.
    index_join = dict(how='inner', left_index=True, right_index=True)
    return (
        progenomes
        .merge(gtdb_r95, suffixes=('_pro', '_gtdb'), **index_join)
        .merge(metadata, **index_join)
    )

# Build the merged table once at notebook level; later cells read this global.
DF = get_df()

In [3]:
# Quick sanity check: print the dimensions, then display the first rows.
print(DF.shape)
DF.head()

(317542, 140)


Unnamed: 0_level_0,n_genes_called_pro,n_genes_mapped_pro,n_contigs_pro,taxonomic_level_pro,proportion_genes_retained_in_major_clades_pro,genes_retained_index_pro,clade_separation_score_pro,contamination_portion_pro,n_effective_surplus_clades_pro,mean_hit_identity_pro,...,trna_aa_count,trna_count,trna_selenocysteine_count,domain,phylum,class,order,family,genus,species
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_000006155.2,5745,5627,3,kingdom,1.0,0.98,0.0,0.0,0.0,0.97,...,16,31,0,d__Bacteria,p__Firmicutes,c__Bacilli,o__Bacillales,f__Bacillaceae_G,g__Bacillus_A,s__Bacillus_A anthracis
GCA_000007325.1,2022,2016,1,kingdom,1.0,1.0,0.0,0.0,0.0,0.98,...,20,47,0,d__Bacteria,p__Fusobacteriota,c__Fusobacteriia,o__Fusobacteriales,f__Fusobacteriaceae,g__Fusobacterium,s__Fusobacterium nucleatum
GCA_000007385.1,4831,4724,1,kingdom,1.0,0.98,0.0,0.0,0.0,0.98,...,20,53,0,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Xanthomonadales,f__Xanthomonadaceae,g__Xanthomonas,s__Xanthomonas oryzae
GCA_000008085.1,583,545,1,kingdom,1.0,0.93,0.0,0.0,0.0,0.98,...,17,38,0,d__Archaea,p__Nanoarchaeota,c__Nanoarchaeia,o__Nanoarchaeales,f__Nanoarchaeaceae,g__Nanoarchaeum,s__Nanoarchaeum equitans
GCA_000008885.1,646,643,2,kingdom,1.0,1.0,0.0,0.0,0.0,0.97,...,20,34,0,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales_A,f__Enterobacteriaceae_A,g__Wigglesworthia,s__Wigglesworthia glossinidia_A


In [17]:
def pass_vs_fail_category(df=None):
    """Print the genome-category breakdown of genomes that fail GUNC.

    The number of genomes failing under either reference is printed first.
    Then four groups are summarised, in this order: every genome in the
    table, genomes failing under either reference, genomes failing
    ``pass.GUNC_pro``, and genomes failing ``pass.GUNC_gtdb``. For each group
    the Isolate / MAG / SAG counts and their total are printed, followed by
    the same counts as percentages (one decimal place) and a blank line.

    Args:
        df: Table providing the ``pass.GUNC_pro``, ``pass.GUNC_gtdb`` and
            ``ncbi_genome_category`` columns. When omitted, the
            notebook-level ``DF`` is used.

    Returns:
        None. The summary is written to stdout.
    """
    if df is None:
        df = DF

    # Compare against False explicitly instead of negating the column:
    # entries that are neither True nor False (e.g. missing values) are
    # then not counted as failures.
    failed_pro = df['pass.GUNC_pro'].eq(False)
    failed_gtdb = df['pass.GUNC_gtdb'].eq(False)
    failed_any = failed_pro | failed_gtdb

    print(int(failed_any.sum()))

    # Mask the single column rather than the whole frame to avoid copying
    # every other column just to count categories.
    categories = df['ncbi_genome_category']
    groups = (
        categories,
        categories[failed_any],
        categories[failed_pro],
        categories[failed_gtdb],
    )
    for group in groups:
        counts = _tally_genome_categories(group)
        print(counts, sum(counts.values()))
        print(_as_percentages(counts))
        print()


def _tally_genome_categories(categories):
    """Collapse raw ``ncbi_genome_category`` values into Isolate/MAG/SAG counts."""
    raw = Counter(categories)
    return {
        # Environmental-sample genomes are deliberately reported together
        # with the uncategorised ('none') genomes under 'Isolate'.
        'Isolate': raw['none'] + raw['derived from environmental sample'],
        'MAG': raw['derived from metagenome'],
        'SAG': raw['derived from single cell'],
    }


def _as_percentages(counts):
    """Convert a label -> count mapping to percentages rounded to one decimal."""
    total = sum(counts.values())
    if total == 0:
        # An empty group (e.g. no failures at all) has no meaningful
        # shares; report 0.0 instead of dividing by zero.
        return {label: 0.0 for label in counts}
    return {label: round(100 * (n / total), 1) for label, n in counts.items()}

# Print the pass/fail genome-category summary for the notebook-level DF.
pass_vs_fail_category()

35723
{'Isolate': 238820, 'MAG': 77891, 'SAG': 831} 317542
{'Isolate': 75.21, 'MAG': 24.53, 'SAG': 0.26}

{'Isolate': 23825, 'MAG': 11828, 'SAG': 70} 35723
{'Isolate': 66.7, 'MAG': 33.1, 'SAG': 0.2}

{'Isolate': 11821, 'MAG': 5405, 'SAG': 52} 17278
{'Isolate': 68.4, 'MAG': 31.3, 'SAG': 0.3}

{'Isolate': 19049, 'MAG': 9773, 'SAG': 36} 28858
{'Isolate': 66.0, 'MAG': 33.9, 'SAG': 0.1}

