This notebook uses the FLOWCELLID_taxa.tab files in the WGS folder to generate NCBI hit plots for the four flowcells.

In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Directory that holds the per-flowcell *_taxa.tab tables read in the next cell.
# NOTE(review): absolute, user-specific path - adjust it when running elsewhere.
BASEDIR = '/home/yiheng/analysis/WGS'

In [3]:
# Build the path of each flowcell's taxonomy table inside BASEDIR.
# The FAH05432 (FC3) table is currently not loaded:
#FC3_taxa = os.path.join(BASEDIR, 'FAH05432_taxa.tab')
#FC3_taxa_df = pd.read_csv(FC3_taxa, sep = '\t')
FC1_taxa, FC2_taxa, FC4_taxa = (
    os.path.join(BASEDIR, table_name)
    for table_name in ('run1_taxa.tab', 'run2_taxa.tab', 'FAH05731_taxa.tab')
)

# Parse every tab-separated table into its own dataframe.
FC1_taxa_df, FC2_taxa_df, FC4_taxa_df = (
    pd.read_csv(table_path, sep='\t')
    for table_path in (FC1_taxa, FC2_taxa, FC4_taxa)
)

In [4]:
# Stack the per-flowcell tables into one frame; ignore_index=True gives the
# combined rows a fresh 0..n-1 index instead of repeating each table's own.
flowcell_frames = [FC1_taxa_df, FC2_taxa_df, FC4_taxa_df]
joint_taxa_df = pd.concat(flowcell_frames, ignore_index=True)

In [5]:
# Display the combined per-read taxonomy table (one row per read_id).
joint_taxa_df

Unnamed: 0,read_id,barcode_arrangement,staxids_nt,superkingdom,phylum,class,order,family,genus,species,Flowcell
0,03fe0b0a-1fb6-4102-909b-443d842f1fc9,barcode01,43300,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Cloning vector lambda TXF97,run1
1,ea8614ad-f6b7-4cf8-a584-4b784b761dd2,barcode01,33069,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas viridiflava,run1
2,74bf46de-9d7f-4d5e-8422-4395d4d36572,barcode01,1276538,Eukaryota,Ascomycota,Dothideomycetes,Capnodiales,Mycosphaerellaceae,Zymoseptoria,Zymoseptoria tritici,run1
3,e98f757d-647b-4dfd-91d7-694c7b0ab12a,barcode01,1047171,Eukaryota,Ascomycota,Dothideomycetes,Capnodiales,Mycosphaerellaceae,Zymoseptoria,Zymoseptoria tritici,run1
4,2149e363-9367-4d32-b3af-996b77a5e99a,barcode01,4565,Eukaryota,Streptophyta,Liliopsida,Poales,Poaceae,Triticum,Triticum aestivum,run1
5,40eae7a5-f0b0-4a73-bc4f-3f191a2110b1,barcode01,102285,Eukaryota,Platyhelminthes,Cestoda,Cyclophyllidea,Hymenolepididae,Hymenolepis,Hymenolepis nana,run1
6,7cf0383a-5007-4dbe-8f31-0c8b649a65f0,barcode01,1276538.0,Eukaryota,Ascomycota,Dothideomycetes,Capnodiales,Mycosphaerellaceae,Zymoseptoria,Zymoseptoria tritici,run1
7,1f4b2d5d-cc37-4418-a8cc-20ee0b1bff2d,barcode01,1047171,Eukaryota,Ascomycota,Dothideomycetes,Capnodiales,Mycosphaerellaceae,Zymoseptoria,Zymoseptoria tritici,run1
8,9ae4bf5f-b5c0-4369-aaa4-4e77bb44bd33,barcode01,1660073.0,Bacteria,Proteobacteria,Epsilonproteobacteria,Campylobacterales,Campylobacteraceae,Campylobacter,Campylobacter sp. RM6137,run1
9,873f3993-5807-460c-85b0-986a24f57f5b,barcode01,236234.0,Eukaryota,Ascomycota,Dothideomycetes,Botryosphaeriales,Botryosphaeriaceae,Diplodia,Diplodia corticola,run1


In [6]:
#Defines a function to make manual adjustments to the NCBI taxonomic sorting
def manually_adjust_ncbi_taxonomy(rank_select, entry_select, ranks_to_edit, new_entry):
    """Overwrite selected taxonomic ranks in the global ``joint_taxa_df``.

    Rows are selected where column ``rank_select`` equals ``entry_select``;
    for those rows, every column listed in ``ranks_to_edit`` is set to
    ``new_entry``. The frame is modified in place.

    Parameters
    ----------
    rank_select : str
        Column used to pick the rows to edit (e.g. ``'species'``).
    entry_select : object
        Value of ``rank_select`` that identifies the rows to edit.
    ranks_to_edit : list of str
        Columns to overwrite in the selected rows.
    new_entry : object
        Replacement value written into every column of ``ranks_to_edit``.

    Returns
    -------
    None
        If no row matches ``entry_select`` (or ``ranks_to_edit`` is empty)
        the frame is left untouched.

    Raises
    ------
    KeyError
        If ``rank_select`` is not a column of ``joint_taxa_df``.
    """
    selection = joint_taxa_df[rank_select] == entry_select
    columns = list(ranks_to_edit)
    if not columns or not selection.any():
        return

    # One .loc assignment writes into the frame itself. The chained form
    # ``df[col][mask] = value`` assigns into an intermediate object, which
    # raises SettingWithCopyWarning and is a silent no-op under copy-on-write.
    joint_taxa_df.loc[selection, columns] = new_entry

In [7]:
# Rows whose species is the cloning vector: write the vector name into every
# rank above species as well.
manually_adjust_ncbi_taxonomy(
    rank_select='species',
    entry_select='Cloning vector lambda TXF97',
    ranks_to_edit=['superkingdom', 'phylum', 'class', 'order', 'family', 'genus'],
    new_entry='Cloning vector lambda TXF97',
)

# Rows in superkingdom 'Viruses': set phylum and class to a single label.
manually_adjust_ncbi_taxonomy(
    rank_select='superkingdom',
    entry_select='Viruses',
    ranks_to_edit=['phylum', 'class'],
    new_entry='dsDNA viruses, no RNA stage',
)

In [27]:
#count pivot table of dataframe with taxonomic columns: 
#rows - rank names, columns - barcode, sorted by desired class
def generate_ncbi_taxonomy_pivot(tax_df, rank, bcs, num):
    """Count hits per taxon and barcode and return the ``num`` largest rows.

    Parameters
    ----------
    tax_df : pandas.DataFrame
        Table with a ``'barcode_arrangement'`` column, a ``'phylum'`` column
        and the column named by ``rank``.
    rank : str
        Column whose values become the rows of the pivot (e.g. ``'family'``).
    bcs : str
        Pivot column to sort by: a barcode name, or ``'All'`` for the margin.
    num : int
        Number of rows to keep after sorting.

    Returns
    -------
    pandas.DataFrame
        One row per ``rank`` value plus the ``'All'`` margin row, one column
        per barcode plus the ``'All'`` margin column, sorted descending by
        ``bcs`` and truncated to ``num`` rows. Each cell is the number of
        rows with a non-null ``'phylum'``; empty cells are filled with 0.
    """
    # Count through a private copy of 'phylum' rather than the column itself.
    # The counts are identical, but the value column can no longer collide
    # with the index column, so rank='phylum' works instead of failing.
    count_col = '_phylum_hits'
    counted = tax_df.assign(**{count_col: tax_df['phylum']})

    pivot_table = counted.pivot_table(values=count_col,
                                      index=rank,
                                      columns='barcode_arrangement',
                                      aggfunc='count',
                                      fill_value=0,
                                      margins=True)
    # Drop the 'barcode_arrangement' label from the column axis.
    pivot_table.columns.name = None
    return pivot_table.sort_values(bcs, axis=0, ascending=False).head(n=num)

In [28]:
# Top 20 families, ranked by their hit count in barcode04.
generate_ncbi_taxonomy_pivot(joint_taxa_df, rank='family', bcs='barcode04', num=20)

Unnamed: 0_level_0,barcode01,barcode02,barcode03,barcode04,barcode05,All
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
All,575.0,2963.0,326.0,487.0,2194.0,6545.0
Pseudomonadaceae,330.0,1733.0,152.0,231.0,1436.0,3882.0
Poaceae,6.0,31.0,14.0,36.0,17.0,104.0
Pleosporaceae,15.0,237.0,44.0,31.0,189.0,516.0
Enterobacteriaceae,27.0,28.0,8.0,26.0,10.0,99.0
Hymenobacteraceae,0.0,42.0,4.0,15.0,37.0,98.0
Microbacteriaceae,4.0,223.0,15.0,11.0,47.0,300.0
Erwiniaceae,95.0,34.0,0.0,11.0,19.0,159.0
Sphingomonadaceae,1.0,138.0,6.0,11.0,107.0,263.0
Mycosphaerellaceae,12.0,92.0,3.0,8.0,39.0,154.0


In [24]:
#count pivot table of dataframe with taxonomic columns, includes reference genomes set across all ranks: 
#rows - rank names, columns - barcode, sorted by desired class
def generate_all_hits_pivot(rank, bcs, num, dropmaxspecies=False):
    """Combine NCBI taxonomy counts with reference-genome BLAST counts.

    The NCBI pivot for ``rank`` (built from the global ``joint_taxa_df``) is
    stacked on top of the transposed global ``rgblast_df_joint_pivot``; rows
    sharing an index label (such as ``'All'``) are summed.

    Parameters
    ----------
    rank : str
        Taxonomic column of ``joint_taxa_df`` to pivot on.
    bcs : str
        Column to sort the result by (a barcode name or ``'All'``).
    num : int
        Number of rows to return.
    dropmaxspecies : bool, default False
        When true, the reference genome with the largest ``'All'`` count in
        ``rgblast_df_joint_pivot_noall`` is subtracted from the ``'All'`` row
        and its own row is removed.

    Returns
    -------
    pandas.DataFrame
        The ``num`` rows with the highest ``bcs`` value, sorted descending.

    Notes
    -----
    NOTE(review): assumes ``rgblast_df_joint_pivot`` has barcodes as rows and
    reference genomes as columns, and that ``rgblast_df_joint_pivot_noall``
    has a single row labelled ``'All'`` - neither frame is defined in this
    part of the notebook, so confirm against the cell that builds them.
    """
    # Ask for as many rows as there are reads so the NCBI pivot is not truncated.
    ncbi_hits = generate_ncbi_taxonomy_pivot(joint_taxa_df, rank, bcs, len(joint_taxa_df))
    blast_and_NCBI_hits = pd.concat([ncbi_hits, rgblast_df_joint_pivot.transpose()])
    blast_and_NCBI_hits_sum = blast_and_NCBI_hits.groupby(blast_and_NCBI_hits.index).sum()

    if dropmaxspecies:
        # .ix was removed from pandas; 'All' is a label lookup, so .loc is the
        # direct replacement. idxmax on the resulting row takes no axis argument.
        max_genome = str(rgblast_df_joint_pivot_noall.loc['All'].idxmax())
        blast_and_NCBI_hits_sum.loc['All'] = (
            blast_and_NCBI_hits_sum.loc['All'] - blast_and_NCBI_hits_sum.loc[max_genome]
        )
        blast_and_NCBI_hits_sum = blast_and_NCBI_hits_sum.drop(max_genome, axis=0)

    return blast_and_NCBI_hits_sum.sort_values(bcs, axis=0, ascending=False).head(n=num)