In [1]:
# modules required for handling dataframes
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Absolute, machine-specific directories used by this notebook.
sourcedir = '/home/yiheng/MinION_data/'
basedir = '/home/yiheng/MinION_data/mock_com/illumina'

# Run name and reference database label; both are embedded in every file name.
name, db = 'MC1', 'refseq_fungi_updated'

# Input table: <basedir>/<name>/finaldf.<name>.<db>.blast.tab
file_name = 'finaldf.%s.%s.blast.tab' % (name, db)
finaldf_path = os.path.join(basedir, name, file_name)

In [3]:
# Genus names treated as "in the mock" by the precision/completeness helpers.
genera_in_mock = [
    'Aspergillus', 'Blastobotrys', 'Candida', 'Diutina',
    'Nakaseomyces', 'Clavispora', 'Cryptococcus', 'Cyberlindnera',
    'Debaryomyces', 'Geotrichum', 'Kluyveromyces', 'Kodamaea',
    'Lomentospora', 'Magnusiomyces', 'Meyerozyma', 'Pichia',
    'Rhodotorula', 'Scedosporium', 'Trichophyton', 'Trichosporon',
    'Wickerhamomyces', 'Yarrowia', 'Zygoascus', 'Purpureocillium',
]

In [4]:
# Load the tab-separated table; its first column becomes the index.
final_df = pd.read_csv(finaldf_path, sep='\t', index_col=0)
# Summed total_cov of the classified contigs (pmatch > 0). The displayed value
# is needed for the later "remaining" calculations.
final_df.total_cov[final_df.pmatch > 0].sum()

3158033550.0

In [5]:
def generate_taxonomy_pivot_sum_blast(tax_df, rank, bcs, num):
    """Sum `total_cov` per `rank` value and superkingdom, keeping the top `num` rows.

    Parameters:
        tax_df: frame with 'total_cov', 'superkingdom' and `rank` columns.
        rank: column whose values become the row index of the result.
        bcs: name of the superkingdom column to sort by (descending).
        num: maximum number of rows to return.

    Returns:
        DataFrame with one column per superkingdom (missing combinations are
        filled with 0), sorted by `bcs` and truncated to `num` rows.
    """
    coverage_by_taxon = pd.pivot_table(
        tax_df,
        values='total_cov',
        index=rank,
        columns='superkingdom',
        aggfunc='sum',
        fill_value=0,
    )
    # Drop the 'superkingdom' label from the column axis.
    coverage_by_taxon.columns.name = None
    ranked = coverage_by_taxon.sort_values(bcs, axis=0, ascending=False)
    return ranked.head(n=num)

In [6]:
# Precision and completeness helpers, one per BLAST filter (pmatch, pident, length, e-value).
# Here pmatch is the query coverage.
def calculate_precision_pmatch(blast_df, pmatch):
    """Return the precision (%) of hits with `pmatch` at or above the cutoff.

    Coverage is summed per genus (top 2000 genera by 'Eukaryota' coverage).
    Precision is the coverage of genera listed in `genera_in_mock` divided by
    the total 'Eukaryota' coverage, times 100.
    """
    cutoff = float(pmatch)
    kept_hits = blast_df[blast_df.pmatch >= cutoff]
    genus_cov = generate_taxonomy_pivot_sum_blast(kept_hits, 'genus', 'Eukaryota', 2000)
    is_mock_genus = genus_cov.index.isin(genera_in_mock)
    # NOTE(review): the numerator sums every superkingdom column, the
    # denominator only 'Eukaryota' - confirm that is intended.
    mock_cov = genus_cov[is_mock_genus].sum().sum()
    eukaryota_cov = genus_cov['Eukaryota'].sum()
    return float(mock_cov / eukaryota_cov * 100)

def calculate_precision_pident(blast_df, pident):
    """Return the precision (%) of hits with `pident` at or above the cutoff.

    Coverage is summed per genus (top 2000 genera by 'Eukaryota' coverage).
    Precision is the coverage of genera listed in `genera_in_mock` divided by
    the total 'Eukaryota' coverage, times 100.
    """
    cutoff = float(pident)
    kept_hits = blast_df[blast_df.pident >= cutoff]
    genus_cov = generate_taxonomy_pivot_sum_blast(kept_hits, 'genus', 'Eukaryota', 2000)
    is_mock_genus = genus_cov.index.isin(genera_in_mock)
    # NOTE(review): the numerator sums every superkingdom column, the
    # denominator only 'Eukaryota' - confirm that is intended.
    mock_cov = genus_cov[is_mock_genus].sum().sum()
    eukaryota_cov = genus_cov['Eukaryota'].sum()
    return float(mock_cov / eukaryota_cov * 100)

def calculate_precision_length(blast_df, length):
    """Return the precision (%) of hits whose `len` is at or above the cutoff.

    Coverage is summed per genus (top 2000 genera by 'Eukaryota' coverage).
    Precision is the coverage of genera listed in `genera_in_mock` divided by
    the total 'Eukaryota' coverage, times 100.
    """
    cutoff = float(length)
    kept_hits = blast_df[blast_df.len >= cutoff]
    genus_cov = generate_taxonomy_pivot_sum_blast(kept_hits, 'genus', 'Eukaryota', 2000)
    is_mock_genus = genus_cov.index.isin(genera_in_mock)
    # NOTE(review): the numerator sums every superkingdom column, the
    # denominator only 'Eukaryota' - confirm that is intended.
    mock_cov = genus_cov[is_mock_genus].sum().sum()
    eukaryota_cov = genus_cov['Eukaryota'].sum()
    return float(mock_cov / eukaryota_cov * 100)

def calculate_precision_evalue(blast_df, evalue):
    """Return the precision (%) of hits with `log_evalue_blast >= int(evalue)`.

    `blast_df` must already carry the `log_evalue_blast` column. Coverage is
    summed per genus (top 2000 genera by 'Eukaryota' coverage). Precision is
    the coverage of genera listed in `genera_in_mock` divided by the total
    'Eukaryota' coverage, times 100.
    """
    cutoff = int(evalue)
    kept_hits = blast_df[blast_df.log_evalue_blast >= cutoff]
    genus_cov = generate_taxonomy_pivot_sum_blast(kept_hits, 'genus', 'Eukaryota', 2000)
    is_mock_genus = genus_cov.index.isin(genera_in_mock)
    # NOTE(review): the numerator sums every superkingdom column, the
    # denominator only 'Eukaryota' - confirm that is intended.
    mock_cov = genus_cov[is_mock_genus].sum().sum()
    eukaryota_cov = genus_cov['Eukaryota'].sum()
    return float(mock_cov / eukaryota_cov * 100)

def calculate_completeness_pmatch(blast_df, pmatch):
    """Return the percentage of `genera_in_mock` still detected at a pmatch cutoff.

    Rows with `pmatch >= float(pmatch)` are kept and pivoted per genus (top
    2000 genera by 'Eukaryota' coverage). A genus counts as detected when it
    appears in that pivot.
    """
    subset_blast_df = blast_df[blast_df.pmatch >= float(pmatch)]
    genus_cov = generate_taxonomy_pivot_sum_blast(subset_blast_df, 'genus', 'Eukaryota', 2000)
    detected = genus_cov[genus_cov.index.isin(genera_in_mock)]
    # Derive the denominator from the list itself (24 distinct genera today)
    # so it cannot drift from the list if the list is edited.
    expected_genera = len(set(genera_in_mock))
    return len(detected) / expected_genera * 100

def calculate_completeness_pident(blast_df, pident):
    """Return the percentage of `genera_in_mock` still detected at a pident cutoff.

    Rows with `pident >= float(pident)` are kept and pivoted per genus (top
    2000 genera by 'Eukaryota' coverage). A genus counts as detected when it
    appears in that pivot.
    """
    subset_blast_df = blast_df[blast_df.pident >= float(pident)]
    genus_cov = generate_taxonomy_pivot_sum_blast(subset_blast_df, 'genus', 'Eukaryota', 2000)
    detected = genus_cov[genus_cov.index.isin(genera_in_mock)]
    # Derive the denominator from the list itself (24 distinct genera today)
    # so it cannot drift from the list if the list is edited.
    expected_genera = len(set(genera_in_mock))
    return len(detected) / expected_genera * 100

def calculate_completeness_length(blast_df, length):
    """Return the percentage of `genera_in_mock` still detected at a length cutoff.

    Rows with `len >= float(length)` are kept and pivoted per genus (top 2000
    genera by 'Eukaryota' coverage). A genus counts as detected when it
    appears in that pivot.
    """
    subset_blast_df = blast_df[blast_df.len >= float(length)]
    genus_cov = generate_taxonomy_pivot_sum_blast(subset_blast_df, 'genus', 'Eukaryota', 2000)
    detected = genus_cov[genus_cov.index.isin(genera_in_mock)]
    # Derive the denominator from the list itself (24 distinct genera today)
    # so it cannot drift from the list if the list is edited.
    expected_genera = len(set(genera_in_mock))
    return len(detected) / expected_genera * 100

def calculate_completeness_evalue(blast_df, evalue):
    """Return the percentage of `genera_in_mock` still detected at an e-value cutoff.

    `blast_df` must already carry the `log_evalue_blast` column. Rows with
    `log_evalue_blast >= int(evalue)` are kept and pivoted per genus (top 2000
    genera by 'Eukaryota' coverage). A genus counts as detected when it
    appears in that pivot.
    """
    subset_blast_df = blast_df[blast_df.log_evalue_blast >= int(evalue)]
    genus_cov = generate_taxonomy_pivot_sum_blast(subset_blast_df, 'genus', 'Eukaryota', 2000)
    detected = genus_cov[genus_cov.index.isin(genera_in_mock)]
    # Derive the denominator from the list itself (24 distinct genera today)
    # so it cannot drift from the list if the list is edited.
    expected_genera = len(set(genera_in_mock))
    return len(detected) / expected_genera * 100

In [7]:
def calculate_remaining_length(blast_df, length):
    """Return the percentage of classified coverage kept at a length cutoff.

    Rows with `len >= float(length)` are kept, their `total_cov` is summed
    over the genus pivot, and the result is divided by a hard-coded total
    chosen by the module-level `name` ('MC1' or 'IE1'). Any other `name`
    yields None.
    """
    # 3158033550 equals the pmatch > 0 total_cov sum shown in this notebook for
    # MC1; the IE1 value is presumably the same quantity for IE1 - TODO confirm.
    classified_cov_by_run = {'MC1': 3158033550, 'IE1': 3363783952}
    if name not in classified_cov_by_run:
        return None
    denominator = classified_cov_by_run[name]
    kept_hits = blast_df[blast_df.len >= float(length)]
    genus_cov = generate_taxonomy_pivot_sum_blast(kept_hits, 'genus', 'Eukaryota', 20000)
    kept_cov = genus_cov.sum().sum()
    return float(kept_cov / denominator * 100)

def calculate_remaining_evalue(blast_df, evalue):
    """Return the percentage of classified coverage kept at an e-value cutoff.

    `blast_df` must already carry the `log_evalue_blast` column. Rows with
    `log_evalue_blast >= int(evalue)` are kept, their `total_cov` is summed
    over the genus pivot, and the result is divided by a hard-coded total
    chosen by the module-level `name` ('MC1' or 'IE1'). Any other `name`
    yields None.
    """
    # 3158033550 equals the pmatch > 0 total_cov sum shown in this notebook for
    # MC1; the IE1 value is presumably the same quantity for IE1 - TODO confirm.
    classified_cov_by_run = {'MC1': 3158033550, 'IE1': 3363783952}
    total_classified_cov = classified_cov_by_run.get(name)
    if total_classified_cov is None:
        return None
    subset_blast_df = blast_df[blast_df.log_evalue_blast >= int(evalue)]
    fungidb_blast_pivot = generate_taxonomy_pivot_sum_blast(subset_blast_df, 'genus', 'Eukaryota', 20000)
    # DataFrame.sum() yields one value per superkingdom column; sum again to get
    # a scalar. float() on that Series fails once there is more than one column.
    kept_cov = fungidb_blast_pivot.sum().sum()
    return float(kept_cov / total_classified_cov * 100)

def calculate_remaining_pmatch(blast_df, pmatch):
    """Return the percentage of classified coverage kept at a pmatch cutoff.

    Rows with `pmatch >= float(pmatch)` are kept, their `total_cov` is summed
    over the genus pivot, and the result is divided by a hard-coded total
    chosen by the module-level `name` ('MC1' or 'IE1'). Any other `name`
    yields None.
    """
    # 3158033550 equals the pmatch > 0 total_cov sum shown in this notebook for
    # MC1; the IE1 value is presumably the same quantity for IE1 - TODO confirm.
    classified_cov_by_run = {'MC1': 3158033550, 'IE1': 3363783952}
    total_classified_cov = classified_cov_by_run.get(name)
    if total_classified_cov is None:
        return None
    # float() rather than int(): matches the precision/completeness helpers and
    # does not truncate fractional cutoffs; integer cutoffs behave as before.
    subset_blast_df = blast_df[blast_df.pmatch >= float(pmatch)]
    fungidb_blast_pivot = generate_taxonomy_pivot_sum_blast(subset_blast_df, 'genus', 'Eukaryota', 20000)
    # DataFrame.sum() yields one value per superkingdom column; sum again to get
    # a scalar. float() on that Series fails once there is more than one column.
    kept_cov = fungidb_blast_pivot.sum().sum()
    return float(kept_cov / total_classified_cov * 100)
    
def calculate_remaining_pident(blast_df, pident):
    """Return the percentage of classified coverage kept at a pident cutoff.

    Rows with `pident >= float(pident)` are kept, their `total_cov` is summed
    over the genus pivot, and the result is divided by a hard-coded total
    chosen by the module-level `name` ('MC1' or 'IE1'). Any other `name`
    yields None.
    """
    # 3158033550 equals the pmatch > 0 total_cov sum shown in this notebook for
    # MC1; the IE1 value is presumably the same quantity for IE1 - TODO confirm.
    classified_cov_by_run = {'MC1': 3158033550, 'IE1': 3363783952}
    total_classified_cov = classified_cov_by_run.get(name)
    if total_classified_cov is None:
        return None
    # float() rather than int(): matches the precision/completeness helpers and
    # does not truncate fractional cutoffs; integer cutoffs behave as before.
    subset_blast_df = blast_df[blast_df.pident >= float(pident)]
    fungidb_blast_pivot = generate_taxonomy_pivot_sum_blast(subset_blast_df, 'genus', 'Eukaryota', 20000)
    # DataFrame.sum() yields one value per superkingdom column; sum again to get
    # a scalar. float() on that Series fails once there is more than one column.
    kept_cov = fungidb_blast_pivot.sum().sum()
    return float(kept_cov / total_classified_cov * 100)
    

In [8]:
# Taxonomy columns in which a missing value is replaced by the label 'Unclassified'.
fillna_cols = [
    'superkingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]
final_df[fillna_cols] = final_df[fillna_cols].fillna('Unclassified')

In [9]:
# Precision at every integer pident cutoff from 0 to 100, written to a TSV.
pident_x_precision = pd.DataFrame({'pident': range(0, 101)})
pident_x_precision['precision_rate'] = [
    calculate_precision_pident(final_df, cutoff)
    for cutoff in pident_x_precision['pident']
]

pident_x_precision.to_csv(os.path.join(basedir, name, '%s.%s.pident_precision.tab' % (name, db)), sep='\t')

In [10]:
# Completeness at every integer pident cutoff from 0 to 100, written to a TSV.
pident_x_completeness = pd.DataFrame({'pident': np.arange(0, 101)})
pident_x_completeness['completeness'] = [
    calculate_completeness_pident(final_df, cutoff)
    for cutoff in range(0, 101)
]

pident_x_completeness.to_csv(os.path.join(basedir, name, '%s.%s.pident_completeness.tab' % (name, db)), sep='\t')

In [11]:
# Precision at every integer pmatch cutoff from 0 to 97, written to a TSV.
pmatch_X_precision = pd.DataFrame({'pmatch': range(0, 98)})
pmatch_X_precision['precision_rate'] = [
    calculate_precision_pmatch(final_df, cutoff)
    for cutoff in pmatch_X_precision['pmatch']
]

pmatch_X_precision.to_csv(os.path.join(basedir, name, '%s.%s.pmatch_precision.tab' % (name, db)), sep='\t')

In [12]:
# Completeness at every integer pmatch cutoff from 0 to 97, written to a TSV.
pmatch_X_completeness = pd.DataFrame({'pmatch': np.arange(0, 98)})
pmatch_X_completeness['completeness'] = [
    calculate_completeness_pmatch(final_df, cutoff)
    for cutoff in range(0, 98)
]

pmatch_X_completeness.to_csv(os.path.join(basedir, name, '%s.%s.pmatch_completeness.tab' % (name, db)), sep='\t')

In [13]:
# Show total_cov sorted from smallest to largest, to help pick cutoff ranges.
# NOTE(review): the length sweeps below filter on `len`, not `total_cov` -
# confirm this is the column that was meant to be inspected.
final_df.total_cov.sort_values()

271246         30.0
269242         31.0
271780         32.0
270004         33.0
270423         35.0
            ...    
222222    7836147.0
159003    8008326.0
294559    9278669.0
0         9587031.0
1         9947060.0
Name: total_cov, Length: 344780, dtype: float64

In [14]:
# Precision at length cutoffs 0, 50, ..., 199950, written to a TSV.
length_x_precision = pd.DataFrame({'length': np.arange(0, 200000, 50)})
length_x_precision['precision_rate'] = [
    calculate_precision_length(final_df, min_len)
    for min_len in length_x_precision['length']
]

length_x_precision.to_csv(os.path.join(basedir, name, '%s.%s.length_precision.tab' % (name, db)), sep='\t')

In [15]:
# Completeness at length cutoffs 0, 50, ..., 199950, written to a TSV.
length_x_completeness = pd.DataFrame({'length': np.arange(0, 200000, 50)})
length_x_completeness['completeness'] = [
    calculate_completeness_length(final_df, min_len)
    for min_len in length_x_completeness['length']
]

length_x_completeness.to_csv(os.path.join(basedir, name, '%s.%s.length_completeness.tab' % (name, db)), sep='\t')

In [16]:
# Negative natural log of the e-value: smaller e-values give larger scores.
# NOTE(review): the warning in this cell's output presumably comes from
# e-values of exactly 0, which map to +inf - TODO confirm.
final_df['log_evalue_blast'] = np.negative(np.log(final_df.evalue))

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [17]:
# Precision at every integer -ln(e-value) cutoff from 0 to 413, written to a TSV.
evalue_x_precision = pd.DataFrame({'evalue': range(0, 414)})
evalue_x_precision['precision_rate'] = [
    calculate_precision_evalue(final_df, cutoff)
    for cutoff in range(0, 414)
]

# Write into the per-run sub-directory like every other sweep in this notebook;
# the path previously omitted `name`, so the file landed directly in basedir.
evalue_x_precision.to_csv(os.path.join(basedir, name, '%s.%s.evalue_precision.tab' % (name, db)), sep='\t')

In [18]:
# Completeness at every integer -ln(e-value) cutoff from 0 to 413, written to a TSV.
evalue_x_completeness = pd.DataFrame({'evalue': range(0, 414)})
evalue_x_completeness['completeness'] = [
    calculate_completeness_evalue(final_df, cutoff)
    for cutoff in range(0, 414)
]

# Write into the per-run sub-directory like every other sweep in this notebook;
# the path previously omitted `name`, so the file landed directly in basedir.
evalue_x_completeness.to_csv(os.path.join(basedir, name, '%s.%s.evalue_completeness.tab' % (name, db)), sep='\t')

In [19]:
# Remaining coverage (%) at every integer pident cutoff from 0 to 100, written to a TSV.
pident_x_remaining = pd.DataFrame({'pident': range(0, 101)})
# dtype=float keeps the column numeric (NaN) even if the helper returns None.
pident_x_remaining['remaining_rate'] = np.array(
    [calculate_remaining_pident(final_df, cutoff) for cutoff in pident_x_remaining['pident']],
    dtype=float,
)

pident_x_remaining.to_csv(os.path.join(basedir, name, '%s.%s.pident_remaining.tab' % (name, db)), sep='\t')

In [20]:
# Remaining coverage (%) at every integer pmatch cutoff from 0 to 99, written to a TSV.
pmatch_x_remaining = pd.DataFrame({'pmatch': range(0, 100)})
# dtype=float keeps the column numeric (NaN) even if the helper returns None.
pmatch_x_remaining['remaining_rate'] = np.array(
    [calculate_remaining_pmatch(final_df, cutoff) for cutoff in pmatch_x_remaining['pmatch']],
    dtype=float,
)

pmatch_x_remaining.to_csv(os.path.join(basedir, name, '%s.%s.pmatch_remaining.tab' % (name, db)), sep='\t')

In [22]:
# Remaining coverage (%) at length cutoffs 0, 50, ..., 199950, written to a TSV.
length_x_remaining = pd.DataFrame({'length': np.arange(0, 200000, 50)})
# dtype=float keeps the column numeric (NaN) even if the helper returns None.
length_x_remaining['remaining_rate'] = np.array(
    [calculate_remaining_length(final_df, min_len) for min_len in length_x_remaining['length']],
    dtype=float,
)

length_x_remaining.to_csv(os.path.join(basedir, name, '%s.%s.length_remaining.tab' % (name, db)), sep='\t')

In [23]:
# Remaining coverage (%) at every integer -ln(e-value) cutoff from 0 to 415, written to a TSV.
evalue_x_remaining = pd.DataFrame({'evalue': range(0, 416)})
# dtype=float keeps the column numeric (NaN) even if the helper returns None.
evalue_x_remaining['remaining_rate'] = np.array(
    [calculate_remaining_evalue(final_df, cutoff) for cutoff in evalue_x_remaining['evalue']],
    dtype=float,
)

evalue_x_remaining.to_csv(os.path.join(basedir, name, '%s.%s.evalue_remaining.tab' % (name, db)), sep='\t')