In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from ete3 import NCBITaxa 
ncbi = NCBITaxa()

In [2]:
#Function to generate taxonomy columns based on NCBITaxa results for NCBI hit dataframe, 
def search_rank_output_name_append_column(df, staxid_column, rank_search):
    """Input df, staxid_column from same df and rank_search (a desired taxonomic rank 
    from each staxid's lineage), outputs taxonomic name corresponding to rank_search or 'Unclassified' if
    unavailable and appends to df row by row"""
    rank_list = []
    for read_index in range(0, len(staxid_column)):
        taxid = ''
        if ';' in str(staxid_column[read_index]):
            taxid = staxid_column[read_index].split(';')[0]
        else:
            taxid = staxid_column[read_index]
        
        taxid_lineage = ''
        taxid_lineage = ncbi.get_lineage(taxid)
        
        names = ''
        names = ncbi.get_taxid_translator(taxid_lineage)
        
        ranks = ''
        ranks = ncbi.get_rank(taxid_lineage) #Dict
        
        ranks2names = ''
        ranks2names = {ranks[k]:names[k] for k in names.keys() & ranks}
        
        if rank_search in ranks2names.keys():
            rank_list.append(ranks2names[rank_search])#if rank in dict, print name
        else:
            rank_list.append('Unclassified')
    df[rank_search] = rank_list
# NOTE: Appending is always slow, try and find a better way e.g df.apply to a column based on staxids column

In [3]:
basedir = '/home/yiheng/MinION_data/mock_com/illumina'
name = 'MC1'
db = 'nt'
blast_file_name = '%s_%s.blastoutput' % (name, db)
coverage_filename = '%s_average_coverage.tab' % name
kraken_filename = '%s.%skraken_output' % (name, db)
blast_output = os.path.join(basedir, name, blast_file_name)
cov_file = os.path.join(basedir, name, coverage_filename)
kraken_output = os.path.join(basedir, name, kraken_filename)

In [4]:
blastoupt_df = pd.read_csv(blast_output, header=None, sep='\t')
blast_header = ['qseqid', 'sseqid', 'evalue', 'bitscore', 'length', 'pident', 'nident', 'sgi', 'sacc', 'staxids', 'sscinames', 'scomnames', 'sskingdoms', 'sstart', 'send']
blastoupt_df.columns = blast_header

In [5]:
cov_df = pd.read_csv(cov_file, header=None, sep='\t')
cov_header = ['contig', 'len', 'total_cov', 'ave_cov']
cov_df.columns = cov_header

In [6]:
krakenoupt_df = pd.read_csv(kraken_output, header=None, sep='\t')
kraken_header = ['classification', 'contig', 'taxid', 'seqlen', 'kmer_profile']
krakenoupt_df.columns = kraken_header

In [7]:
krakenoupt_df_classified = krakenoupt_df[(krakenoupt_df.classification == 'C') & (krakenoupt_df.taxid != 0)]
krakenoupt_df_classified = krakenoupt_df_classified.reset_index(drop=True)

In [8]:
# This is a bug from kraken2. Some classified reads also return taxid as 0 even they have a kmer profile.
# Although it has been fixed at April 2020, but our analysis was done at earlier so it still presents.
# But there are only 145/>300k reads so we just consider them as unclassified reads.
krakenoupt_df[(krakenoupt_df.classification == 'C') & (krakenoupt_df.taxid == 0)].head()

Unnamed: 0,classification,contig,taxid,seqlen,kmer_profile
1215,C,contig-100_1215,0,19927,0:22 5144:5 0:2493 192523:2 0:477 716545:1 0:2...
1552,C,contig-100_1552,0,16698,0:850 9205:1 0:873 55489:1 0:4 590646:5 0:89 4...
1756,C,contig-100_1756,0,15100,0:654 29556:3 0:169 95485:3 0:689 86661:2 0:81...
3948,C,contig-100_3948,0,8114,0:600 984487:2 0:745 379508:1 0:443 113653:2 0...
10290,C,contig-100_10290,0,3873,0:20 0:1104 48498:1 0:504 45067:2 0:502 250716...


In [9]:
krakenoupt_df_unclassified = krakenoupt_df[krakenoupt_df.taxid == 0]
krakenoupt_df_unclassified['superkingdom'] = 'Unclassified'
krakenoupt_df_unclassified['phylum'] = 'Unclassified'
krakenoupt_df_unclassified['class'] = 'Unclassified'
krakenoupt_df_unclassified['order'] = 'Unclassified'
krakenoupt_df_unclassified['family'] = 'Unclassified'
krakenoupt_df_unclassified['genus'] = 'Unclassified'
krakenoupt_df_unclassified['species'] = 'Unclassified'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [10]:
rank_list = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
for rank in rank_list:
    
    search_rank_output_name_append_column(krakenoupt_df_classified, 
                                        krakenoupt_df_classified.taxid, 
                                        rank)



In [11]:
kraken_df = pd.concat([krakenoupt_df_classified, krakenoupt_df_unclassified], ignore_index=True)
final_df_kraken_draft = pd.merge(cov_df, kraken_df, how='outer', left_on='contig', right_on='contig')
final_df_kraken = final_df_kraken_draft[~final_df_kraken_draft.contig.isna()]
final_df_kraken = final_df_kraken.reset_index(drop=True)
final_df_kraken.to_csv(os.path.join(basedir, name, 'finaldf.%s.%s.k2.tab' % (name, db)), sep='\t')

In [12]:
# Now is to deal with the blast output files
# This is the code to deal with the three new species in the blast fungal database
blastoupt_new_species = blastoupt_df[blastoupt_df.staxids == 0]
magnus_df = blastoupt_new_species[blastoupt_new_species.sseqid.str.contains('cryptococcus_magnus')]
rugosa_df = blastoupt_new_species[blastoupt_new_species.sseqid.str.contains('candida_rugosa')]
mesorugosa_df = blastoupt_new_species[blastoupt_new_species.sseqid.str.contains('candida_mesorugosa')]

In [13]:
# I found these hits apart from the above three species that returns taxid of 0.
# This species does not have record in the taxonomic database but has accession in Refseq fungal database. 
# I have checked all 85 accessions and they are all from Candida auris

c_auris = blastoupt_new_species[(~blastoupt_new_species.sseqid.str.contains('cryptococcus_magnus')) & (~blastoupt_new_species.sseqid.str.contains('candida_rugosa')) & 
                     (~blastoupt_new_species.sseqid.str.contains('candida_mesorugosa'))]

In [14]:
# Now assign taxonomic rank separately for these three species.
new_cols = ['superkingdom', 'phylum', 'order', 'class', 'family', 'genus']
new_vals = ['Eukaryota', 'Ascomycota', 'Saccharomycetales', 'Saccharomycetes', 'Debaryomycetaceae', 'Candida']
rugosa_df_updated = rugosa_df.reindex(columns=rugosa_df.columns.tolist() + new_cols)   # add empty cols
rugosa_df_updated[new_cols] = new_vals
rugosa_df_updated['species'] = 'Candida rugosa'
mesorugosa_df_updated = mesorugosa_df.reindex(columns=mesorugosa_df.columns.tolist() + new_cols)   # add empty cols
mesorugosa_df_updated[new_cols] = new_vals
mesorugosa_df_updated['species'] = 'Candida mesorugosa'
c_auris_updated = c_auris.reindex(columns=c_auris.columns.tolist() + new_cols)   # add empty cols
c_auris_updated[new_cols] = new_vals
c_auris_updated['species'] = 'Candida auris'

magnus_vals = ['Eukaryota', 'Basidiomycota', 'Tremellomycetes', 'Tremellales', 'Cryptococcaceae', 'Cryptococcus']
magnus_df_updated = magnus_df.reindex(columns=magnus_df.columns.tolist() + new_cols)   # add empty cols
magnus_df_updated[new_cols] = magnus_vals
magnus_df_updated['species'] = 'Cryptococcus magnus'

In [15]:
blastoupt_new_species_taxa_added = rugosa_df_updated.append([mesorugosa_df_updated, c_auris_updated, magnus_df_updated])
blastoupt_nonew_species = blastoupt_df[blastoupt_df.staxids != 0]
blastoupt_nonew_species = blastoupt_nonew_species.reset_index(drop=True)

In [16]:
for read_index in range(0, len(blastoupt_nonew_species.staxids)):
    if ';' in str(blastoupt_nonew_species.staxids[read_index]):
        taxid = blastoupt_nonew_species.staxids[read_index].split(';')[0]
        blastoupt_nonew_species.staxids[read_index] = taxid
    else:
        continue
        
blastoupt_nonew_species.staxids = [int(x) for x in blastoupt_nonew_species.staxids]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [17]:
# pretty slow
rank_list = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
for rank in rank_list:
    
    search_rank_output_name_append_column(blastoupt_nonew_species, 
                                        blastoupt_nonew_species.staxids, 
                                        rank)



In [18]:
blastoupt_added_taxa = blastoupt_nonew_species.append(blastoupt_new_species_taxa_added)
blastoupt_added_taxa.columns

Index(['qseqid', 'sseqid', 'evalue', 'bitscore', 'length', 'pident', 'nident',
       'sgi', 'sacc', 'staxids', 'sscinames', 'scomnames', 'sskingdoms',
       'sstart', 'send', 'superkingdom', 'phylum', 'class', 'order', 'family',
       'genus', 'species'],
      dtype='object')

In [19]:
blastoupt_added_taxa_trim = blastoupt_added_taxa.drop(columns=['sseqid', 'bitscore', 'length', 'sgi', 'sacc', 'sscinames', 'scomnames', 'sskingdoms',
                                                       'sstart', 'send'])

In [20]:
final_blast_df_draft = pd.merge(cov_df, blastoupt_added_taxa_trim, how='outer', left_on='contig', right_on='qseqid')

In [21]:
# some contigs do not have any reads mapped to but has blast hits
# some contigs do have reads mapped to but do not have blast hits
# some contigs do not have any reads mapped to and do not have any blast hits
# Those contigs that do not have any read mapped to are unreal contigs even they have blast hits,
# therefore, their coverage is 0 anyway so when calculate the recall and precision rate they are not contributing.
final_blast_df_draft.head()

Unnamed: 0,contig,len,total_cov,ave_cov,qseqid,evalue,pident,nident,staxids,superkingdom,phylum,class,order,family,genus,species
0,contig-100_0,410484.0,9587031.0,23.355432,contig-100_0,0.0,99.512,16330.0,5478,Eukaryota,Ascomycota,Saccharomycetes,Saccharomycetales,Saccharomycetaceae,Nakaseomyces,[Candida] glabrata
1,contig-100_1,402572.0,9947060.0,24.708773,contig-100_1,0.0,99.122,38740.0,5478,Eukaryota,Ascomycota,Saccharomycetes,Saccharomycetales,Saccharomycetaceae,Nakaseomyces,[Candida] glabrata
2,contig-100_10,240295.0,3498570.0,14.559479,contig-100_10,0.0,75.728,2159.0,4922,Eukaryota,Ascomycota,Saccharomycetes,Saccharomycetales,Phaffomycetaceae,Komagataella,Komagataella pastoris
3,contig-100_100,109312.0,3539439.0,32.379236,contig-100_100,0.0,81.416,6449.0,1136231,Eukaryota,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Candida,Candida orthopsilosis
4,contig-100_1000,23093.0,720179.0,31.18603,contig-100_1000,0.0,99.865,2954.0,294746,Eukaryota,Ascomycota,Saccharomycetes,Saccharomycetales,Debaryomycetaceae,Meyerozyma,Meyerozyma guilliermondii


In [22]:
final_blast_df = final_blast_df_draft[~final_blast_df_draft.contig.isna()]
final_blast_df = final_blast_df.reset_index(drop=True)
final_blast_df['pmatch'] = (final_blast_df.nident/final_blast_df.len)*100
final_blast_df = final_blast_df.drop(columns=['qseqid', 'nident'])
final_blast_df.to_csv(os.path.join(basedir, 'finaldf.%s.%s.blast.tab' % (name, db)), sep='\t')