In [1]:
# modules required for handling dataframes
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from ete3 import NCBITaxa 
ncbi = NCBITaxa()

In [2]:
#Function to generate taxonomy columns based on NCBITaxa results for NCBI hit dataframe, 
#Function to generate taxonomy columns based on NCBITaxa results for NCBI hit dataframe, 
def search_rank_output_name_append_column(df, staxid_column, rank_search):
    """Input df, staxid_column from same df and rank_search (a desired taxonomic rank 
    from each staxid's lineage), outputs taxonomic name corresponding to rank_search or 'Unclassified' if
    unavailable and appends to df row by row"""
    rank_list = []
    for read_index in range(0, len(staxid_column)):
        taxid = ''
        if ';' in str(staxid_column[read_index]):
            taxid = staxid_column[read_index].split(';')[0]
        else:
            taxid = staxid_column[read_index]
        
        taxid_lineage = ''
        taxid_lineage = ncbi.get_lineage(taxid)
        
        names = ''
        names = ncbi.get_taxid_translator(taxid_lineage)
        
        ranks = ''
        ranks = ncbi.get_rank(taxid_lineage) #Dict
        
        ranks2names = ''
        ranks2names = {ranks[k]:names[k] for k in names.keys() & ranks}
        
        if rank_search in ranks2names.keys():
            rank_list.append(ranks2names[rank_search])#if rank in dict, print name
        else:
            rank_list.append('Unclassified')
    df[rank_search] = rank_list
# NOTE: Appending is always slow, try and find a better way e.g df.apply to a column based on staxids column


def add_df_headers(df, header_list):
    df.columns = header_list

def trim_df_columns(df,column_list):
    df.drop(columns=column_list, inplace=True)

In [3]:
# put in all input parameters. Here I am showing the code for one sample as an example.
# to generate the final_df for other samples, simply change the basedir and barcode, as all file names just has this two difference between each two samples.
# please note that there are other places of this script that require understanding and hard coding skills which I also commented below.

sourcedir = '/home/yiheng/MinION_data' # the directory where all the documents of each sequencing run are stored.
barcode = '06' # the barcode for each sample, corresponding to the sample name.
sample = 'MC1'
basedir = os.path.join(sourcedir, 'barcode%s' % barcode)
db = "nt" # database used

In [4]:
seq_sum_dir = os.path.join(basedir, 'sequencing_summary_barcode%s.txt' % barcode) # the directory of sequencing summary file for each run
blastoutput_dir = os.path.join(basedir, '%s_read_%s.blastoutput' % (sample, db)) # the directory for .blast_output file
kraken2output_dir = os.path.join(basedir, '%s_read.%skraken_output' % (sample, db)) # the directory for .kraken2_output file

In [5]:
seq_sum_df = pd.read_csv(seq_sum_dir, sep='\t')
seq_sum_df_pass = seq_sum_df[seq_sum_df.passes_filtering==True]

In [6]:
blastoutput_df = pd.read_csv(blastoutput_dir, header=None, sep='\t')
kraken2output_df = pd.read_csv(kraken2output_dir, header=None, sep='\t')

In [7]:
blast_header = ['qseqid', 'sseqid', 'evalue', 'bitscore', 'length', 'pident', 'nident', 'sgi', 'sacc', 'staxids', 'sscinames', 'scomnames', 'sskingdoms', 'sstart', 'send']
add_df_headers(blastoutput_df, blast_header)

In [8]:
kraken2_header = ["classification", "read_id", "taxid", "seq_length", "kmer_profile"]
add_df_headers(kraken2output_df, kraken2_header)

In [9]:
blastoutput_df['log_evalue'] = -np.log(blastoutput_df.evalue)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
blast_header_drop = ['sseqid', 'bitscore', 'length', 'evalue', 'sgi', 'sacc', 'sskingdoms', 'sstart', 'send', 'scomnames']
seq_sum_df_header_drop = ['filename', 'run_id', 'channel', 'mux', 'start_time', 'duration', 'num_events', 'passes_filtering', 'template_start',
                          'num_events_template', 'template_duration','mean_qscore_template', 'strand_score_template', 'median_template','mad_template']
trim_df_columns(blastoutput_df, blast_header_drop)
trim_df_columns(seq_sum_df_pass, seq_sum_df_header_drop)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [11]:
# modify some of the na
blast_final_df = pd.merge(seq_sum_df_pass, blastoutput_df, how="outer", left_on='read_id', right_on='qseqid')
blast_final_df.dropna(inplace=True)
blast_final_df.drop(columns='read_id',inplace=True)
blast_final_df = blast_final_df.reset_index(drop=True)

In [12]:
# I mannually calculated the query coverage and labelled it as pmatch.
blast_final_df['pmatch'] = blast_final_df.nident/blast_final_df.sequence_length_template*100

In [13]:
# This step is due to sometimes the taxonomic database in ete3 module were not the same version as the local blast database. In this case some of the taxids can not be recognized.
# So I just mannually search them and assigned the taxonomic information to them.
blast_final_df_1 = blast_final_df[~blast_final_df.staxids.str.contains('1718871')]
blast_final_df_1 = blast_final_df_1.reset_index(drop=True)

blast_final_df_2 = blast_final_df[blast_final_df.staxids.str.contains('1718871')]
blast_final_df_2['superkingdom'] = 'Eukaryota'
blast_final_df_2['phylum'] = 'Ascomycota'
blast_final_df_2['class'] = 'Eurotiomycetes'
blast_final_df_2['order'] = 'Eurotiales'
blast_final_df_2['family'] = 'Aspergillaceae'
blast_final_df_2['genus'] = 'Aspergillus'
blast_final_df_2['species'] = 'Aspergillus sp.'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http

In [14]:
#ntblasthit_reads_filtered_barcodes_added_TaxaRank = ntblasthit_reads_filtered_barcodes.copy()
# pretty slow
rank_list = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
for rank in rank_list:
    search_rank_output_name_append_column(blast_final_df_1, 
                                          blast_final_df_1.staxids, 
                                          rank)

In [15]:
blast_final_df = pd.concat([blast_final_df_1, blast_final_df_2], ignore_index=True)

In [16]:
blast_final_df.columns = ['%s_blast' % x for x in blast_final_df.columns]

In [17]:
blast_final_df.to_csv(os.path.join(basedir, 'barcode%s.%sdb_blast.tab' % (barcode, db)), sep='\t')

In [18]:
# Now is for the kraken2 output files
# Here I selected reads that were overlapped with blast outputfile (not matter classified or not) and the classified reads together, in order to have a better comparison with the blast results.
kraken2output_df = kraken2output_df[(kraken2output_df.read_id.isin(blast_final_df.qseqid_blast)) | 
                                    (kraken2output_df.classification=='C')]

In [19]:
final_df_kraken_classified = kraken2output_df[kraken2output_df.taxid!=0]
final_df_kraken_unclassified = kraken2output_df[kraken2output_df.taxid==0]

In [20]:
final_df_kraken_classified = final_df_kraken_classified.reset_index(drop=True)

In [21]:
final_df_kraken_unclassified['superkingdom'] = 'Unclassified'
final_df_kraken_unclassified['phylum'] = 'Unclassified'
final_df_kraken_unclassified['class'] = 'Unclassified'
final_df_kraken_unclassified['order'] = 'Unclassified'
final_df_kraken_unclassified['family'] = 'Unclassified'
final_df_kraken_unclassified['genus'] = 'Unclassified'
final_df_kraken_unclassified['species'] = 'Unclassified'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [22]:
# Here in the output you could see a bit of incompability of the taxonomic database between ete3 module and the local blast database. Needs to be really carefull here.
# pretty slow
rank_list = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
for rank in rank_list:
    search_rank_output_name_append_column(final_df_kraken_classified, 
                                          final_df_kraken_classified.taxid, 
                                          rank)



In [23]:
final_df_k2 = pd.concat([final_df_kraken_classified, final_df_kraken_unclassified], ignore_index=True)
final_df_k2.columns = ['%s_k2' % x for x in final_df_k2.columns]
final_df_k2.to_csv(os.path.join(basedir, 'barcode%s.%sdb_k2.tab' % (barcode, db)), sep='\t')