# SILVA NR99 - ECV & EFC

In [1]:
# import basic functionality
from os import getcwd, listdir, chdir, mkdir, path
import glob

In [2]:
# Project root, plus the reference-database and benchmark-output
# directories for SILVA 138 that live underneath it.
mwd = '/home/mrobeson/projects/rescript_benchmarks'
refdb = path.join(mwd, 'ref_dbs', 'silva-138')
bench = path.join(mwd, 'benchmarks', 'silva-138')

In [3]:
# Switch the process working directory to the benchmark output area and
# echo it back so the notebook shows where subsequent cells run.
chdir(bench)
getcwd()

'/home/mrobeson/projects/rescript_benchmarks/benchmarks/silva-138'

#### Base ecv qsub string

In [4]:
# PBS job-script template for `qiime rescript evaluate-cross-validate`.
# Rendered with str.format(); placeholders: ppn, mem, wt, job_name,
# base_in_path, base_out_path, inseq, intax, exp_tax, obs_tax, eval_tax.
# `ppn` is used both for the PBS resource request and for --p-n-jobs.
# NOTE: this is a regular (non-raw) string literal, so every backslash
# followed by a newline is dropped by Python; the qiime command therefore
# ends up on a single line in the rendered script.
ecv_pbs_str = """#!/bin/bash
#PBS -l nodes=1:ppn={ppn},mem={mem},walltime={wt}
#PBS -l feature=xeon
#PBS -N {job_name}
#PBS -o {base_out_path}/{job_name}.out
#PBS -e {base_out_path}/{job_name}.err

export LC_ALL=en_US.utf-8
export LANG=en_US.utf-8

source activate qiime2-2020.6

date

cd {base_out_path}

qiime rescript evaluate-cross-validate \
    --i-sequences {base_in_path}/{inseq} \
    --i-taxonomy  {base_in_path}/{intax} \
    --p-n-jobs {ppn} \
    --o-expected-taxonomy {base_out_path}/{exp_tax} \
    --o-observed-taxonomy {base_out_path}/{obs_tax} \
    --o-evaluation {base_out_path}/{eval_tax}

date

source deactivate"""

#### Base efc qsub string

In [5]:
# PBS job-script template for `qiime rescript evaluate-fit-classifier`.
# Rendered with str.format(); placeholders: ppn, mem, wt, job_name,
# base_in_path, base_out_path, inseq, intax, classifier_out_fp,
# eval_out_fp, obs_tax_out_fp.
# `ppn` is used both for the PBS resource request and for --p-n-jobs.
# NOTE: this is a regular (non-raw) string literal, so every backslash
# followed by a newline is dropped by Python; the qiime command therefore
# ends up on a single line in the rendered script.
efc_pbs_str = """#!/bin/bash
#PBS -l nodes=1:ppn={ppn},mem={mem},walltime={wt}
#PBS -l feature=xeon
#PBS -N {job_name}
#PBS -o {base_out_path}/{job_name}.out
#PBS -e {base_out_path}/{job_name}.err

export LC_ALL=en_US.utf-8
export LANG=en_US.utf-8

source activate qiime2-2020.6

date

cd {base_out_path}

qiime rescript evaluate-fit-classifier \
    --i-sequences {base_in_path}/{inseq} \
    --i-taxonomy  {base_in_path}/{intax} \
    --p-n-jobs {ppn} \
    --o-classifier {base_out_path}/{classifier_out_fp} \
    --o-evaluation {base_out_path}/{eval_out_fp} \
    --o-observed-taxonomy {base_out_path}/{obs_tax_out_fp}

date

source deactivate"""

#### base evaluate taxa qsub string

In [6]:
# PBS job-script template for `qiime rescript evaluate-taxonomy`.
# Rendered with str.format(); placeholders: ppn, mem, wt, job_name,
# base_in_path, base_out_path, taxonomies, tax_stats.
# Unlike the ecv/efc templates, the job changes into {base_in_path} and
# {taxonomies} is inserted without a directory prefix.
# NOTE: this is a regular (non-raw) string literal, so every backslash
# followed by a newline is dropped by Python; the qiime command therefore
# ends up on a single line in the rendered script.
et_pbs_str = """#!/bin/bash
#PBS -l nodes=1:ppn={ppn},mem={mem},walltime={wt}
#PBS -l feature=xeon
#PBS -N {job_name}
#PBS -o {base_out_path}/{job_name}.out
#PBS -e {base_out_path}/{job_name}.err

export LC_ALL=en_US.utf-8
export LANG=en_US.utf-8

source activate qiime2-2020.6

date

cd {base_in_path}

qiime rescript evaluate-taxonomy \
    --i-taxonomies {taxonomies} \
    --o-taxonomy-stats  {base_out_path}/{tax_stats}

date

source deactivate"""

#### base evaluate seqs qsub string

In [7]:
# PBS job-script template for `qiime rescript evaluate-seqs`.
# Rendered with str.format(); placeholders: ppn, mem, wt, job_name,
# base_in_path, base_out_path, sequences, eval_seqs.
# The palette, k-mer subsampling fraction and k-mer lengths are fixed in
# the template rather than exposed as placeholders.
# NOTE: this is a regular (non-raw) string literal, so every backslash
# followed by a newline is dropped by Python; the qiime command therefore
# ends up on a single line in the rendered script.
es_pbs_str = """#!/bin/bash
#PBS -l nodes=1:ppn={ppn},mem={mem},walltime={wt}
#PBS -l feature=xeon
#PBS -N {job_name}
#PBS -o {base_out_path}/{job_name}.out
#PBS -e {base_out_path}/{job_name}.err

export LC_ALL=en_US.utf-8
export LANG=en_US.utf-8

source activate qiime2-2020.6

date

cd {base_in_path}

qiime rescript evaluate-seqs \
    --i-sequences {sequences} \
    --p-palette 'cividis' \
    --p-subsample-kmers 0.2 \
    --p-kmer-lengths 32 16 8 \
    --o-visualization {base_out_path}/{eval_seqs}
    
date

source deactivate"""

In [8]:
# This is not a generic function. Written specifically for the output we generated earlier.
# Using global string variable `ecv_pbs_str`
def make_ecv_benchmark_pbs_files(ppn = '1',
                                 mem = '100GB',
                                 wt = '72:00:00',
                                 base_in_path = '/home/mrobeson/projects/rescript_benchmarks/ref_dbs/silva-138/silva-138-nr99-default',
                                 base_out_path = '/home/mrobeson/projects/rescript_benchmarks/benchmarks/silva-138/silva-138-nr99-default-pipe',
                                 glob_seqs_list = ('*derep-seqs.qza', '*derep-seqs-515-806.qza')):
    """Write one evaluate-cross-validate PBS job script per sequence file.

    Every file in ``base_in_path`` matching one of the glob patterns is
    paired with a taxonomy file whose name is obtained by replacing
    ``'seqs'`` with ``'taxa'`` in the sequence file name (the taxonomy
    file's existence is not checked). For each pair the global template
    ``ecv_pbs_str`` is rendered and written to
    ``<base_out_path>/<taxonomy-basename>-ecv.pbs``.

    Args:
        ppn: Inserted into the template as both the PBS ``ppn`` request
            and the ``--p-n-jobs`` value.
        mem: Inserted into the PBS ``mem`` request.
        wt: Inserted into the PBS ``walltime`` request.
        base_in_path: Directory searched for sequence files; also used as
            the input directory inside the rendered script.
        base_out_path: Existing directory that receives the ``.pbs``
            files; also used for the output paths inside the script.
        glob_seqs_list: Glob pattern(s), relative to ``base_in_path``,
            selecting sequence files. A single pattern may be passed as
            a plain string.

    Raises:
        OSError: If ``base_in_path`` cannot be entered or a job file
            cannot be written (e.g. ``base_out_path`` does not exist).

    Side effects:
        Changes the process working directory to ``base_in_path`` and
        does not restore it.
    """
    # A bare string would otherwise be iterated character by character,
    # and a lone '*' character globs every file in the directory.
    if isinstance(glob_seqs_list, str):
        glob_seqs_list = [glob_seqs_list]

    # Patterns are resolved relative to the input directory so that the
    # matched names are plain file names usable in the template.
    chdir(base_in_path)

    seq_files = []
    for pattern in glob_seqs_list:
        seq_files.extend(glob.glob(pattern))
    # Drop repeats (a file matched by several patterns), keeping order.
    seq_files = list(dict.fromkeys(seq_files))

    for seq_file in seq_files:
        tax_file = seq_file.replace('seqs', 'taxa')
        # Job and output names are derived from the taxonomy file name.
        bn = path.splitext(tax_file)[0]
        job_name = bn + '-ecv'
        ecv_str = ecv_pbs_str.format(ppn = ppn,
                                     mem = mem,
                                     wt = wt,
                                     job_name = job_name,
                                     base_in_path = base_in_path,
                                     base_out_path = base_out_path,
                                     inseq = seq_file,
                                     intax = tax_file,
                                     exp_tax = bn + '-ecv-exptax.qza',
                                     obs_tax = bn + '-ecv-obstax.qza',
                                     eval_tax = bn + '-ecv-evaltax.qzv')

        job_file_name = job_name + '.pbs'

        with open(path.join(base_out_path, job_file_name), 'w') as outfile:
            outfile.write(ecv_str)

In [9]:
# This is not a generic function. Written specifically for the output we generated earlier.
# Using global string variable `efc_pbs_str`
def make_efc_benchmark_pbs_files(ppn = '1',
                        mem = '100GB',
                        wt = '72:00:00',
                        base_in_path = '/home/mrobeson/projects/rescript_benchmarks/ref_dbs/silva-138/silva-138-nr99-default',
                        base_out_path = '/home/mrobeson/projects/rescript_benchmarks/benchmarks/silva-138/silva-138-nr99-default-pipe',
                        glob_seqs_list = ('*derep-seqs.qza', '*derep-seqs-515-806.qza')):
    """Write one evaluate-fit-classifier PBS job script per sequence file.

    Every file in ``base_in_path`` matching one of the glob patterns is
    paired with a taxonomy file whose name is obtained by replacing
    ``'seqs'`` with ``'taxa'`` in the sequence file name (the taxonomy
    file's existence is not checked). For each pair the global template
    ``efc_pbs_str`` is rendered and written to
    ``<base_out_path>/<taxonomy-basename>-efc.pbs``.

    Args:
        ppn: Inserted into the template as both the PBS ``ppn`` request
            and the ``--p-n-jobs`` value.
        mem: Inserted into the PBS ``mem`` request.
        wt: Inserted into the PBS ``walltime`` request.
        base_in_path: Directory searched for sequence files; also used as
            the input directory inside the rendered script.
        base_out_path: Existing directory that receives the ``.pbs``
            files; also used for the output paths inside the script.
        glob_seqs_list: Glob pattern(s), relative to ``base_in_path``,
            selecting sequence files. A single pattern may be passed as
            a plain string.

    Raises:
        OSError: If ``base_in_path`` cannot be entered or a job file
            cannot be written (e.g. ``base_out_path`` does not exist).

    Side effects:
        Changes the process working directory to ``base_in_path`` and
        does not restore it.
    """
    # A bare string would otherwise be iterated character by character,
    # and a lone '*' character globs every file in the directory.
    if isinstance(glob_seqs_list, str):
        glob_seqs_list = [glob_seqs_list]

    # Patterns are resolved relative to the input directory so that the
    # matched names are plain file names usable in the template.
    chdir(base_in_path)

    seq_files = []
    for pattern in glob_seqs_list:
        seq_files.extend(glob.glob(pattern))
    # Drop repeats (a file matched by several patterns), keeping order.
    seq_files = list(dict.fromkeys(seq_files))

    for seq_file in seq_files:
        tax_file = seq_file.replace('seqs', 'taxa')
        # Job and output names are derived from the taxonomy file name.
        bn = path.splitext(tax_file)[0]
        job_name = bn + '-efc'
        efc_str = efc_pbs_str.format(ppn = ppn,
                                     mem = mem,
                                     wt = wt,
                                     job_name = job_name,
                                     base_in_path = base_in_path,
                                     base_out_path = base_out_path,
                                     inseq = seq_file,
                                     intax = tax_file,
                                     classifier_out_fp = bn + '-efc-classifier.qza',
                                     obs_tax_out_fp = bn + '-efc-obstax.qza',
                                     eval_out_fp = bn + '-efc-evaltax.qzv')

        job_file_name = job_name + '.pbs'

        with open(path.join(base_out_path, job_file_name), 'w') as outfile:
            outfile.write(efc_str)
    

## NR99 DB - default - no euks - processing

In [10]:
# Enter the no-euks reference database directory and echo the result.
chdir(path.join(refdb, 'silva-138-nr99-default-noeuks'))
getcwd()

'/home/mrobeson/projects/rescript_benchmarks/ref_dbs/silva-138/silva-138-nr99-default-noeuks'

In [26]:
# Generate the cross-validation job scripts for the no-euks database,
# using the function's default sequence glob patterns.
make_ecv_benchmark_pbs_files(
    base_in_path='/home/mrobeson/projects/rescript_benchmarks/ref_dbs/silva-138/silva-138-nr99-default-noeuks',
    base_out_path='/home/mrobeson/projects/rescript_benchmarks/benchmarks/silva-138/silva-138-nr99-default-noeuks-pipe',
)

In [27]:
# Generate the fit-classifier job scripts for the no-euks database,
# using the function's default sequence glob patterns.
make_efc_benchmark_pbs_files(
    base_in_path='/home/mrobeson/projects/rescript_benchmarks/ref_dbs/silva-138/silva-138-nr99-default-noeuks',
    base_out_path='/home/mrobeson/projects/rescript_benchmarks/benchmarks/silva-138/silva-138-nr99-default-noeuks-pipe',
)

## NR99 DB - default - no euks - good labels only

In [10]:
# Enter the no-euks reference database directory and echo the result.
chdir(path.join(refdb, 'silva-138-nr99-default-noeuks'))
getcwd()

'/home/mrobeson/projects/rescript_benchmarks/ref_dbs/silva-138/silva-138-nr99-default-noeuks'

In [13]:
# Generate the cross-validation job scripts for the "-gl" sequence files
# of the no-euks database, written to a separate "-gl-pipe" directory.
make_ecv_benchmark_pbs_files(
    base_in_path='/home/mrobeson/projects/rescript_benchmarks/ref_dbs/silva-138/silva-138-nr99-default-noeuks',
    base_out_path='/home/mrobeson/projects/rescript_benchmarks/benchmarks/silva-138/silva-138-nr99-default-noeuks-gl-pipe',
    glob_seqs_list=['*derep-seqs-gl.qza', '*derep-seqs-515-806-gl.qza'],
)

In [14]:
# Generate the fit-classifier job scripts for the "-gl" sequence files
# of the no-euks database, written to a separate "-gl-pipe" directory.
make_efc_benchmark_pbs_files(
    base_in_path='/home/mrobeson/projects/rescript_benchmarks/ref_dbs/silva-138/silva-138-nr99-default-noeuks',
    base_out_path='/home/mrobeson/projects/rescript_benchmarks/benchmarks/silva-138/silva-138-nr99-default-noeuks-gl-pipe',
    glob_seqs_list=['*derep-seqs-gl.qza', '*derep-seqs-515-806-gl.qza'],
)