In [1]:
import os
import qiime2 as q2
from qiime2.plugins import rescript, feature_table as ft
import pandas as pd
from glob import glob
import timeit
from rescript.cross_validate import _check_time

# Lift the column-width limit used when pandas renders DataFrames.
# The option is addressed by its fully qualified name: pandas resolves
# shorthand keys such as 'max_colwidth' by pattern matching, which can become
# ambiguous if similarly named options are added in a later release.
# NOTE(review): 0 is kept from the original setting and appears intended to
# disable truncation; newer pandas documents None for "unlimited" -- confirm
# against the installed pandas version.
pd.set_option('display.max_colwidth', 0)


# Define inputs

Modify the file paths in the next cell to point to your local copies before running this notebook.
The paths below point to the sequence and taxonomy files for the Greengenes 13_8 release, clustered at different OTU percent similarities.

Greengenes files can be downloaded and unzipped from here:

ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz

**DO NOT MODIFY ANY OTHER CELLS IN THIS NOTEBOOK IF YOU ARE ATTEMPTING TO REPLICATE THESE RESULTS**

In [2]:
# Number of parallel jobs passed to the classification actions below.
n_jobs = 4

# Root directory of the unpacked Greengenes 13_8 OTU release.
dbdir = '../../../ref_dbs/gg_13_8_otus/'

# OTU clustering levels (percent similarity) to import.
otu_levels = ['64', '79', '88', '91', '94', '97', '99']

# Map each database label ('gg_64', 'gg_79', ...) to its imported artifacts:
#   'seqs' -> FeatureData[Sequence] from rep_set/<n>_otus.fasta
#   'taxa' -> FeatureData[Taxonomy] from taxonomy/<n>_otu_taxonomy.txt
# Paths are assembled with os.path.join so that `dbdir` works whether or not
# it ends with a path separator.
dbs = {
    'gg_{0}'.format(n): {
        'seqs': q2.Artifact.import_data(
            'FeatureData[Sequence]',
            os.path.join(dbdir, 'rep_set', '{0}_otus.fasta'.format(n))),
        'taxa': q2.Artifact.import_data(
            'FeatureData[Taxonomy]',
            os.path.join(dbdir, 'taxonomy', '{0}_otu_taxonomy.txt'.format(n)),
            view_type='HeaderlessTSVTaxonomyFormat')}
    for n in otu_levels}


# Perform CV Classification
Run both k-fold cross-validation and "perfect" classifiers (no CV, leaky data, best possible performance).

In [3]:
# Cross-validation
# For every reference database, run RESCRIPt's cross-validated classification
# and save the expected and observed taxonomies. Results are cached on disk:
# a database is skipped when both of its output files already exist.
# times_cv records wall-clock seconds only for databases actually run.
times_cv = {}
for n, v in dbs.items():
    s, t = v['seqs'], v['taxa']
    print(n)
    exp_taxa_fp = '../data/expected/{0}_expected_taxonomy.qza'.format(n)
    obs_taxa_fp = '../data/observed/{0}_observed_taxonomy.qza'.format(n)
    # Create the output directories up front so that a long-running job
    # cannot fail at save time because a directory is missing.
    for out_fp in (exp_taxa_fp, obs_taxa_fp):
        os.makedirs(os.path.dirname(out_fp), exist_ok=True)
    # Require BOTH outputs before skipping: the expected file is written
    # first, so its presence alone does not prove the observed file was saved.
    if not (os.path.exists(exp_taxa_fp) and os.path.exists(obs_taxa_fp)):
        start = timeit.default_timer()
        exp, obs, = rescript.actions.evaluate_cross_validate(s, t, n_jobs=n_jobs)
        times_cv[n] = timeit.default_timer() - start
        exp.save(exp_taxa_fp)
        obs.save(obs_taxa_fp)

gg_64
gg_79
gg_88
gg_91
gg_94
gg_97
gg_99
Validation: 14.00s




Fold 0 split: 26.52s




Fold 0 fit: 913.69s
Fold 0 classify: 1975.79s
Fold 1 split: 22.64s




Fold 1 fit: 861.96s
Fold 1 classify: 1925.18s
Fold 2 split: 21.87s




Fold 2 fit: 830.71s
Fold 2 classify: 1867.59s
Total Runtime: 8461.67s


In [8]:
# "Perfect" classifier, no cross-validation
# For every reference database, run evaluate_fit_classifier on the full
# database and save its third output as the observed taxonomy; the input
# taxonomy itself is saved as the expected taxonomy. Results are cached on
# disk: a database is skipped when both of its output files already exist.
# times_nocv records wall-clock seconds only for databases actually run.
times_nocv = {}
for n, v in dbs.items():
    s, t = v['seqs'], v['taxa']
    print(n)
    obs_taxa_fp = '../data/observed/{0}_observed_taxonomy_noCV.qza'.format(n)
    exp_taxa_fp = '../data/expected/{0}_expected_taxonomy_noCV.qza'.format(n)
    # Create the output directories up front so that a long-running job
    # cannot fail at save time because a directory is missing.
    for out_fp in (obs_taxa_fp, exp_taxa_fp):
        os.makedirs(os.path.dirname(out_fp), exist_ok=True)
    # Require both outputs before skipping, so a missing observed file
    # triggers a re-run instead of being silently ignored.
    if not (os.path.exists(exp_taxa_fp) and os.path.exists(obs_taxa_fp)):
        start = timeit.default_timer()
        _, _, obs, = rescript.actions.evaluate_fit_classifier(s, t, n_jobs=n_jobs)
        times_nocv[n] = timeit.default_timer() - start
        obs.save(obs_taxa_fp)
        t.save(exp_taxa_fp)


gg_64
gg_79
gg_88
gg_91
gg_94
gg_97
Validation: 6.98s




Training: 564.78s
Classification: 2491.23s




Evaluation: 3.09s
Total Runtime: 3066.08s
gg_99
Validation: 13.18s




Training: 1446.73s
Classification: 5906.85s




Evaluation: 7.11s
Total Runtime: 7373.87s


In [6]:
def taxonomy_label(fp, marker):
    """Return the label for a saved taxonomy file.

    The label is the file's base name with its extension removed and every
    occurrence of ``marker`` deleted, e.g.
    ``gg_97_expected_taxonomy_noCV.qza`` -> ``gg_97_noCV``.
    """
    return os.path.splitext(os.path.basename(fp))[0].replace(marker, '')


def load_taxonomies(directory, marker):
    """Load every ``.qza`` artifact in ``directory`` into a dict keyed by label.

    Only ``*.qza`` files are matched, so stray files in the directory
    (editor backups, ``.DS_Store``, ...) are not passed to ``Artifact.load``.
    """
    return {taxonomy_label(fp, marker): q2.Artifact.load(fp)
            for fp in sorted(glob(os.path.join(directory, '*.qza')))}


exp_taxonomies = load_taxonomies('../data/expected/', '_expected_taxonomy')
obs_taxonomies = load_taxonomies('../data/observed/', '_observed_taxonomy')


# Evaluate Classification Accuracy

In [11]:
# Expected and observed taxonomies are paired positionally, so both
# dictionaries must cover exactly the same labels. Derive one label list and
# fail loudly on a mismatch instead of pairing artifacts with the wrong labels.
eval_labels = sorted(exp_taxonomies.keys())
if eval_labels != sorted(obs_taxonomies.keys()):
    raise ValueError(
        'Expected and observed taxonomies do not cover the same labels; '
        'unmatched: {0}'.format(
            sorted(set(exp_taxonomies) ^ set(obs_taxonomies))))

class_eval, = rescript.actions.evaluate_classifications(
    expected_taxonomies=[exp_taxonomies[i] for i in eval_labels],
    observed_taxonomies=[obs_taxonomies[i] for i in eval_labels],
    labels=eval_labels)

# Make sure the output directory exists before saving the visualization.
class_eval_fp = '../data/eval/gg_otu_cluster_classification_eval.qzv'
os.makedirs(os.path.dirname(class_eval_fp), exist_ok=True)
class_eval.save(class_eval_fp)

'../data/eval/gg_otu_cluster_eval.qzv'

# Evaluate Taxonomic Information

In [18]:
# Database labels in sorted order, with the taxonomy artifacts in the
# matching order; both evaluations below use the same inputs.
db_labels = sorted(dbs.keys())
db_taxonomies = [dbs[label]['taxa'] for label in db_labels]

# Evaluation with the action's default settings.
tax_eval, = rescript.actions.evaluate_taxonomy(
    taxonomies=db_taxonomies,
    labels=db_labels)
tax_eval.save('../data/eval/gg_otu_cluster_taxonomic_eval.qzv')

# Same evaluation with a rank-handle regex supplied.
# NOTE(review): judging by the output filename this is meant to drop empty
# ranks -- confirm against the RESCRIPt evaluate_taxonomy documentation.
tax_eval, = rescript.actions.evaluate_taxonomy(
    taxonomies=db_taxonomies,
    labels=db_labels,
    rank_handle_regex="^[dkpcofgs]__")
tax_eval.save('../data/eval/gg_otu_cluster_taxonomic_eval_dropemptyranks.qzv')

'../data/eval/gg_otu_cluster_taxonomic_eval_dropemptyranks.qzv'