### STEP : Taxonomic assignment
#### Taxonomic assignment with Silva 16S

This step uses the Silva 16S classifier for QIIME 2. All available pre-trained classifiers are listed on the QIIME 2 [data resources](https://docs.qiime2.org/2020.6/data-resources/) page.

#### Versions 
- [Silva 138 99% OTUs full-length sequences](https://data.qiime2.org/2020.6/common/silva-138-99-nb-classifier.qza)
- [Silva 138 99% OTUs from 515F/806R region of sequences](https://data.qiime2.org/2020.6/common/silva-138-99-515-806-nb-classifier.qza)
- [Greengenes 13_8 99% OTUs full-length sequences](https://data.qiime2.org/2020.6/common/gg-13-8-99-nb-classifier.qza)
- [Greengenes 13_8 99% OTUs from 515F/806R region of sequences](https://data.qiime2.org/2020.6/common/gg-13-8-99-515-806-nb-classifier.qza)

#### Methods
- [qiime feature-classifier](https://docs.qiime2.org/2022.2/plugins/available/feature-classifier/)
- [qiime feature-classifier classify-sklearn](https://docs.qiime2.org/2021.8/plugins/available/feature-classifier/classify-sklearn)
- [qiime metadata](https://docs.qiime2.org/2022.2/plugins/available/metadata/)
- [classify-hybrid-vsearch-sklearn](https://docs.qiime2.org/2022.2/plugins/available/feature-classifier/classify-hybrid-vsearch-sklearn/)
- [qiime metadata tabulate](https://docs.qiime2.org/2022.2/plugins/available/metadata/tabulate/)
- [qiime taxa](https://docs.qiime2.org/2022.2/plugins/available/taxa/)
- [qiime taxa barplot](https://docs.qiime2.org/2022.2/plugins/available/taxa/barplot/)

## Setup and settings

In [1]:
# Importing packages
import os
import pandas as pd
from qiime2 import Artifact
from qiime2 import Visualization
from qiime2 import Metadata
import qiime2.plugins.metadata.actions as metadata_actions

from qiime2.plugins import feature_classifier
from qiime2.plugins import metadata
from qiime2.plugins import taxa

from qiime2.plugins.taxa.methods import collapse
from qiime2.plugins.taxa.methods import filter_table
from qiime2.plugins.feature_table.visualizers import summarize

import matplotlib.pyplot as plt

from utils import *

%matplotlib inline

In [2]:
# Install the specific scikit-learn version needed to use the pre-trained classifier.
# NOTE(review): presumably this is the version the classifier was trained with - confirm.
%pip install --user 'scikit-learn==0.23.1'

Note: you may need to restart the kernel to use updated packages.


In [3]:
# from qiime2.plugins import demux, deblur, quality_filter, \
#                            metadata, feature_table, alignment, \
#                            phylogeny, diversity, emperor, feature_classifier, \
#                            taxa, composition

### Receiving the parameters

The following cell can receive parameters using the [papermill](https://papermill.readthedocs.io/en/latest/) tool.

In [4]:
# Default parameter values; papermill can override any of them at run time.
# Path to the sample metadata file.
metadata_file = '/home/lauro/nupeb/rede-micro/redemicro-miliane-nutri/data/raw/metadata/miliane-metadata-CxAC.tsv'
# Root directory of the project.
base_dir = os.path.join('/', 'home', 'lauro', 'nupeb', 'rede-micro', 'redemicro-miliane-nutri')
# Experiment name; used as the sub-folder under `<base_dir>/experiments`.
experiment_name = 'miliane-CxAC-trim'
# When True, output files are recreated even if they already exist.
replace_files = False

In [5]:
# Resolve the experiment root first; the image and sheet folders live inside it.
experiment_folder = os.path.abspath(os.path.join(base_dir, 'experiments', experiment_name))
img_folder, sheet_folder = (
    os.path.abspath(os.path.join(experiment_folder, subdir))
    for subdir in ('imgs', 'sheets')
)

### Defining names, paths and flags

In [6]:
# QIIME2 Artifacts folder
qiime_folder = os.path.join(experiment_folder, 'qiime-artifacts')

# Input - DADA2 Artifacts
dada2_tabs_path = os.path.join(qiime_folder, 'dada2-tabs.qza')
dada2_reps_path = os.path.join(qiime_folder, 'dada2-reps.qza')
dada2_stat_path = os.path.join(qiime_folder, 'dada2-stat.qza')

# Input - Pre-trained taxonomic classifier, resolved relative to the current working directory
classifier_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'models', 'silva-138-99-nb-classifier.qza'))

# Output - Excel file
excel_path = os.path.join(sheet_folder, 'abundances.xlsx')

# Output - Metataxonomy Artifact
metatax_path = os.path.join(qiime_folder, 'metatax.qza')
metatax_view_path = os.path.join(qiime_folder, 'metatax.qzv')
metatax_bar_path = os.path.join(qiime_folder, 'metatax-bar.qzv')

# Flags - Load existing files or (re)create them
need_tax = not os.path.isfile(metatax_path) or replace_files
# The table and barplot visualizations are loaded together, so both files must
# exist to skip regeneration; a single missing file triggers a rebuild.
need_view = not (os.path.isfile(metatax_view_path) and os.path.isfile(metatax_bar_path)) or replace_files

## Step execution

### Load input files

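This step loads the `Metadata` file and the DADA2 outputs: the `FeatureData[Sequence]` Artifact with the representative sequences and the `FeatureTable[Frequency]` Artifact. The `FeatureData[Taxonomy]` Artifact is loaded if it already exists; otherwise it is created with the pre-trained classifier.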

In [7]:
# Load the sample metadata from `metadata_file` as a QIIME2 Metadata object
metadata_qa = Metadata.load(metadata_file)

In [8]:
if not need_tax:

    # Load the existing FeatureData[Taxonomy] Artifact
    metatax = Artifact.load(metatax_path)

else:

    # Load TaxonomicClassifier Artifact
    classifier = Artifact.load(classifier_path)

    # Load FeatureData[Sequence] Artifact
    reps = Artifact.load(dada2_reps_path)

    # `threads` is not set in this notebook's parameter cell: use it when it has
    # been provided (e.g. injected as a parameter) and fall back to one job otherwise.
    n_jobs = globals().get('threads', 1)

    # Classify ASV features and create a new FeatureData[Taxonomy]
    metatax_qa = feature_classifier.methods.classify_sklearn(reads=reps, classifier=classifier, n_jobs=n_jobs)

    # Bind the taxonomy to the same name as the load branch so that later cells
    # can use `metatax` regardless of which branch ran.
    metatax = metatax_qa.classification

    # Save FeatureData[Taxonomy] Artifact
    metatax.save(metatax_path)

# Load FeatureTable[Frequency] Artifact
tabs = Artifact.load(dada2_tabs_path)

In [9]:
if not need_view:

    # Load the existing visualizations
    metatax_qv = Visualization.load(metatax_view_path)
    metatax_bar_qv = Visualization.load(metatax_bar_path)

else:

    # Use the taxonomy produced by whichever branch of the taxonomy cell ran:
    # the fresh classification result (`metatax_qa`) when it had to be computed,
    # or the Artifact loaded from disk (`metatax`) otherwise.
    taxonomy = metatax_qa.classification if need_tax else metatax

    # Metataxonomy Table Visualization
    # Keep only the Visualization so both branches bind the same kind of object.
    metatax_qv = metadata.visualizers.tabulate(taxonomy.view(Metadata)).visualization
    metatax_qv.save(metatax_view_path)

    # Load FeatureTable[Frequency] Artifact
    tabs = Artifact.load(dada2_tabs_path)

    # Barplot Visualization
    # Interactive bar plots of the taxonomic composition of the samples.
    metatax_bar_qv = taxa.visualizers.barplot(tabs, taxonomy, metadata_qa).visualization
    metatax_bar_qv.save(metatax_bar_path)

### Metataxonomy analysis

In [21]:
def filter_and_collapse(tab, tax, meta, lvl, exclude=True, exclude_list='uncultured,unidentified,metagenome'):
    """Filter a feature table by taxonomic rank and collapse it at that rank.

    Features are kept only when their taxonomy string contains the prefix of
    the requested rank (for example ``'g__'`` for ``lvl=6``). The filtered
    table is then collapsed at ``lvl`` and summarized.

    Parameters
    ----------
    tab : feature table passed to ``filter_table`` as ``table``.
    tax : taxonomy passed to ``filter_table`` and ``collapse`` as ``taxonomy``.
    meta : sample metadata passed to ``summarize`` as ``sample_metadata``.
    lvl : int
        Taxonomic level from 1 to 7, mapped to the rank prefixes
        d, p, c, o, f, g, s.
    exclude : bool
        When True, ``exclude_list`` is passed to ``filter_table`` as ``exclude``;
        when False, nothing is excluded.
    exclude_list : str
        Comma-separated search terms to exclude.

    Returns
    -------
    tuple
        ``(collapsed_table, collapsed_table_view)``: the collapsed table and
        the visualization produced by ``summarize`` for it.

    Raises
    ------
    ValueError
        If ``lvl`` is outside 1..7.
    """
    rank_prefixes = ('d', 'p', 'c', 'o', 'f', 'g', 's')
    # Reject out-of-range levels up front: lvl=0 or a negative value would
    # otherwise wrap around through negative indexing and select the wrong rank.
    if not 1 <= lvl <= len(rank_prefixes):
        raise ValueError(f'lvl must be between 1 and {len(rank_prefixes)}, got {lvl!r}')

    from qiime2.plugins.taxa.methods import collapse
    from qiime2.plugins.taxa.methods import filter_table
    from qiime2.plugins.feature_table.visualizers import summarize

    to_include = rank_prefixes[lvl - 1] + '__'
    to_exclude = exclude_list if exclude else None

    filtered_tabs = filter_table(
        table=tab,
        taxonomy=tax,
        include=to_include,
        exclude=to_exclude,
        mode='contains').filtered_table

    collapsed_table = collapse(table=filtered_tabs, taxonomy=tax, level=lvl).collapsed_table
    collapsed_table_view = summarize(table=collapsed_table, sample_metadata=meta).visualization

    return collapsed_table, collapsed_table_view

def make_rank(tab, top=None):
    """Rank the columns of a table by their total, highest first.

    Parameters
    ----------
    tab : object whose ``view(pd.DataFrame)`` returns the table as a DataFrame.
    top : int or None
        Number of leading entries to keep; ``None`` keeps all of them.

    Returns
    -------
    pandas.Series
        Column totals sorted in descending order, truncated to ``top`` entries
        when ``top`` is given.
    """
    import pandas as pd
    df = tab.view(pd.DataFrame)
    rank_series = df.sum(axis=0).sort_values(ascending=False)
    # The original tested an undefined name (`limit`); the parameter is `top`.
    if top is not None:
        rank_series = rank_series.iloc[:top]
    return rank_series

def split_tax(serie):
    """Unfinished stub: defines the taxonomic level names and returns None.

    The ``serie`` argument is not used yet.
    NOTE(review): presumably meant to split the ';'-separated taxonomy strings
    of ``serie`` into one field per level - TODO implement or remove.
    """
    # Names for the seven taxonomic levels, from kingdom down to species.
    level_names = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    
    

In [22]:
# Collapse the feature table at level 7 (the 's' rank prefix in `filter_and_collapse`).
# With exclude=False the exclude_list is not applied, so no taxa are dropped by name.
collapsed_table, collapsed_table_view = filter_and_collapse(
    tabs, metatax, metadata_qa, 
    lvl=7, 
    exclude=False, 
    exclude_list='uncultured,unidentified,metagenome')

In [27]:
# Rank the collapsed taxa by total frequency and keep the first 10
rank = make_rank(collapsed_table, top=10)
# Last expression of the cell: displayed as the cell output
rank

d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;g__Muribaculaceae;s__uncultured_bacterium                                      395859.0
d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotellaceae_NK3B31_group;s__uncultured_bacterium                         135875.0
d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotellaceae_UCG-001;s__uncultured_bacterium                               71521.0
d__Bacteria;p__Firmicutes;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__[Eubacterium]_siraeum_group;s__uncultured_bacterium                          37478.0
d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Turicibacter;s__uncultured_bacterium                                     29043.0
d__Bacteria;p__Firmicutes;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__[Eubacterium]_ruminantium_group;s__uncultured_bacterium                       28523.0
d__B

In [18]:
# View the collapsed table as a DataFrame and total each taxon column, highest first.
collapsed_table_df = collapsed_table.view(pd.DataFrame)
taxon_totals = collapsed_table_df.sum(axis=0).sort_values(ascending=False)
# Display (second-to-last field, last field, integer total) for each taxonomy string.
[(*taxon.split(';')[-2:], int(total)) for taxon, total in taxon_totals.items()]

[('g__Muribaculaceae', 's__uncultured_bacterium', 395859),
 ('g__Prevotellaceae_NK3B31_group', 's__uncultured_bacterium', 135875),
 ('g__Prevotellaceae_UCG-001', 's__uncultured_bacterium', 71521),
 ('g__[Eubacterium]_siraeum_group', 's__uncultured_bacterium', 37478),
 ('g__Turicibacter', 's__uncultured_bacterium', 29043),
 ('g__[Eubacterium]_ruminantium_group', 's__uncultured_bacterium', 28523),
 ('g__Muribaculaceae', 's__uncultured_Bacteroidales', 26290),
 ('g__Alistipes', 's__uncultured_bacterium', 15042),
 ('g__Clostridia_vadinBB60_group', 's__unidentified', 13504),
 ('g__Clostridia_vadinBB60_group', 's__uncultured_bacterium', 12575),
 ('g__Mucispirillum', 's__Mucispirillum_schaedleri', 8377),
 ('g__Muribaculum', 's__uncultured_bacterium', 7727),
 ('g__Candidatus_Saccharimonas', 's__uncultured_bacterium', 7602),
 ('g__[Eubacterium]_xylanophilum_group', 's__uncultured_bacterium', 7375),
 ('g__Bacteroides', 's__Bacteroides_caecimuris', 7299),
 ('g__Gastranaerophilales', 's__uncultured

## Diversity test 
**TODO** move this part to the other notebook

In [None]:
from qiime2.plugins.diversity.pipelines import alpha
from qiime2.plugins.diversity.visualizers import alpha_group_significance

# Reference list of metric names; this tuple is not used below, only `metric` is passed to `alpha`.
# NOTE(review): presumably these are the metric names accepted by `alpha` - confirm against the plugin docs.
metrics = ('ace', 'berger_parker_d', 'brillouin_d', 'chao1', 'chao1_ci', 'dominance', 'doubles', 'enspie', 'esty_ci', 'fisher_alpha', 'gini_index', 'goods_coverage', 'heip_e', 'kempton_taylor_q', 'lladser_pe', 'margalef', 'mcintosh_d', 'mcintosh_e', 'menhinick', 'michaelis_menten_fit', 'observed_features', 'osd', 'pielou_e', 'robbins', 'shannon', 'simpson', 'simpson_e', 'singles', 'strong')
# Metric actually computed
metric = 'simpson_e'
# Alpha diversity of the collapsed table for the chosen metric
alpha_diversity = alpha(table=collapsed_table, metric=metric).alpha_diversity
# Last expression of the cell: the group-significance visualization, built from the sample metadata
alpha_group_significance(alpha_diversity=alpha_diversity, metadata=metadata_qa).visualization

In [None]:
# metatax_bar_qv