### STEP: PICRUSt2 Analysis



#### Example

- [PICRUST2 tutorial](https://github.com/picrust/picrust2/wiki/q2-picrust2-Tutorial)
- [Limitations](https://github.com/picrust/picrust2/wiki/Key-Limitations)


#### Methods
- [composition](https://docs.qiime2.org/2022.8/plugins/available/composition/)

## Setup and settings

In [1]:
# Importing packages
import os
import biom
import pandas as pd
from qiime2 import Artifact
from qiime2 import Visualization
from qiime2 import Metadata

from qiime2.plugins.feature_table.visualizers import summarize

from picrust2.pipeline import full_pipeline
from picrust2.default import (default_ref_dir, default_tables, default_regroup_map, default_pathway_map)
from qiime2.plugins import picrust2
from qiime2.plugins.feature_table.methods import filter_samples
from qiime2.plugins.feature_table.methods import filter_seqs

%matplotlib inline

### Receiving the parameters

The following cell can receive parameters using the [papermill](https://papermill.readthedocs.io/en/latest/) tool.

In [2]:
# Default parameter values; they are used as-is when nothing overrides them
# (see the papermill note above this cell).
# Path to the sample metadata TSV file.
metadata_file = '/home/lauro/nupeb/rede-micro/redemicro-miliane-nutri/data/raw/metadata/miliane-metadata-CxAC.tsv'
# Project root; the experiment folder is resolved under <base_dir>/experiments.
base_dir = os.path.join('/', 'home', 'lauro', 'nupeb', 'rede-micro', 'redemicro-miliane-nutri')
# Name of the experiment sub-folder that holds inputs and outputs.
experiment_name = 'miliane-CxAC-trim'
# Metadata column name - presumably the sample grouping column; it is not
# referenced in the visible part of this notebook (TODO confirm usage).
class_col = 'group-id'
# When True, outputs are regenerated even if the files already exist.
replace_files = False

In [3]:
# Parameters
# NOTE(review): this cell looks like the one papermill injects at run time;
# its assignments override the defaults set in the previous cell. Not every
# value below is read by the visible part of this notebook - presumably the
# same parameter set is shared with other steps (TODO confirm).
experiment_name = "ana-flavia-NRxSTD-NR-trim"
base_dir = "/home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri"
manifest_file = "/home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri/data/raw/manifest/manifest-ana-flavia-NRxSTD-NR.csv"
metadata_file = "/home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri/data/raw/metadata/metadata-ana-flavia-NRxSTD-NR.tsv"
class_col = "group-id"
classifier_file = "/home/lauro/nupeb/rede-micro/models/silva-138-99-nb-classifier.qza"
top_n = 20
replace_files = False
phred = 20
trunc_f = 0
trunc_r = 0
overlap = 12
threads = 6
trim = {
    "overlap": 8,
    "forward_primer": "CCTACGGGRSGCAGCAG",
    "reverse_primer": "GGACTACHVGGGTWTCTAAT",
}


In [4]:
# Absolute path of the folder that holds everything for this experiment
experiment_folder = os.path.abspath(
    os.path.join(base_dir, 'experiments', experiment_name)
)

# Absolute path of the images sub-folder inside the experiment folder
img_folder = os.path.abspath(
    os.path.join(experiment_folder, 'imgs')
)

### Defining names, paths and flags

In [5]:
# Folder holding the QIIME 2 artifacts of this experiment
qiime_folder = os.path.join(experiment_folder, 'qiime-artifacts')

# Inputs - DADA2 artifacts (table and sequences) stored in the QIIME 2 folder
dada2_tabs_path, dada2_reqs_path = (
    os.path.join(qiime_folder, qza_name)
    for qza_name in ('dada2-tabs.qza', 'dada2-reps.qza')
)

# PICRUSt2 output folder (absolute path)
picrust2_folder = os.path.abspath(
    os.path.join(experiment_folder, 'picrust2')
)

# Create the output folder, and report it, only when it does not exist yet
if not os.path.isdir(picrust2_folder):
    os.makedirs(picrust2_folder)
    print(f'New picrust2-artifacts folder path created: {picrust2_folder}')

In [6]:
# Metagenome function artifacts (.qza) stored in the PICRUSt2 folder
ec_path, ko_path, pathway_path = (
    os.path.join(picrust2_folder, qza_name)
    for qza_name in (
        'ec-pred-metagen.qza',
        'ko-pred-metagen.qza',
        'pathway-abundance.qza',
    )
)

# Matching visualization artifacts: same base name with a .qzv extension
ec_viz_path = os.path.splitext(ec_path)[0] + '.qzv'
ko_viz_path = os.path.splitext(ko_path)[0] + '.qzv'
pathway_viz_path = os.path.splitext(pathway_path)[0] + '.qzv'

# Plain tables (function x sample) written as TSV
ec_fpath = os.path.join(picrust2_folder, 'ec.tsv')
ko_fpath = os.path.join(picrust2_folder, 'ko.tsv')
pathway_fpath = os.path.join(picrust2_folder, 'pathway.tsv')

# Same tables (function x sample) with an added description column
ec_desc_fpath = os.path.join(picrust2_folder, 'ec-desc.tsv')
ko_desc_fpath = os.path.join(picrust2_folder, 'ko-desc.tsv')
pathway_desc_fpath = os.path.join(picrust2_folder, 'pathway-desc.tsv')

## Step execution

### Load input files

This step loads the `Metadata` file and the QIIME 2 Artifacts produced by DADA2: the `FeatureTable[Frequency]` table and its representative sequences.

In [7]:
# Sample metadata, read from the file given by the `metadata_file` parameter
metadata_qa = Metadata.load(metadata_file)

# DADA2 artifacts, loaded in order: the FeatureTable[Frequency] table, then
# the sequences artifact (presumably FeatureData[Sequence] - TODO confirm)
tabs, seqs = (
    Artifact.load(qza_path)
    for qza_path in (dada2_tabs_path, dada2_reqs_path)
)

In [8]:
# Restrict the feature table to the samples whose IDs appear in the metadata
tabs = filter_samples(table=tabs, metadata=metadata_qa).filtered_table

# Restrict the sequences to the features that remain in the filtered table
seqs = filter_seqs(data=seqs, table=tabs).filtered_data

### Execute full pipeline

The entire PICRUSt2 pipeline will be run using a single method, called `picrust2.methods.full_pipeline`. This method will run each of the 4 key steps: 

1. sequence placement
2. hidden-state prediction of genomes
3. metagenome prediction
4. pathway-level predictions.

More information is available in the [documentation](https://github.com/picrust/picrust2/wiki/Full-pipeline-script).

In [9]:
# The pipeline runs when regeneration is forced (`replace_files`) or when any
# of the three expected .qza outputs is missing; otherwise the saved artifacts
# are reused.
pipeline_outputs = (ec_path, ko_path, pathway_path)
need_pipeline = replace_files or not all(
    os.path.isfile(qza_path) for qza_path in pipeline_outputs
)

if need_pipeline:
    # Use the `threads` parameter when it was provided (e.g. injected by
    # papermill); fall back to the previous hard-coded value of 6 otherwise.
    n_threads = globals().get('threads', 6)

    # Single call that performs sequence placement, hidden-state prediction,
    # metagenome prediction and pathway-level prediction.
    results = picrust2.methods.full_pipeline(
        table=tabs,
        seq=seqs,
        threads=n_threads,
        placement_tool='sepp',
        hsp_method='pic',
        max_nsti=2,
        highly_verbose=True
    )
    ec_metagenome = results.ec_metagenome
    ko_metagenome = results.ko_metagenome
    pathway_abundance = results.pathway_abundance
else:
    # All outputs already exist on disk: load them instead of recomputing
    ec_metagenome = Artifact.load(ec_path)
    ko_metagenome = Artifact.load(ko_path)
    pathway_abundance = Artifact.load(pathway_path)

### Persist created artifacts

We will define file paths and persist all artifacts. We start with the `.qza` files. Next, we save the visualizations as `.qzv` files. Finally, we save `biom`-like tables as `.tsv` files, together with versions that include a brief description of each function.

In [10]:
# Persist the artifacts only when the pipeline has just produced them
if need_pipeline:
    # Export each artifact into a folder named after its .qza path without the
    # extension. os.path.splitext strips only the final extension, so a dot
    # elsewhere in the path (e.g. in a directory or experiment name) does not
    # truncate the target folder.
    ec_metagenome.export_data(output_dir=os.path.splitext(ec_path)[0])
    ko_metagenome.export_data(output_dir=os.path.splitext(ko_path)[0])
    pathway_abundance.export_data(output_dir=os.path.splitext(pathway_path)[0])

    # Save artifacts as .qza files
    ec_metagenome.save(ec_path)
    ko_metagenome.save(ko_path)
    pathway_abundance.save(pathway_path)

In [11]:
# Visualizations are rebuilt when regeneration is forced, when the pipeline
# has just produced new artifacts (existing .qzv files would be stale), or
# when any .qzv file is missing.
viz_outputs = (ec_viz_path, ko_viz_path, pathway_viz_path)
need_viz = replace_files or need_pipeline or not all(
    os.path.isfile(qzv_path) for qzv_path in viz_outputs
)

if need_viz:
    # Create visualization artifacts (feature-table summaries with metadata)
    ec_viz = summarize(table=ec_metagenome, sample_metadata=metadata_qa).visualization
    ko_viz = summarize(table=ko_metagenome, sample_metadata=metadata_qa).visualization
    path_viz = summarize(table=pathway_abundance, sample_metadata=metadata_qa).visualization

    # Save visualization artifacts as .qzv files
    ec_viz.save(ec_viz_path)
    ko_viz.save(ko_viz_path)
    path_viz.save(pathway_viz_path)

In [12]:
need_biom = replace_files
need_biom |= not (os.path.isfile(ec_fpath) and os.path.isfile(ko_fpath) and os.path.isfile(pathway_fpath))
if need_biom:
    # Convert artifact to dataframe - transposed
    df_pathway = pathway_abundance.view(pd.DataFrame).T
    df_ec = ec_metagenome.view(pd.DataFrame).T
    df_ko = ko_metagenome.view(pd.DataFrame).T

    # Write dataframe to a tsv file
    df_pathway.to_csv(pathway_fpath, sep='\t', index=True)
    df_ec.to_csv(ec_fpath, sep='\t', index=True)
    df_ko.to_csv(ko_fpath, sep='\t', index=True)
    
    # Put a new column with descriptions of each function's ID
    !add_descriptions.py -i {ec_fpath} -m EC -o {ec_desc_fpath}
    !add_descriptions.py -i {ko_fpath} -m KO -o {ko_desc_fpath}
    !add_descriptions.py -i {pathway_fpath} -m METACYC -o {pathway_desc_fpath}