#### Adam Klie<br>04/08/2020
## Process downloaded data into necessary Qiime2 artifacts
 - __Input__: 
     - samples.biom table : BIOMV210Format
     - metadata.tsv : tab-separated table
 - __Output__
     - _Artifacts_
         - table.qza : FeatureTable[Frequency]
         - rep-seqs.qza : FeatureData[Sequence]
         - sepp-tree.qza : Phylogeny[Rooted]
         - filtered-table.qza : FeatureTable[Frequency]
         - bespoke_taxonomy.qza : FeatureData[Taxonomy]
     - _Visualizations_
         - table.qzv : Results
     - _Other_
         - rep_seqs.fna : DNAFasta

## Requirements
 - qiime2-2020.2 environment loaded as in README.md

In [1]:
import biom
import qiime2
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from qiime2.plugins import feature_table, fragment_insertion, feature_classifier

In [2]:
# Input location: the downloaded samples.biom and metadata.tsv are read from here
data_dir = "../data/test/2020_05_05"

# Output locations, one directory per kind of result.
# Paths are built later as '{}/name'.format(directory), so a trailing slash here
# only produces a harmless double slash in the final path.
table_dir = "../results/test/2020_05_06/feature_tables"
seq_dir = "../results/test/2020_05_06/rep_seqs/"
tree_dir = "../results/test/2020_05_06/fragment_insertion/"
tax_dir = "../results/test/2020_05_06/taxonomy/"

### Create FeatureTable[Frequency] artifact and load Metadata
 - Inputs: samples.biom, metadata.tsv
 - Outputs: FeatureTable[Frequency], Metadata objects
 - Plugins: feature-table
 - Other: Artifact API

In [3]:
# Import the BIOM v2.1.0 table as a FeatureTable[Frequency] artifact
biom_path = '{}/samples.biom'.format(data_dir)
table = qiime2.Artifact.import_data(
    type='FeatureTable[Frequency]',
    view=biom_path,
    view_type='BIOMV210Format',
)

# Load the sample metadata that accompanies the table
metadata_path = '{}/metadata.tsv'.format(data_dir)
metadata = qiime2.Metadata.load(metadata_path)

In [4]:
# Summarize the unfiltered table alongside the sample metadata
table_vis = feature_table.visualizers.summarize(
    table=table,
    sample_metadata=metadata,
)
# Uncomment to display the summary inline:
#table_vis.visualization

### Filter the feature table by sample frequency and metadata
 - Input: table.qza
 - Output: filtered_table.qza
 - Plugins: feature-table
 - Notes: 
     - Drop samples whose total feature frequency is below the threshold (a minimum-frequency filter; no subsampling to an even depth is performed here)
     - Include stool samples and healthy individuals
     - Keep first occurrence of host_subject_id (1 sample per subject)

In [5]:
# Minimum total frequency a sample must have to be kept
# (this value is passed as min_frequency when filtering samples)
threshold = 1_000

In [6]:
# Inclusion criterion, used as the `where` clause when filtering samples:
# feces samples whose subset_healthy flag is true (three capitalizations accepted)
criterion = (
    "[body_site]='UBERON:feces' "
    "AND [subset_healthy] IN ('true', 'True', 'TRUE')"
)

In [7]:
# Keep a single sample per subject: the first row seen for each host_subject_id.
# NOTE(review): this runs on the full metadata, before the body-site / health
# criterion is applied, so the retained sample for a subject may later be
# filtered out even if that subject has another qualifying sample - confirm intended.
subject_per_sample = metadata.get_column('host_subject_id').to_series()
ids_to_keep = subject_per_sample.drop_duplicates().index
dedup_metadata = metadata.filter_ids(ids_to_keep)

In [8]:
# Filter samples with the feature-table filter-samples method: keep those that
# satisfy `criterion` within the de-duplicated metadata and whose total
# frequency is at least `threshold`.
# The call returns a results object; the table artifact is `.filtered_table`.
filtered_table = feature_table.methods.filter_samples(
    table=table,
    metadata=dedup_metadata,
    where=criterion,
    min_frequency=threshold,
)

In [9]:
# Summarize the filtered table against the de-duplicated metadata
filtered_table_vis = feature_table.visualizers.summarize(
    table=filtered_table.filtered_table,
    sample_metadata=dedup_metadata,
)
# Uncomment to display the summary inline:
#filtered_table_vis.visualization

### Extract representative sequences
 - Inputs: filtered-table.qza -- FeatureTable[Frequency]
 - Outputs: rep_seqs.fna -- DNAFasta, rep-seqs.qza -- FeatureData[Sequence]
 - Plugins: Artifact, feature-table

In [10]:
# Generate Fasta file
# The column labels of the filtered table (its feature IDs) are written out as
# the sequences, each under a sequential header: seq1, seq2, ...
# NOTE(review): on import the headers become the FeatureData[Sequence] IDs, so
# they will not match the table's feature IDs (the sequences themselves).
# Confirm this is intended before filtering the table against the SEPP tree.
feature_ids = filtered_table.filtered_table.view(pd.DataFrame).columns
records = ['>seq{}\n{}'.format(i, seq) for i, seq in enumerate(feature_ids, start=1)]
with open('{}/rep_seqs.fna'.format(seq_dir), 'w') as f:
    # Records are newline-joined, so the file ends without a trailing newline
    f.write('\n'.join(records))

In [11]:
# Import as a FeatureData[Sequence] artifact
rep_seqs = qiime2.Artifact.import_data(type='FeatureData[Sequence]', view='{}/rep_seqs.fna'.format(seq_dir))
rep_seqs.save('{}/rep-seqs.qza'.format(seq_dir))

'../results/test/2020_05_06/rep_seqs//rep-seqs.qza'

In [12]:
# Visualize the sequences
tabulate_seqs_vis = feature_table.visualizers.tabulate_seqs(data=rep_seqs)
#tabulate_seqs_vis.visualization

```python
# Visualize using a heatmap
%%time
h_map = feature_table.vismetadataers.heatmap(table=table, sample_metadata=metadata.get_column('exercise_frequency'), normalize=True, cluster='features')
```

### Create a fragment insertion tree
 - Inputs: rep-seqs.qza -- FeatureData[Sequence], sepp-refs-gg-13.8.qza -- SeppReferenceDatabase
 - Outputs: sepp-tree.qza -- Phylogeny[Rooted], filtered-table.qza -- FeatureTable[Frequency]
 - Plugins: Artifact, fragment-insertion

In [13]:
# Load the SEPP reference database artifact used for fragment insertion
sepp_db_path = '../data/sepp/sepp-refs-gg-13.8.qza'
sepp_db = qiime2.Artifact.load(sepp_db_path)

In [None]:
# Insert the representative sequences into the reference phylogeny with SEPP,
# using two threads. `sepp_tree` holds the action's results object rather than
# a single artifact.
sepp_tree = fragment_insertion.methods.sepp(
    reference_database=sepp_db,
    representative_sequences=rep_seqs,
    threads=2,
)

In [None]:
# Restrict the filtered table to the features present in the insertion tree.
# `sepp_tree` is the results object returned by the sepp action, so the tree
# artifact itself has to be passed via its `.tree` output.
insertion_table = fragment_insertion.methods.filter_features(
    table=filtered_table.filtered_table,
    tree=sepp_tree.tree,
)

In [None]:
# Save the insertion tree and the table restricted to inserted features.
# Both variables hold results objects, so each artifact is saved through its
# named output; '{}' (not '()') is the placeholder that receives the directory.
sepp_tree.tree.save('{}/sepp-tree.qza'.format(tree_dir))
insertion_table.filtered_table.save('{}/filtered-table.qza'.format(table_dir))

### Create a taxonomy classifier and perform classification on sequences
 - Inputs: rep-seqs.qza -- FeatureData[Sequence], ref-seqs-v4.qza, ref-tax.qza, human-stool.qza (class weights)
 - Outputs: bespoke_taxonomy.qza -- FeatureData[Taxonomy]
 - Plugins: Artifact, feature-classifier

In [None]:
# Load the classifier inputs: reference reads, reference taxonomy, and the
# human-stool weights that are later passed as class_weight
taxonomy_inputs = '../data/taxonomy'
ref_seqs_v4 = qiime2.Artifact.load('{}/ref-seqs-v4.qza'.format(taxonomy_inputs))
ref_tax = qiime2.Artifact.load('{}/ref-tax.qza'.format(taxonomy_inputs))
human_stool_weights = qiime2.Artifact.load('{}/human-stool.qza'.format(taxonomy_inputs))

In [None]:
# Train a naive Bayes classifier on the reference reads and taxonomy,
# weighted by the human-stool class weights.
# The variable holds the action's results object, not the classifier artifact alone.
human_stool_v4_classifier = feature_classifier.methods.fit_classifier_naive_bayes(
    reference_reads=ref_seqs_v4,
    reference_taxonomy=ref_tax,
    class_weight=human_stool_weights,
)

In [None]:
# Classify the representative sequences with the trained classifier.
# `human_stool_v4_classifier` is the results object from the fit step, so the
# classifier artifact has to be passed via its `.classifier` output.
bespoke_taxonomy = feature_classifier.methods.classify_sklearn(
    reads=rep_seqs,
    classifier=human_stool_v4_classifier.classifier,
)

In [None]:
# Save the taxonomy assignments. `bespoke_taxonomy` is the results object from
# classify_sklearn, so the artifact is saved through its `.classification` output.
bespoke_taxonomy.classification.save('{}/bespoke_taxonomy.qza'.format(tax_dir))