# Set up notebook environment
## NOTE: Use a QIIME2 kernel

In [25]:
import os
import biom
import warnings
import pickle
import numpy as np
import pandas as pd
import qiime2 as q2
from biom import Table
from skbio import OrdinationResults
from skbio.stats import subsample_counts
from skbio.stats.distance import permanova, anosim, mantel
from skbio.stats.distance import DistanceMatrix
from qiime2.plugins.deicode.actions import rpca
from qiime2.plugins.feature_table.actions import rarefy
from qiime2.plugins.diversity.actions import beta_group_significance
from qiime2.plugins.emperor.actions import biplot, plot
from qiime2.plugins.diversity.actions import (beta,
                                              beta_phylogenetic,
                                              pcoa)
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition

from assets.step_wise_anova import run_stepwise_anova
from qiime2.plugins.fragment_insertion.actions import filter_features
warnings.filterwarnings("ignore", category=DeprecationWarning)

# helper functions
from assets.util import (mantel_matched, simulate_depth,
                        all_dists, nested_permanova)

# plotting
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): called without a category, this filter silences every warning
# category for the rest of the kernel session, which makes the earlier
# DeprecationWarning-only filter redundant. Consider narrowing it so genuine
# pandas/skbio/QIIME 2 warnings stay visible.
warnings.filterwarnings('ignore')

# Apply the 'ggplot' style sheet to matplotlib figures.
plt.style.use('ggplot')
%matplotlib inline


# Subset metadata to make paired files between extraction kits

In [26]:
# Name of the dataset folder; interpolated into the metadata path when the
# sample sheet is read.
SEQ_TYPE = 'shotgun'

In [27]:
# Read in sample metadata.
# sample_name is forced to str: the IDs look like decimals (e.g.
# '14332.363197890'), so dtype inference could turn them into floats, drop
# trailing zeros, and make the string-based exclusion below match nothing.
metadata_path = '{SEQ_TYPE}/metadata.tsv'.format(SEQ_TYPE=SEQ_TYPE)
md = pd.read_csv(metadata_path, sep='\t', dtype={'sample_name': str})

# Control wells that must not enter the paired comparison.
control_descriptions = ['BLANK', 'POSITIVE_CONTROL']

# Individual samples dropped from the analysis.
# NOTE(review): the reason for excluding these IDs is not recorded in this
# notebook -- document it here once confirmed.
excluded_samples = [
    '14332.363197859', '14332.361164111',
    '14332.363197857', '14332.363197849',
    '14332.363197875',
    '14332.363197831', '14332.363197893',
]

# Keep only human-skin samples that are neither controls nor excluded.
# A single combined mask selects the same rows as filtering step by step.
md = md[
    ~md['description'].isin(control_descriptions)
    & ~md['sample_name'].isin(excluded_samples)
    & (md['env_package'] == 'human-skin')
    & (md['env_feature'] == 'human-associated habitat')
]

# Write the filtered sheet next to its input so that changing SEQ_TYPE cannot
# overwrite another dataset's file.
md.to_csv('{SEQ_TYPE}/remove_blanks_md.tsv'.format(SEQ_TYPE=SEQ_TYPE),
          sep='\t', index=False)
md

Unnamed: 0,sample_name,bacteria,bacteria_function,biomass,buffer_removal,col,collection_timestamp,compressed_well,description,diet,...,sample_type2,scientific_name,sex,storage_solution,swab_type,taxon_id,title,vivarium,water,replicate_id
1,14332.361158597,not applicable,not applicable,low,dry,8.0,3/24/22,O8,skin_hand_right,not applicable,...,skin_hand_right,human skin metagenome,female,isopropanol,swube,539655.0,Matrix_pipeline_validation,not applicable,not applicable,D skin_hand_right
3,14332.361162759,not applicable,not applicable,low,dry,3.0,3/24/22,E3,skin_armpit_right,not applicable,...,skin_armpit_right,human skin metagenome,male,etoh,swube,539655.0,Matrix_pipeline_validation,not applicable,not applicable,B skin_armpit_right
4,14332.361162761,not applicable,not applicable,low,dry,5.0,3/24/22,G5,skin_armpit_right,not applicable,...,skin_armpit_right,human skin metagenome,female,etoh,swube,539655.0,Matrix_pipeline_validation,not applicable,not applicable,C skin_armpit_right
11,14332.361162773,not applicable,not applicable,low,dry,4.0,3/24/22,C4,skin_hand_right,not applicable,...,skin_hand_right,human skin metagenome,male,etoh,swube,539655.0,Matrix_pipeline_validation,not applicable,not applicable,B skin_hand_right
21,14332.361162788,not applicable,not applicable,low,dry,6.0,3/24/22,G6,skin_hand_right,not applicable,...,skin_hand_right,human skin metagenome,female,etoh,swube,539655.0,Matrix_pipeline_validation,not applicable,not applicable,C skin_hand_right
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,14332.363197876,not applicable,not applicable,low,dry,8.0,3/24/22,H8,skin_hand_left,not applicable,...,skin_hand_left,human skin metagenome,female,isopropanol,custom,539655.0,Matrix_pipeline_validation,not applicable,not applicable,D skin_hand_left
236,14332.363197877,not applicable,not applicable,low,dry,2.0,3/23/22,J2,skin_hand_left,not applicable,...,skin_hand_left,human skin metagenome,female,isopropanol,custom,539655.0,Matrix_pipeline_validation,not applicable,not applicable,A skin_hand_left
240,14332.363197882,not applicable,not applicable,low,dry,7.0,3/24/22,D7,skin_armpit_left,not applicable,...,skin_armpit_left,human skin metagenome,female,etoh,custom,539655.0,Matrix_pipeline_validation,not applicable,not applicable,D skin_armpit_left
243,14332.363197890,not applicable,not applicable,low,dry,8.0,3/24/22,D8,skin_hand_left,not applicable,...,skin_hand_left,human skin metagenome,female,etoh,custom,539655.0,Matrix_pipeline_validation,not applicable,not applicable,D skin_hand_left


# Stepwise RDA

## Shotgun data

In [28]:
# Import data: the filtered sample metadata plus the shotgun feature table
# and phylogenetic tree artifacts.
md_round1and2_bothPS_q2 = q2.Metadata.load('shotgun/remove_blanks_md.tsv')

table_shotgun, tree_shotgun = (
    q2.Artifact.load(qza_path)
    for qza_path in ('shotgun/tree-filtered-table.qza', 'shotgun/tree.qza')
)


In [29]:
# Filter table
table_shotgun_biom = table_shotgun.view(Table)
md_round1and2_bothPS_df = md_round1and2_bothPS_q2.to_dataframe()

# Samples present in both the feature table and the metadata. Sorted so the
# re-indexed metadata has the same row order on every kernel restart (plain
# set iteration order over strings is not stable between Python processes).
shared_ = sorted(set(table_shotgun_biom.ids()) & set(md_round1and2_bothPS_df.index))
md_round1and2_bothPS_df_shotgun = md_round1and2_bothPS_df.reindex(shared_)

# biom's Table.filter works in place by default and returns the same table,
# so table_shotgun_biom and table_shotgun_biom_bothPS refer to one object.
# inplace=True is spelled out to make that reliance visible.
table_shotgun_biom_bothPS = table_shotgun_biom.filter(shared_, inplace=True)

# Drop features that have no counts left after the sample filter.
keep_ = table_shotgun_biom_bothPS.ids('observation')[table_shotgun_biom_bothPS.sum('observation') > 0]
table_shotgun_biom_bothPS.filter(keep_, axis='observation', inplace=True)

# Import filtered table and re-indexed metadata file
table_shotgun_bothPS = q2.Artifact.import_data('FeatureTable[Frequency]', table_shotgun_biom_bothPS)
md_round1and2_bothPS_df_shotgun_q2 = q2.Metadata(md_round1and2_bothPS_df_shotgun)

# Generate distance matrices using the 'all_dists' util
# NOTE(review): 1000 is presumably the rarefaction depth used inside
# all_dists -- confirm against assets/util.py.
rare_depth_shotgun = 1000
dists_res_shotgun = all_dists(table_shotgun_bothPS,
                              rare_depth_shotgun, tree_shotgun)

# Generate ordinations (row=samples, cols=axes).
# The distance-based metrics go through PCoA; key order is kept because the
# downstream concatenation follows dict insertion order.
pcoa_res_shotgun = {
    metric_name: pcoa(dists_res_shotgun[metric_name].distance_matrix)
    .pcoa.view(OrdinationResults).samples
    for metric_name in ('Jaccard', 'Unweighted UniFrac', 'Weighted UniFrac')
}
# RPCA already provides an ordination (biplot), so no PCoA step is needed.
pcoa_res_shotgun['RPCA'] = dists_res_shotgun['RPCA'].biplot.view(OrdinationResults).samples


In [30]:
# Perform stepwise RDA-ANOVA
es_all = {}
# Covariates offered to the stepwise model.
# NOTE(review): these must survive the cardinality filter below; how
# run_stepwise_anova reacts to a missing column is not visible here -- confirm.
use_ = ['host_subject_id', 'storage_solution', 'extraction_protocol']

# Clean up meta (only stuff to run)
mf_ord = md_round1and2_bothPS_df_shotgun_q2.to_dataframe()

# Filter data: keep columns with more than one distinct value but fewer
# distinct values than half the number of samples.
max_levels = mf_ord.shape[0] // 2
keep_ = [v_ for v_ in mf_ord.columns
         if 1 < len(set(mf_ord[v_])) < max_levels]
mf_ord = mf_ord[keep_]
print(len(keep_))

# Run stepwise ANOVA for all RDA ordinations
for metric_, ord_ in pcoa_res_shotgun.items():
    # Get the first three axes by position, so the selection does not depend
    # on how the ordination columns happen to be labelled.
    ord_ = ord_.iloc[:, :3].copy()
    ord_.columns = ['PC1', 'PC2', 'PC3']
    # Subset/match samples. Sorted so the row order handed to the external
    # R script is the same on every run.
    shared_ids = sorted(set(ord_.index) & set(mf_ord.index))
    mf_ord_ = mf_ord.loc[shared_ids, :]
    ord_ = ord_.loc[shared_ids, :]
    es_all[metric_] = run_stepwise_anova(ord_, mf_ord_, use_)

# Concat output from all runs into one table indexed by (metric, term)
es_alldf = pd.concat(es_all).rename({'+ sample_type2': '+ sample_type'}, axis=0)
es_alldf


23
Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: /home/kalen/matrix-tube-analysis/matrix-2.0/present/assets/stepwise-rda.R /tmp/tmpcxqvxa8v/ord_.tsv /tmp/tmpcxqvxa8v/mf_.txt /tmp/tmpcxqvxa8v/output.effect.size.tsv

Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: /home/kalen/matrix-tube-analysis/matrix-2.0/present/assets/stepwise-rda.R /tmp/tmp4dk63md9/ord_.tsv /tmp/tmp4dk63md9/mf_.txt /tmp/tmp4dk63md9/output.effect.size.tsv

Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they

Unnamed: 0,Unnamed: 1,R2.adj,Df,AIC,F,Pr(>F)
Jaccard,+ host_subject_id,0.197749,3,68.000576,6.915822,0.0002
Jaccard,+ extraction_protocol,0.027091,1,66.427146,3.411491,0.020196
Unweighted UniFrac,+ host_subject_id,0.174812,3,70.058383,6.084295,0.0002
Unweighted UniFrac,+ extraction_protocol,0.040887,1,67.282892,4.597131,0.004199
Weighted UniFrac,+ host_subject_id,0.143698,3,72.760282,5.027496,0.0002
RPCA,+ host_subject_id,0.203455,3,67.479509,7.130121,0.0002


In [31]:
# Sanity check: dimensions of the RPCA sample ordination (rows=samples, cols=axes).
pcoa_res_shotgun['RPCA'].shape

(73, 3)