# Set up notebook environment
## Note: This notebook should be run with a kernel for a conda environment with QIIME 2 and the DEICODE plugin installed (the first cell imports `qiime2.plugins.deicode`).

In [1]:
import pandas as pd
import qiime2 as q2
import numpy as np
import plotnine as pn
from biom import Table, load_table
from qiime2.plugins.deicode.actions import rpca
from qiime2.plugins.diversity.actions import beta_phylogenetic
from qiime2.plugins.diversity.actions import beta
from qiime2.plugins.diversity.actions import alpha
from qiime2.plugins.feature_table.actions import rarefy
from skbio import DistanceMatrix

# Short aliases for the two axis names; `s` is passed to bt.ids() when the data are loaded.
s, o = "sample", "observation"

%matplotlib inline


## Import data
### Note: Change the paths to the inputs below to those desired

In [2]:
# Feature tables: the full table and its rarefied counterpart, each kept both as
# a QIIME 2 artifact (qza, qza_rare) and as a biom Table view (bt, bt_rare)
qza, qza_rare = (
    q2.Artifact.load(table_path)
    for table_path in ("table.qza", "table_rarefied_12690.qza")
)
bt, bt_rare = qza.view(Table), qza_rare.view(Table)

# Sample metadata: tab-separated, first column becomes the index ("sample_name")
md = pd.read_csv("metadata_samples.txt", sep="\t", index_col=0).rename_axis("sample_name")

# Phylogeny artifact (used for the UniFrac distances)
tree_q2 = q2.Artifact.load("phylogeny.qza")

# Keep only the metadata rows whose sample name is present in the full feature
# table; the feature tables themselves are left unfiltered
bt_samples = set(bt.ids(s))
md = md.loc[md.index.isin(bt_samples)]


## Calculate distances
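### Note: Change the value of `rarefaction_depth` on line 1 below to the desired rarefaction depth (it should match the depth of the rarefied table loaded above). On line 15 this value is passed to `rpca` as `min_sample_count`, the minimum read count for samples for calculating RPCA distances; replace it there if a different minimum is desired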

In [3]:
rarefaction_depth = 12690

# Distance matrices (skbio DistanceMatrix objects), keyed by metric name
dms = dict()

# Jaccard, computed on the rarefied table
jaccard_results = beta(table=qza_rare, metric="jaccard")
dms["jaccard"] = jaccard_results.distance_matrix.view(DistanceMatrix)

# Weighted, then unweighted, UniFrac on the rarefied table and the phylogeny
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    unifrac_results = beta_phylogenetic(table=qza_rare, phylogeny=tree_q2, metric=unifrac_metric)
    dms[unifrac_metric] = unifrac_results.distance_matrix.view(DistanceMatrix)
# DEICODE RPCA on `qza` (not the rarefied table); min_sample_count follows rarefaction_depth
bplt, dm = rpca(table=qza, n_components=3, min_sample_count=rarefaction_depth, min_feature_frequency=10)
dms["deicode"] = dm.view(DistanceMatrix)




## Generate dataframes for plotting
### Note: Change the variables on line 1 below (currently `sample_type_3` and `extraction_kit_round`) to the desired sample type variable and the desired grouping variable. Then change the variables on lines 8 and 10-12 to match the new variables (the 'sample1_' and 'sample2_' prefixed names can be left as is)

In [4]:
md_variable = md[["sample_type_3", "extraction_kit_round"]]
out_dfs = dict()
for metric, dist_mat in dms.items():
    # Square distance matrix -> long format: one row per ordered (sample1, sample2) pair
    pairs = dist_mat.to_data_frame().reset_index().melt(id_vars="index")
    pairs.columns = ["sample1", "sample2", "value"]
    pairs = pairs.merge(md_variable, left_on="sample1", right_index=True)
    pairs = pairs.rename(columns={"sample_type_3": "sample1_type", "extraction_kit_round": "sample1_extraction_kit_round"})
    pairs = pairs.merge(md_variable, left_on="sample2", right_index=True)
    pairs = pairs.rename(columns={"sample_type_3": "sample2_type", "extraction_kit_round": "sample2_extraction_kit_round"})
    pairs = pairs.merge(md[["sample_type", "sample_type_2", "biomass_plate"]], left_on="sample1", right_index=True)
    out_dfs[metric] = pairs.query('sample1_type==sample2_type & sample1_extraction_kit_round==sample2_extraction_kit_round & sample1!=sample2')
    

## Export data frames
### Note: Change the output paths below to those desired

In [9]:
metric="jaccard"
df = out_dfs[metric]
# Every sample pair is present twice, as (a, b) and (b, a). Keep the first
# occurrence of each unordered pair. De-duplicating on "value" instead would
# also discard distinct pairs that happen to share the same distance.
pair_keys = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(dtype=str), axis=1))
df = df[~pair_keys.duplicated().to_numpy()]
df.to_csv("data_tech_reps_jaccard.txt", sep = '\t', index = False)


In [10]:
metric="deicode"
df = out_dfs[metric]
# Every sample pair is present twice, as (a, b) and (b, a). Keep the first
# occurrence of each unordered pair. De-duplicating on "value" instead would
# also discard distinct pairs that happen to share the same distance.
pair_keys = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(dtype=str), axis=1))
df = df[~pair_keys.duplicated().to_numpy()]
df.to_csv("data_tech_reps_rpca.txt", sep = '\t', index = False)


In [11]:
metric="unweighted_unifrac"
df = out_dfs[metric]
# Every sample pair is present twice, as (a, b) and (b, a). Keep the first
# occurrence of each unordered pair. De-duplicating on "value" instead would
# also discard distinct pairs that happen to share the same distance.
pair_keys = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(dtype=str), axis=1))
df = df[~pair_keys.duplicated().to_numpy()]
df.to_csv("data_tech_reps_unifrac.txt", sep = '\t', index = False)


In [12]:
metric="weighted_unifrac"
df = out_dfs[metric]
# Every sample pair is present twice, as (a, b) and (b, a). Keep the first
# occurrence of each unordered pair. De-duplicating on "value" instead would
# also discard distinct pairs that happen to share the same distance.
pair_keys = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(dtype=str), axis=1))
df = df[~pair_keys.duplicated().to_numpy()]
df.to_csv("data_tech_reps_weighted_unifrac.txt", sep = '\t', index = False)


# Proceed to plotting in R