In [15]:
import os
from os import path
import pandas as pd
import hashlib
import gzip
import tempfile
from biom import load_table
from sklearn.preprocessing import LabelEncoder
from picrust2.wrap_hsp import (castor_hsp_workflow,
                               castor_nsti)
from sklearn import metrics
from sklearn.model_selection import LeaveOneOut

import numpy as np
from skbio import TreeNode
from io import StringIO
from pathlib import Path

In [21]:
# --- BacDive trait table ---
# Read the binary trait matrix, keep only the first row for every repeated
# index label, and cast the labels to str.
# NOTE(review): the str cast is presumably so labels compare equal to tree
# tip names later on — confirm.
discdf = (
    pd.read_table('../data/bacdive-data/gg-mapped-tree-data/bacdive-binary-matched.tsv',
                  index_col=0, low_memory=False)
    .reset_index()
    .drop_duplicates(subset='index', keep='first')
    .set_index('index')
)
discdf.index = discdf.index.astype(str)

# Traits with more than 100 non-missing observations. One count() pass over
# the frame replaces a full dropna() copy of the frame per column.
keep_cols = [col_ for col_, n_obs in discdf.count().items() if n_obs > 100]

# Only the 'Spore' trait is used below; rows where it is missing are dropped.
discdf = discdf[['Spore']].dropna()

# Columns holding exactly one distinct value get their missing entries set to
# False. Rows with a missing 'Spore' were dropped just above, so for this
# one-column frame there is nothing left to fill.
sing_val_col = [col_ for col_ in discdf.columns
                if discdf[col_].nunique(dropna=True) == 1]
discdf[sing_val_col] = discdf[sing_val_col].fillna(value=False)

# --- feature table: its observation ids are the sequences to include ---
btst = load_table('../data/16S/data-subsets/pma-treatment-table/feature-table.biom')
seq_predict = btst.ids('observation')

# --- SEPP insertion tree ---
in_tree = '../data/16S/83714_insertion_tree.relabelled.tre'
tree = TreeNode.read(StringIO(Path(in_tree).read_text()))
discdf.shape


(2443, 1)

In [58]:
# Build the label-encoded 'Spore' trait table and the matching tree, then run
# castor hidden-state prediction on them.
sub_col = 'Spore'

# work on copies so the loaded table and tree stay untouched
discdf_subset = discdf.copy()
tree_subset = tree.copy()

# Tip names are gathered once and reused for both intersections. sorted()
# (rather than list(set(...))) gives the ids a stable order between runs.
tip_names = {node.name for node in tree_subset.tips()}
known_ids = set(discdf.index)
# tips that have a recorded trait value
tree_matched = sorted(tip_names & known_ids)
# tips that either have a recorded trait value or are sequences to predict
tree_predict = sorted(tip_names & (known_ids | set(seq_predict)))

# Subset the table to the matched tips, drop missing values and name the
# index 'assembly' (this becomes the header of the first column on disk).
discdf_subset = (
    discdf_subset
    .reindex(index=tree_matched)[[sub_col]]
    .dropna()
    .rename_axis('assembly')
)
# Encode the trait labels as integers; `enc` is kept for decoding later.
enc = LabelEncoder().fit(discdf_subset[sub_col])
discdf_subset[sub_col] = enc.transform(discdf_subset[sub_col]).astype(int)

# Shear the tree down to the tips with known traits plus the tips to predict.
tree_subset = tree_subset.shear(tree_predict)

# generate the temp. directory to store res
with tempfile.TemporaryDirectory() as temp_dir_name:
    # save tree to a tmp dir
    tree_tmp = os.path.join(temp_dir_name, 'phylogeny_desc.tree')
    tree_subset.write(tree_tmp)
    # save traits to a tmp dir
    traits_tmp = os.path.join(temp_dir_name, 'traits_desc.tsv')
    discdf_subset.to_csv(traits_tmp, sep='\t')
    # Run castor with a fixed seed so the run is repeatable.
    # Note: the following cell repeats this run with chunk_size=1000 and
    # rebinds predict_out / ci_out.
    hsp_method = 'emp_prob'
    predict_out, ci_out = castor_hsp_workflow(tree_path=tree_tmp,
                                              trait_table_path=traits_tmp,
                                              calc_ci=True,
                                              ran_seed=42,
                                              hsp_method=hsp_method)


In [78]:
# Run castor hidden-state prediction on the prepared tree and trait table.
# Both inputs are written to a throw-away directory that is removed when the
# `with` block exits.
hsp_method = 'emp_prob'
with tempfile.TemporaryDirectory() as temp_dir_name:
    # file locations inside the scratch directory
    tree_tmp = os.path.join(temp_dir_name, 'phylogeny_desc.tree')
    traits_tmp = os.path.join(temp_dir_name, 'traits_desc.tsv')

    # write the tree first, then the tab-separated traits
    tree_subset.write(tree_tmp)
    discdf_subset.to_csv(traits_tmp, sep='\t')

    # castor options gathered in one place
    castor_kwargs = dict(
        tree_path=tree_tmp,
        trait_table_path=traits_tmp,
        chunk_size=1000,
        calc_ci=True,
        ran_seed=42,
        hsp_method=hsp_method,
    )
    predict_out, ci_out = castor_hsp_workflow(**castor_kwargs)


In [79]:
# Decode the predicted integer labels back to the original trait values,
# attach the CI table row-aligned to the predictions, and save the result.
ci_out = ci_out.reindex(predict_out.index)
spore_decoded = enc.inverse_transform(predict_out.Spore)
predict_out = pd.concat(
    [predict_out.assign(Spore_decoded=spore_decoded), ci_out],
    axis=1,
)
predict_out.index.name = 'featureid'

predicted_tsv = '../data/bacdive-data/gg-sep-predicted.tsv'
predict_out.to_csv(predicted_tsv, sep='\t')
