biocore · cameronmartino · Apr 4, 2021 · Mar 18, 2021 · Mar 18, 2021 · Mar 18, 2021
diff --git a/gemelli/ctf.py b/gemelli/ctf.py
@@ -4,9 +4,12 @@
 import pandas as pd
 from pandas import concat
 from pandas import DataFrame
+from q2_types.tree import NewickFormat
 from skbio import OrdinationResults, DistanceMatrix, TreeNode
 from gemelli.factorization import TensorFactorization
-from gemelli.preprocessing import build, tensor_rclr, fast_unifrac
+from gemelli.preprocessing import (build, tensor_rclr,
+                                   fast_unifrac,
+                                   bp_read_phylogeny)
 from gemelli._defaults import (DEFAULT_COMP, DEFAULT_MSC,
                                DEFAULT_MFC, DEFAULT_BL,
                                DEFAULT_MTD,
@@ -15,7 +18,7 @@
 
 
 def phylogenetic_ctf(table: biom.Table,
-                     phylogeny: TreeNode,
+                     phylogeny: NewickFormat,
                      sample_metadata: DataFrame,
                      individual_id_column: str,
                      state_column: str,
@@ -61,7 +64,7 @@ def phylogenetic_ctf(table: biom.Table,
 
 
 def phylogenetic_ctf_helper(table: biom.Table,
-                            phylogeny: TreeNode,
+                            phylogeny: NewickFormat,
                             sample_metadata: DataFrame,
                             individual_id_column: str,
                             state_column: list,
@@ -79,6 +82,7 @@ def phylogenetic_ctf_helper(table: biom.Table,
                                 DistanceMatrix, DataFrame, DataFrame,
                                 TreeNode, biom.Table):
 
+    phylogeny = bp_read_phylogeny(table, phylogeny)
     # check the table for validity and then filter
     process_results = ctf_table_processing(table,
                                            sample_metadata,

diff --git a/gemelli/preprocessing.py b/gemelli/preprocessing.py
@@ -13,6 +13,36 @@
 from .base import _BaseConstruct
 from gemelli._defaults import DEFAULT_MTD
 from skbio.diversity._util import _vectorize_counts_and_tree
+from bp import parse_newick, to_skbio_treenode
+
+
+def bp_read_phylogeny(table, phylogeny):
+    """
+    Load a newick phylogeny with bp, shear it to the
+    table's observation ids, and return a TreeNode.
+
+    Parameters
+    ----------
+    table: biom.Table - its 'observation' ids are
+        the names passed to the tree's shear()
+
+    phylogeny: str - path to a newick file, or any
+        object whose str() gives that path; only
+        the first line of the file is parsed
+
+    Returns
+    -------
+    the sheared tree, converted with
+        bp.to_skbio_treenode
+
+    """
+
+    # parse just the first line of the newick file
+    with open(str(phylogeny)) as newick_file:
+        bp_tree = parse_newick(newick_file.readline())
+    # shear the tree with the table's observation ids
+    tip_names = set(table.ids('observation').flatten())
+    return to_skbio_treenode(bp_tree.shear(tip_names))
 
 
 def tensor_rclr(T, branch_lengths=None):

diff --git a/gemelli/rpca.py b/gemelli/rpca.py
@@ -13,16 +13,19 @@
 from typing import Union
 from skbio import TreeNode, OrdinationResults, DistanceMatrix
 from gemelli.matrix_completion import MatrixCompletion
-from gemelli.preprocessing import matrix_rclr, fast_unifrac
+from gemelli.preprocessing import (matrix_rclr,
+                                   fast_unifrac,
+                                   bp_read_phylogeny)
 from gemelli._defaults import (DEFAULT_COMP, DEFAULT_MTD,
                                DEFAULT_MSC, DEFAULT_MFC,
                                DEFAULT_OPTSPACE_ITERATIONS,
                                DEFAULT_MFF)
 from scipy.linalg import svd
+from q2_types.tree import NewickFormat
 
 
 def phylogenetic_rpca(table: biom.Table,
-                      phylogeny: TreeNode,
+                      phylogeny: NewickFormat,
                       n_components: Union[int, str] = DEFAULT_COMP,
                       min_sample_count: int = DEFAULT_MSC,
                       min_feature_count: int = DEFAULT_MFC,
@@ -39,6 +42,7 @@ def phylogenetic_rpca(table: biom.Table,
        gemelli.
     """
 
+    phylogeny = bp_read_phylogeny(table, phylogeny)
     # use helper to process table
     table = rpca_table_processing(table,
                                   min_sample_count,

diff --git a/gemelli/scripts/_standalone_ctf.py b/gemelli/scripts/_standalone_ctf.py
@@ -2,7 +2,6 @@
 import click
 from .__init__ import cli
 import pandas as pd
-from skbio import TreeNode
 from biom.util import biom_open
 from biom import load_table
 from gemelli.ctf import (ctf_helper, phylogenetic_ctf_helper)
@@ -117,8 +116,6 @@ def standalone_phylogenetic_ctf(in_biom: str,
                      if state is not None]
     # import table
     table = load_table(in_biom)
-    # import phylogeny
-    phylogeny = TreeNode.read(in_phylogeny, format='newick')
     # import sample metadata
     sample_metadata = pd.read_csv(sample_metadata_file,
                                   sep='\t', index_col=0,
@@ -132,7 +129,7 @@ def standalone_phylogenetic_ctf(in_biom: str,
         feature_metadata = None
     # run CTF
     res_ = phylogenetic_ctf_helper(table,
-                                   phylogeny,
+                                   in_phylogeny,
                                    sample_metadata,
                                    individual_id_column,
                                    state_columns,

diff --git a/gemelli/scripts/_standalone_rpca.py b/gemelli/scripts/_standalone_rpca.py
@@ -1,7 +1,6 @@
 import os
 import click
 from .__init__ import cli
-from skbio import TreeNode
 from biom import load_table
 from biom.util import biom_open
 from gemelli.rpca import rpca as _rpca
@@ -73,11 +72,9 @@ def standalone_phylogenetic_rpca(in_biom: str,
 
     # import table
     table = load_table(in_biom)
-    # import phylogeny
-    phylogeny = TreeNode.read(in_phylogeny, format='newick')
     # run the RPCA wrapper
     phylo_res_ = _phylo_rpca(table,
-                             phylogeny,
+                             in_phylogeny,
                              n_components,
                              min_sample_count,
                              min_feature_count,

diff --git a/gemelli/scripts/_standalone_transforms.py b/gemelli/scripts/_standalone_transforms.py
@@ -1,10 +1,10 @@
 import os
 import click
 from .__init__ import cli
-from skbio import TreeNode
 from biom import load_table
 from biom.util import biom_open
-from gemelli.preprocessing import (rclr_transformation,
+from gemelli.preprocessing import (bp_read_phylogeny,
+                                   rclr_transformation,
                                    phylogenetic_rclr_transformation)
 from gemelli._defaults import DESC_COUNTS, DESC_TREE
 
@@ -33,7 +33,7 @@ def standalone_phylogenetic_rclr(in_biom: str,
     # import table
     table = load_table(in_biom)
     # import phylogeny
-    phylogeny = TreeNode.read(in_phylogeny, format='newick')
+    phylogeny = bp_read_phylogeny(table, in_phylogeny)
     # run vectorized table and rclr transform
     res_ = phylogenetic_rclr_transformation(table, phylogeny)
     counts_by_node, rclr_table, phylogeny = res_

diff --git a/setup.py b/setup.py
@@ -108,7 +108,8 @@ def run(self):
           'scikit-learn >= 0.18.1',
           'scikit-bio > 0.5.3',
           'biom-format',
-          'h5py', ],
+          'h5py',
+          'iow'],
       classifiers=classifiers,
       entry_points={'qiime2.plugins': q2cmds,
                     'console_scripts': standalone},