Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Loads tree in NewickFormat, uses BP to parse tree faster #42

Merged
merged 9 commits into from
Apr 4, 2021
Merged
10 changes: 7 additions & 3 deletions gemelli/ctf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
import pandas as pd
from pandas import concat
from pandas import DataFrame
from q2_types.tree import NewickFormat
from skbio import OrdinationResults, DistanceMatrix, TreeNode
from gemelli.factorization import TensorFactorization
from gemelli.preprocessing import build, tensor_rclr, fast_unifrac
from gemelli.preprocessing import (build, tensor_rclr,
fast_unifrac,
bp_read_phylogeny)
from gemelli._defaults import (DEFAULT_COMP, DEFAULT_MSC,
DEFAULT_MFC, DEFAULT_BL,
DEFAULT_MTD,
Expand All @@ -15,7 +18,7 @@


def phylogenetic_ctf(table: biom.Table,
phylogeny: TreeNode,
phylogeny: NewickFormat,
sample_metadata: DataFrame,
individual_id_column: str,
state_column: str,
Expand Down Expand Up @@ -61,7 +64,7 @@ def phylogenetic_ctf(table: biom.Table,


def phylogenetic_ctf_helper(table: biom.Table,
phylogeny: TreeNode,
phylogeny: NewickFormat,
sample_metadata: DataFrame,
individual_id_column: str,
state_column: list,
Expand All @@ -79,6 +82,7 @@ def phylogenetic_ctf_helper(table: biom.Table,
DistanceMatrix, DataFrame, DataFrame,
TreeNode, biom.Table):

phylogeny = bp_read_phylogeny(table, phylogeny)
# check the table for validity and then filter
process_results = ctf_table_processing(table,
sample_metadata,
Expand Down
30 changes: 30 additions & 0 deletions gemelli/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,36 @@
from .base import _BaseConstruct
from gemelli._defaults import DEFAULT_MTD
from skbio.diversity._util import _vectorize_counts_and_tree
from bp import parse_newick, to_skbio_treenode


def bp_read_phylogeny(table, phylogeny):
    """
    Read a newick phylogeny with bp, shear it to the table's
    features, and return it as a scikit-bio TreeNode.

    Parameters
    ----------
    table: biom.Table - a table of shape (M,N)
        N = Features (i.e. OTUs, metabolites)
        M = Samples

    phylogeny: str - path to file/data
        in newick format (anything whose ``str()``
        is a path that ``open`` accepts)

    Returns
    -------
    The result of ``to_skbio_treenode`` applied to the
    sheared bp tree.

    Examples
    --------
    TODO

    """

    # Only the first line of the file is handed to the parser.
    # NOTE(review): a newick string spread over several lines would be
    # truncated here -- confirm that single-line input is guaranteed.
    with open(str(phylogeny)) as newick_handle:
        newick_text = newick_handle.readline()

    # parse with bp instead of scikit-bio's own newick reader
    bp_tree = parse_newick(newick_text)
    # the names passed to shear are the table's observation ids
    # (presumably the tips to retain -- see the bp documentation)
    observation_ids = set(table.ids('observation').flatten())
    sheared_tree = bp_tree.shear(observation_ids)

    return to_skbio_treenode(sheared_tree)


def tensor_rclr(T, branch_lengths=None):
Expand Down
8 changes: 6 additions & 2 deletions gemelli/rpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,19 @@
from typing import Union
from skbio import TreeNode, OrdinationResults, DistanceMatrix
from gemelli.matrix_completion import MatrixCompletion
from gemelli.preprocessing import matrix_rclr, fast_unifrac
from gemelli.preprocessing import (matrix_rclr,
fast_unifrac,
bp_read_phylogeny)
from gemelli._defaults import (DEFAULT_COMP, DEFAULT_MTD,
DEFAULT_MSC, DEFAULT_MFC,
DEFAULT_OPTSPACE_ITERATIONS,
DEFAULT_MFF)
from scipy.linalg import svd
from q2_types.tree import NewickFormat


def phylogenetic_rpca(table: biom.Table,
phylogeny: TreeNode,
phylogeny: NewickFormat,
n_components: Union[int, str] = DEFAULT_COMP,
min_sample_count: int = DEFAULT_MSC,
min_feature_count: int = DEFAULT_MFC,
Expand All @@ -39,6 +42,7 @@ def phylogenetic_rpca(table: biom.Table,
gemelli.
"""

phylogeny = bp_read_phylogeny(table, phylogeny)
# use helper to process table
table = rpca_table_processing(table,
min_sample_count,
Expand Down
5 changes: 1 addition & 4 deletions gemelli/scripts/_standalone_ctf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import click
from .__init__ import cli
import pandas as pd
from skbio import TreeNode
from biom.util import biom_open
from biom import load_table
from gemelli.ctf import (ctf_helper, phylogenetic_ctf_helper)
Expand Down Expand Up @@ -117,8 +116,6 @@ def standalone_phylogenetic_ctf(in_biom: str,
if state is not None]
# import table
table = load_table(in_biom)
# import phylogeny
phylogeny = TreeNode.read(in_phylogeny, format='newick')
# import sample metadata
sample_metadata = pd.read_csv(sample_metadata_file,
sep='\t', index_col=0,
Expand All @@ -132,7 +129,7 @@ def standalone_phylogenetic_ctf(in_biom: str,
feature_metadata = None
# run CTF
res_ = phylogenetic_ctf_helper(table,
phylogeny,
in_phylogeny,
sample_metadata,
individual_id_column,
state_columns,
Expand Down
5 changes: 1 addition & 4 deletions gemelli/scripts/_standalone_rpca.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os
import click
from .__init__ import cli
from skbio import TreeNode
from biom import load_table
from biom.util import biom_open
from gemelli.rpca import rpca as _rpca
Expand Down Expand Up @@ -73,11 +72,9 @@ def standalone_phylogenetic_rpca(in_biom: str,

# import table
table = load_table(in_biom)
# import phylogeny
phylogeny = TreeNode.read(in_phylogeny, format='newick')
# run the RPCA wrapper
phylo_res_ = _phylo_rpca(table,
phylogeny,
in_phylogeny,
n_components,
min_sample_count,
min_feature_count,
Expand Down
6 changes: 3 additions & 3 deletions gemelli/scripts/_standalone_transforms.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import os
import click
from .__init__ import cli
from skbio import TreeNode
from biom import load_table
from biom.util import biom_open
from gemelli.preprocessing import (rclr_transformation,
from gemelli.preprocessing import (bp_read_phylogeny,
rclr_transformation,
phylogenetic_rclr_transformation)
from gemelli._defaults import DESC_COUNTS, DESC_TREE

Expand Down Expand Up @@ -33,7 +33,7 @@ def standalone_phylogenetic_rclr(in_biom: str,
# import table
table = load_table(in_biom)
# import phylogeny
phylogeny = TreeNode.read(in_phylogeny, format='newick')
phylogeny = bp_read_phylogeny(table, in_phylogeny)
# run vectorized table and rclr transform
res_ = phylogenetic_rclr_transformation(table, phylogeny)
counts_by_node, rclr_table, phylogeny = res_
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ def run(self):
'scikit-learn >= 0.18.1',
'scikit-bio > 0.5.3',
'biom-format',
'h5py', ],
'h5py',
'iow'],
classifiers=classifiers,
entry_points={'qiime2.plugins': q2cmds,
'console_scripts': standalone},
Expand Down