From d0093df53b3389426ba798d79954528284ba00c3 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Sat, 30 Jul 2016 18:09:54 -0700 Subject: [PATCH 01/11] Adding in formulas --- gneiss/_formula.py | 113 +++++++++++++++++++++++++++++++++++ gneiss/tests/test_formula.py | 64 ++++++++++++++++++++ 2 files changed, 177 insertions(+) create mode 100644 gneiss/_formula.py create mode 100644 gneiss/tests/test_formula.py diff --git a/gneiss/_formula.py b/gneiss/_formula.py new file mode 100644 index 0000000..22fa7e3 --- /dev/null +++ b/gneiss/_formula.py @@ -0,0 +1,113 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the GPLv3 License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- +import pandas as pd +import statsmodels.formula.api as smf +from skbio.stats.composition import ilr +from gneiss.util import match, match_tips +from gneiss._summary import RegressionResults +from gneiss.balances import balance_basis + + +def _process(table, metadata, tree): + """ Matches tips, features and samples between the table, metadata + and tree. This module returns the features and samples that are + contained in all 3 objects. + """ + _table, _metadata = match(table, metadata) + _table, _tree = match_tips(table, tree) + non_tips_no_name = [(n.name is None) for n in tree.levelorder() + if n.is_tip()] + + if any(non_tips_no_name): + _tree = rename_internal_nodes(_tree) + return _table, _metadata, _tree + + +def ols(formula, table, metadata, tree, **kwargs): + """ Ordinary Least Squares applied to balances. + + A ordinary least square regression is applied to each balance. + + Parameters + ---------- + formula : str + Formula representing the statistical equation to be evaluated. + These strings are similar to how equations are handled in R. + Note that the dependent variable in this string should not be + specified, since this method will be run on each of the individual + balances. See `patsy` for more details. + table : pd.DataFrame + Contingency table where samples correspond to rows and + features correspond to columns. + metadata: pd.DataFrame + Metadata table that contains information about the samples contained + in the `table` object. Samples correspond to rows and covariates + correspond to columns. + tree : skbio.TreeNode + Tree object where the leaves correspond to the columns contained in + the table. + **kwargs : dict + Other arguments accepted into `statsmodels.regression.linear_model.OLS` + + Returns + ------- + RegressionResults + Container object that holds information about the overall fit. + + See Also + -------- + statsmodels.regression.linear_model.OLS + """ + _table, _metadata, _tree = _process(table, metadata, tree) + basis, _ = balance_basis(_tree) + non_tips = [n.name for n in _tree.levelorder() if not n.is_tip()] + + mat = ilr(_table.values, basis=basis) + ilr_table = pd.DataFrame(mat, + columns=non_tips, + index=table.index) + + data = pd.merge(ilr_table, _metadata, left_index=True, right_index=True) + + fits = [] + for b in ilr_table.columns: + # mixed effects code is obtained here: + # http://stackoverflow.com/a/22439820/1167475 + stats_formula = '%s ~ %s' % (b, formula) + + mdf = smf.ols(stats_formula, data=data).fit() + fits.append(mdf) + return RegressionResults(fits, basis=basis, + feature_names=table.columns) + + +def glm(formula, table, metadata, tree, **kwargs): + """ Generalized Linear Models applied to balances. + + Parameters + ---------- + """ + pass + + +def mixedlm(formula, table, metadata, tree, **kwargs): + """ Linear Mixed Models applied to balances. + + Parameters + ---------- + """ + pass + + +def gee(formula, table, metadata, tree, **kwargs): + """ Generalized Estimating Equations applied to balances. + + Parameters + ---------- + """ + pass diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py new file mode 100644 index 0000000..d6816eb --- /dev/null +++ b/gneiss/tests/test_formula.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the GPLv3 License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- +import numpy as np +import pandas as pd +import pandas.util.testing as pdt +import statsmodels.formula.api as smf +import unittest +from gneiss._summary import RegressionResults +from skbio.stats.composition import _gram_schmidt_basis, ilr_inv +from skbio import TreeNode +from gneiss._formula import ols + + +class TestOLS(unittest.TestCase): + + def setUp(self): + A = np.array # aliasing for the sake of pep8 + self.table = pd.DataFrame({ + 's1': ilr_inv(A([1., 1.])), + 's2': ilr_inv(A([1., 2.])), + 's3': ilr_inv(A([1., 3.])), + 's4': ilr_inv(A([1., 4.])), + 's5': ilr_inv(A([1., 5.]))}, + index=['a', 'b', 'c']).T + self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) + self.metadata = pd.DataFrame({ + 'lame': [1, 1, 1, 1, 1], + 'real': [1, 2, 3, 4, 5] + }, index=['s1', 's2', 's3', 's4', 's5']) + + def test_ols(self): + res = ols('real', self.table, self.metadata, self.tree) + res_coef = res.coefficients() + exp_coef = pd.DataFrame( + {'Intercept': [0, 1.00], + 'real': [1.0, 0]}, + index=['Y1', 'Y2']) + + pdt.assert_frame_equal(res_coef, exp_coef, + check_exact=False, + check_less_precise=True) + # Double check to make sure the fit is perfect + self.assertAlmostEqual(res.r2, 1) + + # Double check to make sure residuals are zero + exp_resid = pd.DataFrame([[0., 0.], + [0., 0.], + [0., 0.], + [0., 0.], + [0., 0.]], + index=['s1', 's2', 's3', 's4', 's5'], + columns=['Y1', 'Y2']) + pdt.assert_frame_equal(exp_resid, res.residuals()) + + +if __name__ == '__main__': + unittest.main() From defa170b9fe76d247eb6854a7a01be1e24427ac8 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Sat, 30 Jul 2016 19:18:33 -0700 Subject: [PATCH 02/11] pep8 --- gneiss/_formula.py | 22 +++++++++++++--------- gneiss/_summary.py | 14 ++++++-------- gneiss/tests/test_formula.py | 4 +--- gneiss/tests/test_summary.py | 6 +++--- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 22fa7e3..c4a2dfc 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -8,7 +8,7 @@ import pandas as pd import statsmodels.formula.api as smf from skbio.stats.composition import ilr -from gneiss.util import match, match_tips +from gneiss.util import match, match_tips, rename_internal_nodes from gneiss._summary import RegressionResults from gneiss.balances import balance_basis @@ -28,6 +28,17 @@ def _process(table, metadata, tree): return _table, _metadata, _tree +def _transform(table, tree): + non_tips = [n.name for n in tree.levelorder() if not n.is_tip()] + basis, _ = balance_basis(tree) + + mat = ilr(table.values, basis=basis) + ilr_table = pd.DataFrame(mat, + columns=non_tips, + index=table.index) + return ilr_table, basis + + def ols(formula, table, metadata, tree, **kwargs): """ Ordinary Least Squares applied to balances. @@ -64,14 +75,7 @@ def ols(formula, table, metadata, tree, **kwargs): statsmodels.regression.linear_model.OLS """ _table, _metadata, _tree = _process(table, metadata, tree) - basis, _ = balance_basis(_tree) - non_tips = [n.name for n in _tree.levelorder() if not n.is_tip()] - - mat = ilr(_table.values, basis=basis) - ilr_table = pd.DataFrame(mat, - columns=non_tips, - index=table.index) - + ilr_table, basis = _transform(_table, _tree) data = pd.merge(ilr_table, _metadata, left_index=True, right_index=True) fits = [] diff --git a/gneiss/_summary.py b/gneiss/_summary.py index 89bfd3d..77ce511 100644 --- a/gneiss/_summary.py +++ b/gneiss/_summary.py @@ -163,11 +163,10 @@ def residuals(self, project=False): # https://github.com/biocore/scikit-bio/pull/1396 proj_resid = ilr_inv(resid.values.T, basis=self.basis, check=False).T - proj_resid = pd.DataFrame(proj_resid, index=self.feature_names, - columns=resid.columns) - return proj_resid + return pd.DataFrame(proj_resid, index=self.feature_names, + columns=resid.columns).T else: - return resid + return resid.T def predict(self, X=None, project=False, **kwargs): """ Performs a prediction based on model. @@ -210,9 +209,8 @@ def predict(self, X=None, project=False, **kwargs): # https://github.com/biocore/scikit-bio/pull/1396 proj_prediction = ilr_inv(prediction.values.T, basis=self.basis, check=False) - proj_prediction = pd.DataFrame(proj_prediction, - columns=self.feature_names, - index=prediction.columns) - return proj_prediction + return pd.DataFrame(proj_prediction, + columns=self.feature_names, + index=prediction.columns).T return prediction.T diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index d6816eb..8e9233a 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -10,10 +10,8 @@ import numpy as np import pandas as pd import pandas.util.testing as pdt -import statsmodels.formula.api as smf import unittest -from gneiss._summary import RegressionResults -from skbio.stats.composition import _gram_schmidt_basis, ilr_inv +from skbio.stats.composition import ilr_inv from skbio import TreeNode from gneiss._formula import ols diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py index ef9e15d..ed1d1c6 100644 --- a/gneiss/tests/test_summary.py +++ b/gneiss/tests/test_summary.py @@ -117,7 +117,7 @@ def test_regression_results_residuals_projection(self): 's5': ilr_inv(A([-1.065789, 1.184211])), 's6': ilr_inv(A([-1.144737, -0.394737])), 's7': ilr_inv(A([0.394737, 1.894737]))}, - index=['Z1', 'Z2', 'Z3']) + index=['Z1', 'Z2', 'Z3']).T feature_names = ['Z1', 'Z2', 'Z3'] basis = _gram_schmidt_basis(3) res = RegressionResults(self.results, basis=basis, @@ -134,7 +134,7 @@ def test_regression_results_residuals(self): 's5': [-1.065789, 1.184211], 's6': [-1.144737, -0.394737], 's7': [0.394737, 1.894737]}, - index=['Y1', 'Y2']) + index=['Y1', 'Y2']).T res = RegressionResults(self.results) pdt.assert_frame_equal(res.residuals(), exp_resid, check_exact=False, @@ -183,7 +183,7 @@ def test_regression_results_predict_projection(self): 's5': ilr_inv(A([3.065789, 3.815789])), 's6': ilr_inv(A([4.144737, 6.394737])), 's7': ilr_inv(A([3.605263, 5.105263]))}, - index=feature_names).T + index=feature_names) pdt.assert_frame_equal(res_predict, exp_predict) From 6ebc03810de553bd6da4383cd6d6fd83aec86a67 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 17 Aug 2016 18:38:09 -0700 Subject: [PATCH 03/11] Correcting dimensions --- gneiss/tests/test_summary.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py index eb077b0..5dcce3b 100644 --- a/gneiss/tests/test_summary.py +++ b/gneiss/tests/test_summary.py @@ -1,3 +1,4 @@ + #!/usr/bin/env python # ---------------------------------------------------------------------------- @@ -183,7 +184,7 @@ def test_regression_results_predict_projection(self): 's5': ilr_inv(A([3.065789, 3.815789])), 's6': ilr_inv(A([4.144737, 6.394737])), 's7': ilr_inv(A([3.605263, 5.105263]))}, - index=feature_names) + index=feature_names).T pdt.assert_frame_equal(res_predict, exp_predict) From e5b463da5683cac99b6e09ffbb8fa09d1de4ee57 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 17 Aug 2016 18:41:00 -0700 Subject: [PATCH 04/11] pep8 --- gneiss/tests/test_summary.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py index 5dcce3b..dd47148 100644 --- a/gneiss/tests/test_summary.py +++ b/gneiss/tests/test_summary.py @@ -1,5 +1,4 @@ - -#!/usr/bin/env python +# !/usr/bin/env python # ---------------------------------------------------------------------------- # Copyright (c) 2016--, gneiss development team. From 8d47419dc2f81951ff0f9855d78996068a800053 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 17 Aug 2016 18:46:30 -0700 Subject: [PATCH 05/11] COV: More complete coverage --- gneiss/_formula.py | 27 --------------------------- gneiss/tests/test_formula.py | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index c4a2dfc..17a3c12 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -88,30 +88,3 @@ def ols(formula, table, metadata, tree, **kwargs): fits.append(mdf) return RegressionResults(fits, basis=basis, feature_names=table.columns) - - -def glm(formula, table, metadata, tree, **kwargs): - """ Generalized Linear Models applied to balances. - - Parameters - ---------- - """ - pass - - -def mixedlm(formula, table, metadata, tree, **kwargs): - """ Linear Mixed Models applied to balances. - - Parameters - ---------- - """ - pass - - -def gee(formula, table, metadata, tree, **kwargs): - """ Generalized Estimating Equations applied to balances. - - Parameters - ---------- - """ - pass diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index 8e9233a..4159aaf 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -28,6 +28,7 @@ def setUp(self): 's5': ilr_inv(A([1., 5.]))}, index=['a', 'b', 'c']).T self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) + self.unannotated_tree = TreeNode.read(['(c, (b,a));']) self.metadata = pd.DataFrame({ 'lame': [1, 1, 1, 1, 1], 'real': [1, 2, 3, 4, 5] @@ -57,6 +58,31 @@ def test_ols(self): columns=['Y1', 'Y2']) pdt.assert_frame_equal(exp_resid, res.residuals()) + def test_ols_rename(self): + res = ols('real', self.table, self.metadata, + self.unannotated_tree) + res_coef = res.coefficients() + exp_coef = pd.DataFrame( + {'Intercept': [0, 1.00], + 'real': [1.0, 0]}, + index=['Y1', 'Y2']) + + pdt.assert_frame_equal(res_coef, exp_coef, + check_exact=False, + check_less_precise=True) + # Double check to make sure the fit is perfect + self.assertAlmostEqual(res.r2, 1) + + # Double check to make sure residuals are zero + exp_resid = pd.DataFrame([[0., 0.], + [0., 0.], + [0., 0.], + [0., 0.], + [0., 0.]], + index=['s1', 's2', 's3', 's4', 's5'], + columns=['Y1', 'Y2']) + pdt.assert_frame_equal(exp_resid, res.residuals()) + if __name__ == '__main__': unittest.main() From 67b937e89f9d836a7e6451786e1a99a56cadf80f Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 17 Aug 2016 18:49:50 -0700 Subject: [PATCH 06/11] TST: Correcting rename nodes scenario --- gneiss/_formula.py | 2 +- gneiss/tests/test_formula.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 17a3c12..4ce4013 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -21,7 +21,7 @@ def _process(table, metadata, tree): _table, _metadata = match(table, metadata) _table, _tree = match_tips(table, tree) non_tips_no_name = [(n.name is None) for n in tree.levelorder() - if n.is_tip()] + if not n.is_tip()] if any(non_tips_no_name): _tree = rename_internal_nodes(_tree) diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index 4159aaf..441be78 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -65,7 +65,7 @@ def test_ols_rename(self): exp_coef = pd.DataFrame( {'Intercept': [0, 1.00], 'real': [1.0, 0]}, - index=['Y1', 'Y2']) + index=['y0', 'y1']) pdt.assert_frame_equal(res_coef, exp_coef, check_exact=False, @@ -80,7 +80,7 @@ def test_ols_rename(self): [0., 0.], [0., 0.]], index=['s1', 's2', 's3', 's4', 's5'], - columns=['Y1', 'Y2']) + columns=['y0', 'y1']) pdt.assert_frame_equal(exp_resid, res.residuals()) From 77898f6f7ea4198d0d4aee133997b834a645833b Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 17 Aug 2016 22:35:44 -0700 Subject: [PATCH 07/11] DOC: Adding more documentation Addressing @wasade and @ElDeveloper comments. --- gneiss/__init__.py | 3 + gneiss/_formula.py | 106 ++++++++++++++++++++++++++++++++--- gneiss/tests/test_formula.py | 34 ++++++++++- 3 files changed, 132 insertions(+), 11 deletions(-) diff --git a/gneiss/__init__.py b/gneiss/__init__.py index a27c03f..f7feef1 100644 --- a/gneiss/__init__.py +++ b/gneiss/__init__.py @@ -8,5 +8,8 @@ from __future__ import absolute_import, division, print_function +from gneiss._formula import ols + +__all__ = ['ols'] __version__ = "0.0.2" diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 4ce4013..420226d 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -13,14 +13,14 @@ from gneiss.balances import balance_basis -def _process(table, metadata, tree): +def _intersect_of_table_metadata_tree(table, metadata, tree): """ Matches tips, features and samples between the table, metadata and tree. This module returns the features and samples that are contained in all 3 objects. """ _table, _metadata = match(table, metadata) - _table, _tree = match_tips(table, tree) - non_tips_no_name = [(n.name is None) for n in tree.levelorder() + _table, _tree = match_tips(_table, tree) + non_tips_no_name = [(n.name is None) for n in _tree.levelorder() if not n.is_tip()] if any(non_tips_no_name): @@ -42,16 +42,16 @@ def _transform(table, tree): def ols(formula, table, metadata, tree, **kwargs): """ Ordinary Least Squares applied to balances. - A ordinary least square regression is applied to each balance. + An ordinary least square regression is applied to each balance. Parameters ---------- formula : str Formula representing the statistical equation to be evaluated. - These strings are similar to how equations are handled in R. - Note that the dependent variable in this string should not be - specified, since this method will be run on each of the individual - balances. See `patsy` for more details. + These strings are similar to how equations are handled in R and + statsmodels. Note that the dependent variable in this string should + not be specified, since this method will be run on each of the + individual balances. See `patsy` for more details. table : pd.DataFrame Contingency table where samples correspond to rows and features correspond to columns. @@ -70,11 +70,99 @@ def ols(formula, table, metadata, tree, **kwargs): RegressionResults Container object that holds information about the overall fit. + Example + ------- + >>> from gneiss import ols + >>> from skbio import TreeNode + >>> import pandas as pd + + Here, we will define a table of proportions with 3 features + `a`, `b`, and `c` across 5 samples. + + >>> proportions = pd.DataFrame( + ... [[0.720463, 0.175157, 0.104380], + ... [0.777794, 0.189095, 0.033111], + ... [0.796416, 0.193622, 0.009962], + ... [0.802058, 0.194994, 0.002948], + ... [0.803731, 0.195401, 0.000868]], + ... columns=['a', 'b', 'c'], + ... index=['s1', 's2', 's3', 's4', 's5']) + + Now we will define the environment variables that we want to + regress against the proportions. + + >>> env_vars = pd.DataFrame({ + ... 'temp': [20, 20, 20, 20, 21], + ... 'ph': [1, 2, 3, 4, 5]}, + ... index=['s1', 's2', 's3', 's4', 's5']) + + Finally, we need to define a bifurcating tree used to convert the + proportions to balances. If the internal nodes aren't labels, + a default labeling will be applied (i.e. `y1`, `y2`, ...) + + >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) + + Once these 3 variables are defined, a regression can be performed. + These proportions will be converted to balances according to the + tree specified. And the regression formula is specified to run + `temp` and `ph` against the proportions in a single model. + + >>> res = ols('temp + ph', proportions, env_vars, tree) + + From the summary results of the `ols` function, we can view the + pvalues according to how well each individual balance fitted in the + regression model. + + >>> res.pvalues + Intercept ph temp + Y1 2.479592e-01 1.990984e-11 0.243161 + Y2 6.089193e-10 5.052733e-01 0.279805 + + We can also view the balance coefficients estimated in the regression + model. These coefficients can also be viewed as proportions by passing + `project=True` as an argument in `res.coefficients()`. + + >>> res.coefficients() + Intercept ph temp + Y1 -0.000499 9.999911e-01 0.000026 + Y2 1.000035 2.865312e-07 -0.000002 + + The balance residuals from the model can be viewed as follows. Again, + these residuals can be viewed as proportions by passing `project=True` + into `res.residuals()` + + >>> res.residuals() + Y1 Y2 + s1 -4.121647e-06 -2.998793e-07 + s2 6.226749e-07 -1.602904e-09 + s3 1.111959e-05 9.028437e-07 + s4 -7.620619e-06 -6.013615e-07 + s5 -1.332268e-14 -2.375877e-14 + + The predicted balances can be obtained as follows. Note that the predicted + proportions can also be obtained by passing `project=True` into + `res.predict()` + + >>> res.predict() + Y1 Y2 + s1 1.000009 0.999999 + s2 2.000000 0.999999 + s3 2.999991 0.999999 + s4 3.999982 1.000000 + s5 4.999999 0.999998 + + The overall model fit can be obtained as follows + + >>> res.r2 + 0.99999999997996369 + See Also -------- statsmodels.regression.linear_model.OLS """ - _table, _metadata, _tree = _process(table, metadata, tree) + _table, _metadata, _tree = _intersect_of_table_metadata_tree(table, + metadata, + tree) ilr_table, basis = _transform(_table, _tree) data = pd.merge(ilr_table, _metadata, left_index=True, right_index=True) diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index 441be78..419879c 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - # ---------------------------------------------------------------------------- # Copyright (c) 2016--, gneiss development team. # @@ -83,6 +81,38 @@ def test_ols_rename(self): columns=['y0', 'y1']) pdt.assert_frame_equal(exp_resid, res.residuals()) + def test_ols_immutable(self): + A = np.array # aliasing for the sake of pep8 + table = pd.DataFrame({ + 's1': ilr_inv(A([1., 1.])), + 's2': ilr_inv(A([1., 2.])), + 's3': ilr_inv(A([1., 3.])), + 's4': ilr_inv(A([1., 4.])), + 's5': ilr_inv(A([1., 5.])), + 's6': ilr_inv(A([1., 5.]))}, + index=['a', 'b', 'c']).T + exp_table = pd.DataFrame({ + 's1': ilr_inv(A([1., 1.])), + 's2': ilr_inv(A([1., 2.])), + 's3': ilr_inv(A([1., 3.])), + 's4': ilr_inv(A([1., 4.])), + 's5': ilr_inv(A([1., 5.])), + 's6': ilr_inv(A([1., 5.]))}, + index=['a', 'b', 'c']).T + + tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;']) + exp_tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;']) + metadata = pd.DataFrame({ + 'lame': [1, 1, 1, 1, 1], + 'real': [1, 2, 3, 4, 5] + }, index=['s1', 's2', 's3', 's4', 's5']) + + ols('real + lame', table, metadata, tree) + self.assertEqual(str(table), str(exp_table)) + self.assertEqual(str(exp_tree), str(tree)) + + def test_ols_empty(self): + pass if __name__ == '__main__': unittest.main() From ed81fc6b3ff58ba1207a22e6277cc2f3640c6ef4 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 17 Aug 2016 22:40:56 -0700 Subject: [PATCH 08/11] TST: Adding scenario where intersection yield empty tree, table, metadata --- gneiss/_formula.py | 7 +++++++ gneiss/tests/test_formula.py | 40 ++++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 420226d..ab63a6f 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -22,6 +22,13 @@ def _intersect_of_table_metadata_tree(table, metadata, tree): _table, _tree = match_tips(_table, tree) non_tips_no_name = [(n.name is None) for n in _tree.levelorder() if not n.is_tip()] + if len(non_tips_no_name) == 0: + raise ValueError('There are no internal nodes in `tree` after' + 'intersection with `table`.') + + if len(_table.index) == 0: + raise ValueError('There are no internal nodes in `table` after ' + 'intersection with `metadata`.') if any(non_tips_no_name): _tree = rename_internal_nodes(_tree) diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index 419879c..c45259f 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -111,8 +111,44 @@ def test_ols_immutable(self): self.assertEqual(str(table), str(exp_table)) self.assertEqual(str(exp_tree), str(tree)) - def test_ols_empty(self): - pass + def test_ols_empty_table(self): + A = np.array # aliasing for the sake of pep8 + table = pd.DataFrame({ + 's1': ilr_inv(A([1., 1.])), + 's2': ilr_inv(A([1., 2.])), + 's3': ilr_inv(A([1., 3.])), + 's4': ilr_inv(A([1., 4.])), + 's5': ilr_inv(A([1., 5.])), + 's6': ilr_inv(A([1., 5.]))}, + index=['x', 'y', 'z']).T + + tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;']) + metadata = pd.DataFrame({ + 'lame': [1, 1, 1, 1, 1], + 'real': [1, 2, 3, 4, 5] + }, index=['s1', 's2', 's3', 's4', 's5']) + with self.assertRaises(ValueError): + ols('real + lame', table, metadata, tree) + + def test_ols_empty_metadata(self): + A = np.array # aliasing for the sake of pep8 + table = pd.DataFrame({ + 'k1': ilr_inv(A([1., 1.])), + 'k2': ilr_inv(A([1., 2.])), + 'k3': ilr_inv(A([1., 3.])), + 'k4': ilr_inv(A([1., 4.])), + 'k5': ilr_inv(A([1., 5.])), + 'k6': ilr_inv(A([1., 5.]))}, + index=['a', 'b', 'c']).T + + tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;']) + metadata = pd.DataFrame({ + 'lame': [1, 1, 1, 1, 1], + 'real': [1, 2, 3, 4, 5] + }, index=['s1', 's2', 's3', 's4', 's5']) + with self.assertRaises(ValueError): + ols('real + lame', table, metadata, tree) + if __name__ == '__main__': unittest.main() From c228a46d0dc89da12653304f9e33cb60b609133d Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 17 Aug 2016 22:45:31 -0700 Subject: [PATCH 09/11] Updating docstring --- gneiss/_formula.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index ab63a6f..0b0abb3 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -35,7 +35,25 @@ def _intersect_of_table_metadata_tree(table, metadata, tree): return _table, _metadata, _tree -def _transform(table, tree): +def _to_balances(table, tree): + """ Converts a table of abundances to balances given a tree. + Parameters + ---------- + table : pd.DataFrame + Contingency table where samples correspond to rows and + features correspond to columns. + tree : skbio.TreeNode + Tree object where the leaves correspond to the columns contained in + the table. + + Returns + ------- + pd.DataFrame : + Contingency table where samples correspond to rows and + balances correspond to columns. + np.array : + Orthonormal basis in the Aitchison simplex generated from `tree`. + """ non_tips = [n.name for n in tree.levelorder() if not n.is_tip()] basis, _ = balance_basis(tree) @@ -170,7 +188,7 @@ def ols(formula, table, metadata, tree, **kwargs): _table, _metadata, _tree = _intersect_of_table_metadata_tree(table, metadata, tree) - ilr_table, basis = _transform(_table, _tree) + ilr_table, basis = _to_balances(_table, _tree) data = pd.merge(ilr_table, _metadata, left_index=True, right_index=True) fits = [] From 1e7025e798c891d4705dc37289e8d8682cc84ab3 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 17 Aug 2016 23:45:27 -0700 Subject: [PATCH 10/11] Remove shebang --- gneiss/tests/test_summary.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py index dd47148..ccecb3a 100644 --- a/gneiss/tests/test_summary.py +++ b/gneiss/tests/test_summary.py @@ -1,5 +1,3 @@ -# !/usr/bin/env python - # ---------------------------------------------------------------------------- # Copyright (c) 2016--, gneiss development team. # From ba760d1438818541fc46a53d060f2abdfd0482db Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 18 Aug 2016 09:43:47 -0700 Subject: [PATCH 11/11] Addressing @antgonza's comments --- gneiss/_formula.py | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 0b0abb3..8877ec1 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -17,6 +17,29 @@ def _intersect_of_table_metadata_tree(table, metadata, tree): """ Matches tips, features and samples between the table, metadata and tree. This module returns the features and samples that are contained in all 3 objects. + + Parameters + ---------- + table : pd.DataFrame + Contingency table where samples correspond to rows and + features correspond to columns. + metadata: pd.DataFrame + Metadata table that contains information about the samples contained + in the `table` object. Samples correspond to rows and covariates + correspond to columns. + tree : skbio.TreeNode + Tree object where the leaves correspond to the columns contained in + the table. + + Returns + ------- + pd.DataFrame + Subset of `table` with common row names as `metadata` + and common columns as `tree.tips()` + pd.DataFrame + Subset of `metadata` with common row names as `table` + skbio.TreeNode + Subtree of `tree` with common tips as `table` """ _table, _metadata = match(table, metadata) _table, _tree = match_tips(_table, tree) @@ -37,6 +60,7 @@ def _intersect_of_table_metadata_tree(table, metadata, tree): def _to_balances(table, tree): """ Converts a table of abundances to balances given a tree. + Parameters ---------- table : pd.DataFrame @@ -48,10 +72,10 @@ def _to_balances(table, tree): Returns ------- - pd.DataFrame : + pd.DataFrame Contingency table where samples correspond to rows and balances correspond to columns. - np.array : + np.array Orthonormal basis in the Aitchison simplex generated from `tree`. """ non_tips = [n.name for n in tree.levelorder() if not n.is_tip()] @@ -185,11 +209,11 @@ def ols(formula, table, metadata, tree, **kwargs): -------- statsmodels.regression.linear_model.OLS """ - _table, _metadata, _tree = _intersect_of_table_metadata_tree(table, - metadata, - tree) - ilr_table, basis = _to_balances(_table, _tree) - data = pd.merge(ilr_table, _metadata, left_index=True, right_index=True) + table, metadata, tree = _intersect_of_table_metadata_tree(table, + metadata, + tree) + ilr_table, basis = _to_balances(table, tree) + data = pd.merge(ilr_table, metadata, left_index=True, right_index=True) fits = [] for b in ilr_table.columns: