Merge pull request #33 from mortonjt/ols

OLS
biocore · Aug 18, 2016 · eb871ea · eb871ea
2 parents ddc3d40 + 55cfcb2
commit eb871ea
Show file tree

Hide file tree

Showing 4 changed files with 384 additions and 2 deletions.
diff --git a/gneiss/__init__.py b/gneiss/__init__.py
@@ -8,5 +8,8 @@
 
 from __future__ import absolute_import, division, print_function
 
+from gneiss._formula import ols
+
+__all__ = ['ols']
 
 __version__ = "0.0.2"
diff --git a/gneiss/_formula.py b/gneiss/_formula.py
@@ -0,0 +1,227 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016--, gneiss development team.
+#
+# Distributed under the terms of the GPLv3 License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+import pandas as pd
+import statsmodels.formula.api as smf
+from skbio.stats.composition import ilr
+from gneiss.util import match, match_tips, rename_internal_nodes
+from gneiss._summary import RegressionResults
+from gneiss.balances import balance_basis
+
+
+def _intersect_of_table_metadata_tree(table, metadata, tree):
+    """ Matches tips, features and samples between the table, metadata
+    and tree.  This module returns the features and samples that are
+    contained in all 3 objects.
+
+    Parameters
+    ----------
+    table : pd.DataFrame
+        Contingency table where samples correspond to rows and
+        features correspond to columns.
+    metadata: pd.DataFrame
+        Metadata table that contains information about the samples contained
+        in the `table` object.  Samples correspond to rows and covariates
+        correspond to columns.
+    tree : skbio.TreeNode
+        Tree object where the leaves correspond to the columns contained in
+        the table.
+
+    Returns
+    -------
+    pd.DataFrame
+        Subset of `table` with common row names as `metadata`
+        and common columns as `tree.tips()`
+    pd.DataFrame
+        Subset of `metadata` with common row names as `table`
+    skbio.TreeNode
+        Subtree of `tree` with common tips as `table`
+    """
+    _table, _metadata = match(table, metadata)
+    _table, _tree = match_tips(_table, tree)
+    non_tips_no_name = [(n.name is None) for n in _tree.levelorder()
+                        if not n.is_tip()]
+    if len(non_tips_no_name) == 0:
+        raise ValueError('There are no internal nodes in `tree` after'
+                         'intersection with `table`.')
+
+    if len(_table.index) == 0:
+        raise ValueError('There are no internal nodes in `table` after '
+                         'intersection with `metadata`.')
+
+    if any(non_tips_no_name):
+        _tree = rename_internal_nodes(_tree)
+    return _table, _metadata, _tree
+
+
+def _to_balances(table, tree):
+    """ Converts a table of abundances to balances given a tree.
+
+    Parameters
+    ----------
+    table : pd.DataFrame
+        Contingency table where samples correspond to rows and
+        features correspond to columns.
+    tree : skbio.TreeNode
+        Tree object where the leaves correspond to the columns contained in
+        the table.
+
+    Returns
+    -------
+    pd.DataFrame
+        Contingency table where samples correspond to rows and
+        balances correspond to columns.
+    np.array
+        Orthonormal basis in the Aitchison simplex generated from `tree`.
+    """
+    non_tips = [n.name for n in tree.levelorder() if not n.is_tip()]
+    basis, _ = balance_basis(tree)
+
+    mat = ilr(table.values, basis=basis)
+    ilr_table = pd.DataFrame(mat,
+                             columns=non_tips,
+                             index=table.index)
+    return ilr_table, basis
+
+
+def ols(formula, table, metadata, tree, **kwargs):
+    """ Ordinary Least Squares applied to balances.
+
+    An ordinary least square regression is applied to each balance.
+
+    Parameters
+    ----------
+    formula : str
+        Formula representing the statistical equation to be evaluated.
+        These strings are similar to how equations are handled in R and
+        statsmodels. Note that the dependent variable in this string should
+        not be specified, since this method will be run on each of the
+        individual balances. See `patsy` for more details.
+    table : pd.DataFrame
+        Contingency table where samples correspond to rows and
+        features correspond to columns.
+    metadata: pd.DataFrame
+        Metadata table that contains information about the samples contained
+        in the `table` object.  Samples correspond to rows and covariates
+        correspond to columns.
+    tree : skbio.TreeNode
+        Tree object where the leaves correspond to the columns contained in
+        the table.
+    **kwargs : dict
+        Other arguments accepted into `statsmodels.regression.linear_model.OLS`
+
+    Returns
+    -------
+    RegressionResults
+        Container object that holds information about the overall fit.
+
+    Example
+    -------
+    >>> from gneiss import ols
+    >>> from skbio import TreeNode
+    >>> import pandas as pd
+
+    Here, we will define a table of proportions with 3 features
+    `a`, `b`, and `c` across 5 samples.
+
+    >>> proportions = pd.DataFrame(
+    ...     [[0.720463, 0.175157, 0.104380],
+    ...      [0.777794, 0.189095, 0.033111],
+    ...      [0.796416, 0.193622, 0.009962],
+    ...      [0.802058, 0.194994, 0.002948],
+    ...      [0.803731, 0.195401, 0.000868]],
+    ...     columns=['a', 'b', 'c'],
+    ...     index=['s1', 's2', 's3', 's4', 's5'])
+
+    Now we will define the environment variables that we want to
+    regress against the proportions.
+
+    >>> env_vars = pd.DataFrame({
+    ...     'temp': [20, 20, 20, 20, 21],
+    ...     'ph': [1, 2, 3, 4, 5]},
+    ...     index=['s1', 's2', 's3', 's4', 's5'])
+
+    Finally, we need to define a bifurcating tree used to convert the
+    proportions to balances.  If the internal nodes aren't labels,
+    a default labeling will be applied (i.e. `y1`, `y2`, ...)
+
+    >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
+
+    Once these 3 variables are defined, a regression can be performed.
+    These proportions will be converted to balances according to the
+    tree specified.  And the regression formula is specified to run
+    `temp` and `ph` against the proportions in a single model.
+
+    >>> res = ols('temp + ph', proportions, env_vars, tree)
+
+    From the summary results of the `ols` function, we can view the
+    pvalues according to how well each individual balance fitted in the
+    regression model.
+
+    >>> res.pvalues
+           Intercept            ph      temp
+    Y1  2.479592e-01  1.990984e-11  0.243161
+    Y2  6.089193e-10  5.052733e-01  0.279805
+
+    We can also view the balance coefficients estimated in the regression
+    model. These coefficients can also be viewed as proportions by passing
+    `project=True` as an argument in `res.coefficients()`.
+
+    >>> res.coefficients()
+        Intercept            ph      temp
+    Y1  -0.000499  9.999911e-01  0.000026
+    Y2   1.000035  2.865312e-07 -0.000002
+
+    The balance residuals from the model can be viewed as follows.  Again,
+    these residuals can be viewed as proportions by passing `project=True`
+    into `res.residuals()`
+
+    >>> res.residuals()
+                  Y1            Y2
+    s1 -4.121647e-06 -2.998793e-07
+    s2  6.226749e-07 -1.602904e-09
+    s3  1.111959e-05  9.028437e-07
+    s4 -7.620619e-06 -6.013615e-07
+    s5 -1.332268e-14 -2.375877e-14
+
+    The predicted balances can be obtained as follows.  Note that the predicted
+    proportions can also be obtained by passing `project=True` into
+    `res.predict()`
+
+    >>> res.predict()
+              Y1        Y2
+    s1  1.000009  0.999999
+    s2  2.000000  0.999999
+    s3  2.999991  0.999999
+    s4  3.999982  1.000000
+    s5  4.999999  0.999998
+
+    The overall model fit can be obtained as follows
+
+    >>> res.r2
+    0.99999999997996369
+
+    See Also
+    --------
+    statsmodels.regression.linear_model.OLS
+    """
+    table, metadata, tree = _intersect_of_table_metadata_tree(table,
+                                                              metadata,
+                                                              tree)
+    ilr_table, basis = _to_balances(table, tree)
+    data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)
+
+    fits = []
+    for b in ilr_table.columns:
+        # mixed effects code is obtained here:
+        # http://stackoverflow.com/a/22439820/1167475
+        stats_formula = '%s ~ %s' % (b, formula)
+
+        mdf = smf.ols(stats_formula, data=data).fit()
+        fits.append(mdf)
+    return RegressionResults(fits, basis=basis,
+                             feature_names=table.columns)
diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py
@@ -0,0 +1,154 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016--, gneiss development team.
+#
+# Distributed under the terms of the GPLv3 License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+import numpy as np
+import pandas as pd
+import pandas.util.testing as pdt
+import unittest
+from skbio.stats.composition import ilr_inv
+from skbio import TreeNode
+from gneiss._formula import ols
+
+
+class TestOLS(unittest.TestCase):
+
+    def setUp(self):
+        A = np.array  # aliasing for the sake of pep8
+        self.table = pd.DataFrame({
+            's1': ilr_inv(A([1., 1.])),
+            's2': ilr_inv(A([1., 2.])),
+            's3': ilr_inv(A([1., 3.])),
+            's4': ilr_inv(A([1., 4.])),
+            's5': ilr_inv(A([1., 5.]))},
+            index=['a', 'b', 'c']).T
+        self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
+        self.unannotated_tree = TreeNode.read(['(c, (b,a));'])
+        self.metadata = pd.DataFrame({
+            'lame': [1, 1, 1, 1, 1],
+            'real': [1, 2, 3, 4, 5]
+        }, index=['s1', 's2', 's3', 's4', 's5'])
+
+    def test_ols(self):
+        res = ols('real', self.table, self.metadata, self.tree)
+        res_coef = res.coefficients()
+        exp_coef = pd.DataFrame(
+            {'Intercept': [0, 1.00],
+             'real': [1.0, 0]},
+            index=['Y1', 'Y2'])
+
+        pdt.assert_frame_equal(res_coef, exp_coef,
+                               check_exact=False,
+                               check_less_precise=True)
+        # Double check to make sure the fit is perfect
+        self.assertAlmostEqual(res.r2, 1)
+
+        # Double check to make sure residuals are zero
+        exp_resid = pd.DataFrame([[0., 0.],
+                                  [0., 0.],
+                                  [0., 0.],
+                                  [0., 0.],
+                                  [0., 0.]],
+                                 index=['s1', 's2', 's3', 's4', 's5'],
+                                 columns=['Y1', 'Y2'])
+        pdt.assert_frame_equal(exp_resid, res.residuals())
+
+    def test_ols_rename(self):
+        res = ols('real', self.table, self.metadata,
+                  self.unannotated_tree)
+        res_coef = res.coefficients()
+        exp_coef = pd.DataFrame(
+            {'Intercept': [0, 1.00],
+             'real': [1.0, 0]},
+            index=['y0', 'y1'])
+
+        pdt.assert_frame_equal(res_coef, exp_coef,
+                               check_exact=False,
+                               check_less_precise=True)
+        # Double check to make sure the fit is perfect
+        self.assertAlmostEqual(res.r2, 1)
+
+        # Double check to make sure residuals are zero
+        exp_resid = pd.DataFrame([[0., 0.],
+                                  [0., 0.],
+                                  [0., 0.],
+                                  [0., 0.],
+                                  [0., 0.]],
+                                 index=['s1', 's2', 's3', 's4', 's5'],
+                                 columns=['y0', 'y1'])
+        pdt.assert_frame_equal(exp_resid, res.residuals())
+
+    def test_ols_immutable(self):
+        A = np.array  # aliasing for the sake of pep8
+        table = pd.DataFrame({
+            's1': ilr_inv(A([1., 1.])),
+            's2': ilr_inv(A([1., 2.])),
+            's3': ilr_inv(A([1., 3.])),
+            's4': ilr_inv(A([1., 4.])),
+            's5': ilr_inv(A([1., 5.])),
+            's6': ilr_inv(A([1., 5.]))},
+            index=['a', 'b', 'c']).T
+        exp_table = pd.DataFrame({
+            's1': ilr_inv(A([1., 1.])),
+            's2': ilr_inv(A([1., 2.])),
+            's3': ilr_inv(A([1., 3.])),
+            's4': ilr_inv(A([1., 4.])),
+            's5': ilr_inv(A([1., 5.])),
+            's6': ilr_inv(A([1., 5.]))},
+            index=['a', 'b', 'c']).T
+
+        tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;'])
+        exp_tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;'])
+        metadata = pd.DataFrame({
+            'lame': [1, 1, 1, 1, 1],
+            'real': [1, 2, 3, 4, 5]
+        }, index=['s1', 's2', 's3', 's4', 's5'])
+
+        ols('real + lame', table, metadata, tree)
+        self.assertEqual(str(table), str(exp_table))
+        self.assertEqual(str(exp_tree), str(tree))
+
+    def test_ols_empty_table(self):
+        A = np.array  # aliasing for the sake of pep8
+        table = pd.DataFrame({
+            's1': ilr_inv(A([1., 1.])),
+            's2': ilr_inv(A([1., 2.])),
+            's3': ilr_inv(A([1., 3.])),
+            's4': ilr_inv(A([1., 4.])),
+            's5': ilr_inv(A([1., 5.])),
+            's6': ilr_inv(A([1., 5.]))},
+            index=['x', 'y', 'z']).T
+
+        tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;'])
+        metadata = pd.DataFrame({
+            'lame': [1, 1, 1, 1, 1],
+            'real': [1, 2, 3, 4, 5]
+        }, index=['s1', 's2', 's3', 's4', 's5'])
+        with self.assertRaises(ValueError):
+            ols('real + lame', table, metadata, tree)
+
+    def test_ols_empty_metadata(self):
+        A = np.array  # aliasing for the sake of pep8
+        table = pd.DataFrame({
+            'k1': ilr_inv(A([1., 1.])),
+            'k2': ilr_inv(A([1., 2.])),
+            'k3': ilr_inv(A([1., 3.])),
+            'k4': ilr_inv(A([1., 4.])),
+            'k5': ilr_inv(A([1., 5.])),
+            'k6': ilr_inv(A([1., 5.]))},
+            index=['a', 'b', 'c']).T
+
+        tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;'])
+        metadata = pd.DataFrame({
+            'lame': [1, 1, 1, 1, 1],
+            'real': [1, 2, 3, 4, 5]
+        }, index=['s1', 's2', 's3', 's4', 's5'])
+        with self.assertRaises(ValueError):
+            ols('real + lame', table, metadata, tree)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python
-
 # ----------------------------------------------------------------------------
 # Copyright (c) 2016--, gneiss development team.
 #