-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #33 from mortonjt/ols
OLS
- Loading branch information
Showing
4 changed files
with
384 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,227 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2016--, gneiss development team. | ||
# | ||
# Distributed under the terms of the GPLv3 License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
import pandas as pd | ||
import statsmodels.formula.api as smf | ||
from skbio.stats.composition import ilr | ||
from gneiss.util import match, match_tips, rename_internal_nodes | ||
from gneiss._summary import RegressionResults | ||
from gneiss.balances import balance_basis | ||
|
||
|
||
def _intersect_of_table_metadata_tree(table, metadata, tree): | ||
""" Matches tips, features and samples between the table, metadata | ||
and tree. This module returns the features and samples that are | ||
contained in all 3 objects. | ||
Parameters | ||
---------- | ||
table : pd.DataFrame | ||
Contingency table where samples correspond to rows and | ||
features correspond to columns. | ||
metadata: pd.DataFrame | ||
Metadata table that contains information about the samples contained | ||
in the `table` object. Samples correspond to rows and covariates | ||
correspond to columns. | ||
tree : skbio.TreeNode | ||
Tree object where the leaves correspond to the columns contained in | ||
the table. | ||
Returns | ||
------- | ||
pd.DataFrame | ||
Subset of `table` with common row names as `metadata` | ||
and common columns as `tree.tips()` | ||
pd.DataFrame | ||
Subset of `metadata` with common row names as `table` | ||
skbio.TreeNode | ||
Subtree of `tree` with common tips as `table` | ||
""" | ||
_table, _metadata = match(table, metadata) | ||
_table, _tree = match_tips(_table, tree) | ||
non_tips_no_name = [(n.name is None) for n in _tree.levelorder() | ||
if not n.is_tip()] | ||
if len(non_tips_no_name) == 0: | ||
raise ValueError('There are no internal nodes in `tree` after' | ||
'intersection with `table`.') | ||
|
||
if len(_table.index) == 0: | ||
raise ValueError('There are no internal nodes in `table` after ' | ||
'intersection with `metadata`.') | ||
|
||
if any(non_tips_no_name): | ||
_tree = rename_internal_nodes(_tree) | ||
return _table, _metadata, _tree | ||
|
||
|
||
def _to_balances(table, tree): | ||
""" Converts a table of abundances to balances given a tree. | ||
Parameters | ||
---------- | ||
table : pd.DataFrame | ||
Contingency table where samples correspond to rows and | ||
features correspond to columns. | ||
tree : skbio.TreeNode | ||
Tree object where the leaves correspond to the columns contained in | ||
the table. | ||
Returns | ||
------- | ||
pd.DataFrame | ||
Contingency table where samples correspond to rows and | ||
balances correspond to columns. | ||
np.array | ||
Orthonormal basis in the Aitchison simplex generated from `tree`. | ||
""" | ||
non_tips = [n.name for n in tree.levelorder() if not n.is_tip()] | ||
basis, _ = balance_basis(tree) | ||
|
||
mat = ilr(table.values, basis=basis) | ||
ilr_table = pd.DataFrame(mat, | ||
columns=non_tips, | ||
index=table.index) | ||
return ilr_table, basis | ||
|
||
|
||
def ols(formula, table, metadata, tree, **kwargs): | ||
""" Ordinary Least Squares applied to balances. | ||
An ordinary least square regression is applied to each balance. | ||
Parameters | ||
---------- | ||
formula : str | ||
Formula representing the statistical equation to be evaluated. | ||
These strings are similar to how equations are handled in R and | ||
statsmodels. Note that the dependent variable in this string should | ||
not be specified, since this method will be run on each of the | ||
individual balances. See `patsy` for more details. | ||
table : pd.DataFrame | ||
Contingency table where samples correspond to rows and | ||
features correspond to columns. | ||
metadata: pd.DataFrame | ||
Metadata table that contains information about the samples contained | ||
in the `table` object. Samples correspond to rows and covariates | ||
correspond to columns. | ||
tree : skbio.TreeNode | ||
Tree object where the leaves correspond to the columns contained in | ||
the table. | ||
**kwargs : dict | ||
Other arguments accepted into `statsmodels.regression.linear_model.OLS` | ||
Returns | ||
------- | ||
RegressionResults | ||
Container object that holds information about the overall fit. | ||
Example | ||
------- | ||
>>> from gneiss import ols | ||
>>> from skbio import TreeNode | ||
>>> import pandas as pd | ||
Here, we will define a table of proportions with 3 features | ||
`a`, `b`, and `c` across 5 samples. | ||
>>> proportions = pd.DataFrame( | ||
... [[0.720463, 0.175157, 0.104380], | ||
... [0.777794, 0.189095, 0.033111], | ||
... [0.796416, 0.193622, 0.009962], | ||
... [0.802058, 0.194994, 0.002948], | ||
... [0.803731, 0.195401, 0.000868]], | ||
... columns=['a', 'b', 'c'], | ||
... index=['s1', 's2', 's3', 's4', 's5']) | ||
Now we will define the environment variables that we want to | ||
regress against the proportions. | ||
>>> env_vars = pd.DataFrame({ | ||
... 'temp': [20, 20, 20, 20, 21], | ||
... 'ph': [1, 2, 3, 4, 5]}, | ||
... index=['s1', 's2', 's3', 's4', 's5']) | ||
Finally, we need to define a bifurcating tree used to convert the | ||
proportions to balances. If the internal nodes aren't labels, | ||
a default labeling will be applied (i.e. `y1`, `y2`, ...) | ||
>>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) | ||
Once these 3 variables are defined, a regression can be performed. | ||
These proportions will be converted to balances according to the | ||
tree specified. And the regression formula is specified to run | ||
`temp` and `ph` against the proportions in a single model. | ||
>>> res = ols('temp + ph', proportions, env_vars, tree) | ||
From the summary results of the `ols` function, we can view the | ||
pvalues according to how well each individual balance fitted in the | ||
regression model. | ||
>>> res.pvalues | ||
Intercept ph temp | ||
Y1 2.479592e-01 1.990984e-11 0.243161 | ||
Y2 6.089193e-10 5.052733e-01 0.279805 | ||
We can also view the balance coefficients estimated in the regression | ||
model. These coefficients can also be viewed as proportions by passing | ||
`project=True` as an argument in `res.coefficients()`. | ||
>>> res.coefficients() | ||
Intercept ph temp | ||
Y1 -0.000499 9.999911e-01 0.000026 | ||
Y2 1.000035 2.865312e-07 -0.000002 | ||
The balance residuals from the model can be viewed as follows. Again, | ||
these residuals can be viewed as proportions by passing `project=True` | ||
into `res.residuals()` | ||
>>> res.residuals() | ||
Y1 Y2 | ||
s1 -4.121647e-06 -2.998793e-07 | ||
s2 6.226749e-07 -1.602904e-09 | ||
s3 1.111959e-05 9.028437e-07 | ||
s4 -7.620619e-06 -6.013615e-07 | ||
s5 -1.332268e-14 -2.375877e-14 | ||
The predicted balances can be obtained as follows. Note that the predicted | ||
proportions can also be obtained by passing `project=True` into | ||
`res.predict()` | ||
>>> res.predict() | ||
Y1 Y2 | ||
s1 1.000009 0.999999 | ||
s2 2.000000 0.999999 | ||
s3 2.999991 0.999999 | ||
s4 3.999982 1.000000 | ||
s5 4.999999 0.999998 | ||
The overall model fit can be obtained as follows | ||
>>> res.r2 | ||
0.99999999997996369 | ||
See Also | ||
-------- | ||
statsmodels.regression.linear_model.OLS | ||
""" | ||
table, metadata, tree = _intersect_of_table_metadata_tree(table, | ||
metadata, | ||
tree) | ||
ilr_table, basis = _to_balances(table, tree) | ||
data = pd.merge(ilr_table, metadata, left_index=True, right_index=True) | ||
|
||
fits = [] | ||
for b in ilr_table.columns: | ||
# mixed effects code is obtained here: | ||
# http://stackoverflow.com/a/22439820/1167475 | ||
stats_formula = '%s ~ %s' % (b, formula) | ||
|
||
mdf = smf.ols(stats_formula, data=data).fit() | ||
fits.append(mdf) | ||
return RegressionResults(fits, basis=basis, | ||
feature_names=table.columns) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2016--, gneiss development team. | ||
# | ||
# Distributed under the terms of the GPLv3 License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
import numpy as np | ||
import pandas as pd | ||
import pandas.util.testing as pdt | ||
import unittest | ||
from skbio.stats.composition import ilr_inv | ||
from skbio import TreeNode | ||
from gneiss._formula import ols | ||
|
||
|
||
class TestOLS(unittest.TestCase): | ||
|
||
def setUp(self): | ||
A = np.array # aliasing for the sake of pep8 | ||
self.table = pd.DataFrame({ | ||
's1': ilr_inv(A([1., 1.])), | ||
's2': ilr_inv(A([1., 2.])), | ||
's3': ilr_inv(A([1., 3.])), | ||
's4': ilr_inv(A([1., 4.])), | ||
's5': ilr_inv(A([1., 5.]))}, | ||
index=['a', 'b', 'c']).T | ||
self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) | ||
self.unannotated_tree = TreeNode.read(['(c, (b,a));']) | ||
self.metadata = pd.DataFrame({ | ||
'lame': [1, 1, 1, 1, 1], | ||
'real': [1, 2, 3, 4, 5] | ||
}, index=['s1', 's2', 's3', 's4', 's5']) | ||
|
||
def test_ols(self): | ||
res = ols('real', self.table, self.metadata, self.tree) | ||
res_coef = res.coefficients() | ||
exp_coef = pd.DataFrame( | ||
{'Intercept': [0, 1.00], | ||
'real': [1.0, 0]}, | ||
index=['Y1', 'Y2']) | ||
|
||
pdt.assert_frame_equal(res_coef, exp_coef, | ||
check_exact=False, | ||
check_less_precise=True) | ||
# Double check to make sure the fit is perfect | ||
self.assertAlmostEqual(res.r2, 1) | ||
|
||
# Double check to make sure residuals are zero | ||
exp_resid = pd.DataFrame([[0., 0.], | ||
[0., 0.], | ||
[0., 0.], | ||
[0., 0.], | ||
[0., 0.]], | ||
index=['s1', 's2', 's3', 's4', 's5'], | ||
columns=['Y1', 'Y2']) | ||
pdt.assert_frame_equal(exp_resid, res.residuals()) | ||
|
||
def test_ols_rename(self): | ||
res = ols('real', self.table, self.metadata, | ||
self.unannotated_tree) | ||
res_coef = res.coefficients() | ||
exp_coef = pd.DataFrame( | ||
{'Intercept': [0, 1.00], | ||
'real': [1.0, 0]}, | ||
index=['y0', 'y1']) | ||
|
||
pdt.assert_frame_equal(res_coef, exp_coef, | ||
check_exact=False, | ||
check_less_precise=True) | ||
# Double check to make sure the fit is perfect | ||
self.assertAlmostEqual(res.r2, 1) | ||
|
||
# Double check to make sure residuals are zero | ||
exp_resid = pd.DataFrame([[0., 0.], | ||
[0., 0.], | ||
[0., 0.], | ||
[0., 0.], | ||
[0., 0.]], | ||
index=['s1', 's2', 's3', 's4', 's5'], | ||
columns=['y0', 'y1']) | ||
pdt.assert_frame_equal(exp_resid, res.residuals()) | ||
|
||
def test_ols_immutable(self): | ||
A = np.array # aliasing for the sake of pep8 | ||
table = pd.DataFrame({ | ||
's1': ilr_inv(A([1., 1.])), | ||
's2': ilr_inv(A([1., 2.])), | ||
's3': ilr_inv(A([1., 3.])), | ||
's4': ilr_inv(A([1., 4.])), | ||
's5': ilr_inv(A([1., 5.])), | ||
's6': ilr_inv(A([1., 5.]))}, | ||
index=['a', 'b', 'c']).T | ||
exp_table = pd.DataFrame({ | ||
's1': ilr_inv(A([1., 1.])), | ||
's2': ilr_inv(A([1., 2.])), | ||
's3': ilr_inv(A([1., 3.])), | ||
's4': ilr_inv(A([1., 4.])), | ||
's5': ilr_inv(A([1., 5.])), | ||
's6': ilr_inv(A([1., 5.]))}, | ||
index=['a', 'b', 'c']).T | ||
|
||
tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;']) | ||
exp_tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;']) | ||
metadata = pd.DataFrame({ | ||
'lame': [1, 1, 1, 1, 1], | ||
'real': [1, 2, 3, 4, 5] | ||
}, index=['s1', 's2', 's3', 's4', 's5']) | ||
|
||
ols('real + lame', table, metadata, tree) | ||
self.assertEqual(str(table), str(exp_table)) | ||
self.assertEqual(str(exp_tree), str(tree)) | ||
|
||
def test_ols_empty_table(self): | ||
A = np.array # aliasing for the sake of pep8 | ||
table = pd.DataFrame({ | ||
's1': ilr_inv(A([1., 1.])), | ||
's2': ilr_inv(A([1., 2.])), | ||
's3': ilr_inv(A([1., 3.])), | ||
's4': ilr_inv(A([1., 4.])), | ||
's5': ilr_inv(A([1., 5.])), | ||
's6': ilr_inv(A([1., 5.]))}, | ||
index=['x', 'y', 'z']).T | ||
|
||
tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;']) | ||
metadata = pd.DataFrame({ | ||
'lame': [1, 1, 1, 1, 1], | ||
'real': [1, 2, 3, 4, 5] | ||
}, index=['s1', 's2', 's3', 's4', 's5']) | ||
with self.assertRaises(ValueError): | ||
ols('real + lame', table, metadata, tree) | ||
|
||
def test_ols_empty_metadata(self): | ||
A = np.array # aliasing for the sake of pep8 | ||
table = pd.DataFrame({ | ||
'k1': ilr_inv(A([1., 1.])), | ||
'k2': ilr_inv(A([1., 2.])), | ||
'k3': ilr_inv(A([1., 3.])), | ||
'k4': ilr_inv(A([1., 4.])), | ||
'k5': ilr_inv(A([1., 5.])), | ||
'k6': ilr_inv(A([1., 5.]))}, | ||
index=['a', 'b', 'c']).T | ||
|
||
tree = TreeNode.read(['((c,d),(b,a)Y2)Y1;']) | ||
metadata = pd.DataFrame({ | ||
'lame': [1, 1, 1, 1, 1], | ||
'real': [1, 2, 3, 4, 5] | ||
}, index=['s1', 's2', 's3', 's4', 's5']) | ||
with self.assertRaises(ValueError): | ||
ols('real + lame', table, metadata, tree) | ||
|
||
|
||
if __name__ == '__main__': | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters