Skip to content

Commit

Permalink
fastuni code add
Browse files Browse the repository at this point in the history
  • Loading branch information
cameronmartino committed Jun 3, 2020
1 parent 7d685fd commit d9a09c0
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 7 deletions.
2 changes: 1 addition & 1 deletion deicode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
#
# ----------------------------------------------------------------------------

__version__ = "0.2.4"
__version__ = "0.2.5"
6 changes: 4 additions & 2 deletions deicode/matrix_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class MatrixCompletion(_BaseImpute):

def __init__(self, n_components=2, max_iterations=5, tol=1e-5):
def __init__(self, n_components=2, max_iterations=5, branch_lengths=1, tol=1e-5):
"""
This form of matrix completion uses OptSpace (1). Furthermore,
Expand Down Expand Up @@ -80,6 +80,7 @@ def __init__(self, n_components=2, max_iterations=5, tol=1e-5):

self.n_components = n_components
self.max_iterations = max_iterations
self.branch_lengths = branch_lengths
self.tol = tol

return
Expand Down Expand Up @@ -130,7 +131,8 @@ def _fit(self):
# return solved matrix
self.U, self.s, self.V = OptSpace(n_components=self.n_components,
max_iterations=self.max_iterations,
tol=self.tol).solve(X_sparse)
tol=self.tol,
branch_lengths = self.branch_lengths).solve(X_sparse)
# save the solution (of the imputation)
self.solution = self.U.dot(self.s).dot(self.V.T)
self.eigenvalues = np.diag(self.s)
Expand Down
6 changes: 4 additions & 2 deletions deicode/optspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(
n_components,
max_iterations,
tol,
branch_lengths = 1,
step_size=10000,
resolution_limit=20,
sign=-1):
Expand Down Expand Up @@ -93,6 +94,7 @@ def __init__(
self.n_components = n_components
self.max_iterations = max_iterations
self.tol = tol
self.branch_lengths = branch_lengths
self.step_size = step_size
self.resolution_limit = resolution_limit
self.sign = sign
Expand Down Expand Up @@ -155,7 +157,7 @@ def solve(self, obs):
# initialize the distortion matrix of line above
dist = np.zeros(self.max_iterations + 1)
# starting initialization of the distortion between obs and imputed
dist[0] = norm(np.multiply(obs_error, mask), 'fro') / \
dist[0] = norm(np.multiply(obs_error, mask) + np.log(self.branch_lengths + 1.0), 'fro') / \
np.sqrt(total_nonzeros)
# we will perform gradient decent for at most self.max_iterations
for i in range(1, self.max_iterations):
Expand Down Expand Up @@ -183,7 +185,7 @@ def solve(self, obs):
# Compute the distortion
obs_error = obs - U.dot(S).dot(V.T)
# update the new distortion
dist[i + 1] = norm(np.multiply(obs_error, mask),
dist[i + 1] = norm(np.multiply(obs_error, mask) + np.log(self.branch_lengths + 1.0),
'fro') / np.sqrt(total_nonzeros)
# if the gradient decent has coverged then break the loop
# and return the results
Expand Down
97 changes: 97 additions & 0 deletions deicode/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import numpy as np
from skbio.diversity._phylogenetic import _nodes_by_counts
from skbio.diversity._util import _vectorize_counts_and_tree
from skbio.stats.composition import closure
# need to ignore log of zero warning
np.seterr(all='ignore')
Expand Down Expand Up @@ -85,3 +87,98 @@ def rclr(mat):
# mask the missing with nan
lmat[~np.isfinite(mat)] = np.nan
return lmat


def phylo_rclr(mat, branch_lengths):
"""
The rclr procedure first log transform
the nonzero values before centering the data
we refer to this preprocessing procedure as
the robust center log-ratio (rclr) (1) due to its
ties to the clr (2) transform commonly used in
compositional data analysis.
Parameters
----------
mat : array_like, float
a matrix of counts where
rows = components and
columns = samples
Returns
-------
numpy.ndarray
rclr transformed matrix
Raises
------
ValueError
Raises an error if values in array are negative
ValueError
Data-mat contains either np.inf or -np.inf
ValueError
Data-mat contains nans
References
----------
.. [1] Martino, Cameron, James T. Morton, Clarisse A. Marotz,
Luke R. Thompson, Anupriya Tripathi, Rob Knight, and
Karsten Zengler. 2019. “A Novel Sparse Compositional
Technique Reveals Microbial Perturbations.”
Edited by Josh D. Neufeld. mSystems 4 (1).
https://doi.org/10.1128/mSystems.00016-19.
.. [2] Pawlowsky-Glahn, Vera, Juan José Egozcue, and
Raimon Tolosana-Delgado. 2015. Modeling and
Analysis of Compositional Data. John Wiley & Sons.
Examples
--------
>>> import numpy as np
>>> from deicode.preprocessing import rclr
>>> x = np.array([[1, 3, 4, 2, 0],
[4, 0, 1, 2, 5]])
>>> rclr(x)
array([[-0.79, 0.3, 0.59, -0.1, nan],
[0.46, nan, -0.92, -0.23, 0.69]])
"""

# ensure array is at leadt 2D
mat = np.atleast_2d(np.array(mat))
# ensure no missing values
if (mat < 0).any():
raise ValueError('Array Contains Negative Values')
# ensure no undefined values
if np.count_nonzero(np.isinf(mat)) != 0:
raise ValueError('Data-mat contains either np.inf or -np.inf')
# ensure no missing values
if np.count_nonzero(np.isnan(mat)) != 0:
raise ValueError('Data-mat contains nans')
# take the log of the sample centered data
mat = np.log(closure(closure(mat) * branch_lengths))
# generate a mask of missing values
mask = [True] * mat.shape[0] * mat.shape[1]
mask = np.array(mat).reshape(mat.shape)
mask[np.isfinite(mat)] = False
# sum of rows (features)
lmat = np.ma.array(mat, mask=mask)
# perfrom geometric mean
gm = lmat.mean(axis=-1, keepdims=True)
# center with the geometric mean
lmat = (lmat - gm).squeeze().data
# mask the missing with nan
lmat[~np.isfinite(mat)] = np.nan
return lmat


def fast_unifrac(bt, tree, min_feature_frequency):

# built table
bt_phylo = bt.copy()
bt_array = bt_phylo.matrix_data.toarray()
outs_id = bt_phylo.ids('observation')
# flatten
counts_by_node, tree_index, branch_lengths \
= _vectorize_counts_and_tree(bt_array.T, outs_id, tree)
# drop zero sums
keep_zero = counts_by_node.sum(0) > 0
counts_by_node = counts_by_node[:, keep_zero]
branch_lengths = branch_lengths[keep_zero]
tids = ['o'+i for i in list(tree_index['id'][keep_zero].astype(str))]
tree_index['keep'] = {i:v for i, v in enumerate(keep_zero)}

return counts_by_node, tree_index, branch_lengths, tids
109 changes: 107 additions & 2 deletions deicode/rpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import numpy as np
import pandas as pd
from typing import Union
from skbio import TreeNode
from deicode.matrix_completion import MatrixCompletion
from deicode.preprocessing import rclr
from deicode.preprocessing import rclr, phylo_rclr, fast_unifrac
from deicode._rpca_defaults import (DEFAULT_RANK, DEFAULT_MSC, DEFAULT_MFC,
DEFAULT_ITERATIONS, DEFAULT_MFF)
from scipy.linalg import svd
Expand Down Expand Up @@ -42,7 +43,10 @@ def frequency_filter(val, id_, md):
table = table.filter(observation_filter, axis='observation')
table = table.filter(frequency_filter, axis='observation')
table = table.filter(sample_filter, axis='sample')
table = table.to_dataframe().T
table = pd.DataFrame(table.matrix_data.toarray(),
table.ids('observation'),
table.ids()).T
#table = table.to_dataframe().T
# check the table after filtering
if len(table.index) != len(set(table.index)):
raise ValueError('Data-table contains duplicate indices')
Expand Down Expand Up @@ -124,3 +128,104 @@ def auto_rpca(table: biom.Table,
min_feature_frequency=min_feature_frequency,
max_iterations=max_iterations)
return ord_res, dist_res


def phylo_rpca(table: biom.Table,
tree: TreeNode,
n_components: Union[int, str] = DEFAULT_RANK,
min_sample_count: int = DEFAULT_MSC,
min_feature_count: int = DEFAULT_MFC,
min_feature_frequency: float = DEFAULT_MFF,
max_iterations: int = DEFAULT_ITERATIONS) -> (
skbio.OrdinationResults,
skbio.DistanceMatrix):
"""Runs RPCA with an rclr preprocessing step.
This code will be run by both the standalone and QIIME 2 versions of
DEICODE.
"""
# get shape of table
n_features, n_samples = table.shape

# filter sample to min seq. depth
def sample_filter(val, id_, md):
return sum(val) > min_sample_count

# filter features to min total counts
def observation_filter(val, id_, md):
return sum(val) > min_feature_count

# filter features by N samples presence
def frequency_filter(val, id_, md):
return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

# filter and import table for each filter above
table = table.filter(observation_filter, axis='observation')
table = table.filter(sample_filter, axis='sample')
table = table.filter(frequency_filter, axis='observation')
# check the table after filtering
if len(table.ids()) != len(set(table.ids())):
raise ValueError('Data-table contains duplicate indices')
if len(table.ids('observation')) != len(set(table.ids('observation'))):
raise ValueError('Data-table contains duplicate columns')
# built table
counts_by_node, tree_index, branch_lengths, tids\
= fast_unifrac(table, tree, frequency_filter)
rclr_table = phylo_rclr(counts_by_node, branch_lengths)
# Robust-clt (rclr) preprocessing and OptSpace (RPCA)
opt = MatrixCompletion(n_components=n_components,
max_iterations=max_iterations,
branch_lengths=branch_lengths).fit(rclr_table)
# get new n-comp when applicable
n_components = opt.s.shape[0]
# get PC column labels for the skbio OrdinationResults
rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
# get completed matrix for centering
X = opt.sample_weights @ opt.s @ opt.feature_weights.T
# center again around zero after completion
X = X - X.mean(axis=0)
X = X - X.mean(axis=1).reshape(-1, 1)
# re-factor the data
u, s, v = svd(X)
# only take n-components
u = u[:, :n_components]
v = v.T[:, :n_components]
# calc. the new variance using projection
p = s**2 / np.sum(s**2)
p = p[:n_components]
s = s[:n_components]
# save the loadings
feature_loading = pd.DataFrame(v, index=tids,
columns=rename_cols)
sample_loading = pd.DataFrame(u, index=table.ids(),
columns=rename_cols)
# % var explained
proportion_explained = pd.Series(p, index=rename_cols)
# get eigenvalues
eigvals = pd.Series(s, index=rename_cols)

# if the n_components is two add PC3 of zeros
# this is referenced as in issue in
# <https://github.com/biocore/emperor/commit
# /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
# discussed in DEICODE -- PR#29
if n_components == 2:
feature_loading['PC3'] = [0] * len(feature_loading.index)
sample_loading['PC3'] = [0] * len(sample_loading.index)
eigvals.loc['PC3'] = 0
proportion_explained.loc['PC3'] = 0

# save ordination results
short_method_name = 'rpca_biplot'
long_method_name = '(Robust Aitchison) RPCA Biplot'
ord_res = skbio.OrdinationResults(
short_method_name,
long_method_name,
eigvals.copy(),
samples=sample_loading.copy(),
features=feature_loading.copy(),
proportion_explained=proportion_explained.copy())
# save distance matrix
dist_res = skbio.stats.distance.DistanceMatrix(
opt.distance, ids=sample_loading.index)

return ord_res, dist_res

0 comments on commit d9a09c0

Please sign in to comment.