fastuni code add

biocore · Jun 3, 2020 · d9a09c0 · d9a09c0
1 parent 7d685fd
commit d9a09c0
Show file tree

Hide file tree

Showing 5 changed files with 213 additions and 7 deletions.
diff --git a/deicode/__init__.py b/deicode/__init__.py
@@ -8,4 +8,4 @@
 #
 # ----------------------------------------------------------------------------
 
-__version__ = "0.2.4"
+__version__ = "0.2.5"
diff --git a/deicode/matrix_completion.py b/deicode/matrix_completion.py
@@ -6,7 +6,7 @@
 
 class MatrixCompletion(_BaseImpute):
 
-    def __init__(self, n_components=2, max_iterations=5, tol=1e-5):
+    def __init__(self, n_components=2, max_iterations=5, branch_lengths=1, tol=1e-5):
         """
 
         This form of matrix completion uses OptSpace (1). Furthermore,
@@ -80,6 +80,7 @@ def __init__(self, n_components=2, max_iterations=5, tol=1e-5):
 
         self.n_components = n_components
         self.max_iterations = max_iterations
+        self.branch_lengths = branch_lengths
         self.tol = tol
 
         return
@@ -130,7 +131,8 @@ def _fit(self):
         # return solved matrix
         self.U, self.s, self.V = OptSpace(n_components=self.n_components,
                                           max_iterations=self.max_iterations,
-                                          tol=self.tol).solve(X_sparse)
+                                          tol=self.tol,
+                                          branch_lengths = self.branch_lengths).solve(X_sparse)
         # save the solution (of the imputation)
         self.solution = self.U.dot(self.s).dot(self.V.T)
         self.eigenvalues = np.diag(self.s)

diff --git a/deicode/optspace.py b/deicode/optspace.py
@@ -28,6 +28,7 @@ def __init__(
             n_components,
             max_iterations,
             tol,
+            branch_lengths = 1,
             step_size=10000,
             resolution_limit=20,
             sign=-1):
@@ -93,6 +94,7 @@ def __init__(
         self.n_components = n_components
         self.max_iterations = max_iterations
         self.tol = tol
+        self.branch_lengths = branch_lengths
         self.step_size = step_size
         self.resolution_limit = resolution_limit
         self.sign = sign
@@ -155,7 +157,7 @@ def solve(self, obs):
         # initialize the distortion matrix of line above
         dist = np.zeros(self.max_iterations + 1)
         # starting initialization of the distortion between obs and imputed
-        dist[0] = norm(np.multiply(obs_error, mask), 'fro') / \
+        dist[0] = norm(np.multiply(obs_error, mask) + np.log(self.branch_lengths + 1.0), 'fro') / \
             np.sqrt(total_nonzeros)
         # we will perform gradient decent for at most self.max_iterations
         for i in range(1, self.max_iterations):
@@ -183,7 +185,7 @@ def solve(self, obs):
             # Compute the distortion
             obs_error = obs - U.dot(S).dot(V.T)
             # update the new distortion
-            dist[i + 1] = norm(np.multiply(obs_error, mask),
+            dist[i + 1] = norm(np.multiply(obs_error, mask) + np.log(self.branch_lengths + 1.0),
                                'fro') / np.sqrt(total_nonzeros)
             # if the gradient decent has coverged then break the loop
             # and return the results

diff --git a/deicode/preprocessing.py b/deicode/preprocessing.py
@@ -1,4 +1,6 @@
 import numpy as np
+from skbio.diversity._phylogenetic import _nodes_by_counts
+from skbio.diversity._util import _vectorize_counts_and_tree
 from skbio.stats.composition import closure
 # need to ignore log of zero warning
 np.seterr(all='ignore')
@@ -85,3 +87,98 @@ def rclr(mat):
     # mask the missing with nan
     lmat[~np.isfinite(mat)] = np.nan
     return lmat
+
+
+def phylo_rclr(mat, branch_lengths):
+    """
+    The rclr procedure first log transform
+    the nonzero values before centering the data
+    we refer to this preprocessing procedure as
+    the robust center log-ratio (rclr) (1) due to its
+    ties to the clr (2) transform commonly used in
+    compositional data analysis.
+    Parameters
+    ----------
+    mat : array_like, float
+       a matrix of counts where
+       rows = components and
+       columns = samples
+    Returns
+    -------
+    numpy.ndarray
+        rclr transformed matrix
+    Raises
+    ------
+    ValueError
+        Raises an error if values in array are negative
+    ValueError
+        Data-mat contains either np.inf or -np.inf
+    ValueError
+        Data-mat contains nans
+    References
+    ----------
+    .. [1] Martino, Cameron, James T. Morton, Clarisse A. Marotz,
+           Luke R. Thompson, Anupriya Tripathi, Rob Knight, and
+           Karsten Zengler. 2019. “A Novel Sparse Compositional
+           Technique Reveals Microbial Perturbations.”
+           Edited by Josh D. Neufeld. mSystems 4 (1).
+           https://doi.org/10.1128/mSystems.00016-19.
+    .. [2] Pawlowsky-Glahn, Vera, Juan José Egozcue, and
+           Raimon Tolosana-Delgado. 2015. Modeling and
+           Analysis of Compositional Data. John Wiley & Sons.
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from deicode.preprocessing import rclr
+    >>> x = np.array([[1, 3, 4, 2, 0],
+              [4, 0, 1, 2, 5]])
+    >>> rclr(x)
+    array([[-0.79, 0.3, 0.59, -0.1, nan],
+           [0.46, nan, -0.92, -0.23, 0.69]])
+    """
+
+    # ensure array is at leadt 2D
+    mat = np.atleast_2d(np.array(mat))
+    # ensure no missing values
+    if (mat < 0).any():
+        raise ValueError('Array Contains Negative Values')
+    # ensure no undefined values
+    if np.count_nonzero(np.isinf(mat)) != 0:
+        raise ValueError('Data-mat contains either np.inf or -np.inf')
+    # ensure no missing values
+    if np.count_nonzero(np.isnan(mat)) != 0:
+        raise ValueError('Data-mat contains nans')
+    # take the log of the sample centered data
+    mat = np.log(closure(closure(mat) * branch_lengths))
+    # generate a mask of missing values
+    mask = [True] * mat.shape[0] * mat.shape[1]
+    mask = np.array(mat).reshape(mat.shape)
+    mask[np.isfinite(mat)] = False
+    # sum of rows (features)
+    lmat = np.ma.array(mat, mask=mask)
+    # perfrom geometric mean
+    gm = lmat.mean(axis=-1, keepdims=True)
+    # center with the geometric mean
+    lmat = (lmat - gm).squeeze().data
+    # mask the missing with nan
+    lmat[~np.isfinite(mat)] = np.nan
+    return lmat
+
+
+def fast_unifrac(bt, tree, min_feature_frequency):
+
+    # built table
+    bt_phylo = bt.copy()
+    bt_array = bt_phylo.matrix_data.toarray()
+    outs_id = bt_phylo.ids('observation')
+    # flatten
+    counts_by_node, tree_index, branch_lengths \
+        = _vectorize_counts_and_tree(bt_array.T, outs_id, tree)
+    # drop zero sums
+    keep_zero = counts_by_node.sum(0) > 0 
+    counts_by_node = counts_by_node[:, keep_zero]
+    branch_lengths = branch_lengths[keep_zero]
+    tids = ['o'+i for i in list(tree_index['id'][keep_zero].astype(str))]
+    tree_index['keep'] = {i:v for i, v in enumerate(keep_zero)}
+
+    return counts_by_node, tree_index, branch_lengths, tids
diff --git a/deicode/rpca.py b/deicode/rpca.py
@@ -3,8 +3,9 @@
 import numpy as np
 import pandas as pd
 from typing import Union
+from skbio import TreeNode
 from deicode.matrix_completion import MatrixCompletion
-from deicode.preprocessing import rclr
+from deicode.preprocessing import rclr, phylo_rclr, fast_unifrac
 from deicode._rpca_defaults import (DEFAULT_RANK, DEFAULT_MSC, DEFAULT_MFC,
                                     DEFAULT_ITERATIONS, DEFAULT_MFF)
 from scipy.linalg import svd
@@ -42,7 +43,10 @@ def frequency_filter(val, id_, md):
     table = table.filter(observation_filter, axis='observation')
     table = table.filter(frequency_filter, axis='observation')
     table = table.filter(sample_filter, axis='sample')
-    table = table.to_dataframe().T
+    table = pd.DataFrame(table.matrix_data.toarray(),
+                         table.ids('observation'),
+                         table.ids()).T
+    #table = table.to_dataframe().T
     # check the table after filtering
     if len(table.index) != len(set(table.index)):
         raise ValueError('Data-table contains duplicate indices')
@@ -124,3 +128,104 @@ def auto_rpca(table: biom.Table,
                              min_feature_frequency=min_feature_frequency,
                              max_iterations=max_iterations)
     return ord_res, dist_res
+
+
+def phylo_rpca(table: biom.Table,
+               tree: TreeNode,
+               n_components: Union[int, str] = DEFAULT_RANK,
+               min_sample_count: int = DEFAULT_MSC,
+               min_feature_count: int = DEFAULT_MFC,
+               min_feature_frequency: float = DEFAULT_MFF,
+               max_iterations: int = DEFAULT_ITERATIONS) -> (
+               skbio.OrdinationResults,
+               skbio.DistanceMatrix):
+    """Runs RPCA with an rclr preprocessing step.
+       This code will be run by both the standalone and QIIME 2 versions of
+       DEICODE.
+    """
+    # get shape of table
+    n_features, n_samples = table.shape
+
+    # filter sample to min seq. depth
+    def sample_filter(val, id_, md):
+        return sum(val) > min_sample_count
+
+    # filter features to min total counts
+    def observation_filter(val, id_, md):
+        return sum(val) > min_feature_count
+
+    # filter features by N samples presence
+    def frequency_filter(val, id_, md):
+        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)
+
+    # filter and import table for each filter above
+    table = table.filter(observation_filter, axis='observation')
+    table = table.filter(sample_filter, axis='sample')
+    table = table.filter(frequency_filter, axis='observation')
+    # check the table after filtering
+    if len(table.ids()) != len(set(table.ids())):
+        raise ValueError('Data-table contains duplicate indices')
+    if len(table.ids('observation')) != len(set(table.ids('observation'))):
+        raise ValueError('Data-table contains duplicate columns')
+    # built table
+    counts_by_node, tree_index, branch_lengths, tids\
+        = fast_unifrac(table, tree, frequency_filter)
+    rclr_table = phylo_rclr(counts_by_node, branch_lengths)
+    # Robust-clt (rclr) preprocessing and OptSpace (RPCA)
+    opt = MatrixCompletion(n_components=n_components,
+                           max_iterations=max_iterations,
+                           branch_lengths=branch_lengths).fit(rclr_table)
+    # get new n-comp when applicable
+    n_components = opt.s.shape[0]
+    # get PC column labels for the skbio OrdinationResults
+    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
+    # get completed matrix for centering
+    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
+    # center again around zero after completion
+    X = X - X.mean(axis=0)
+    X = X - X.mean(axis=1).reshape(-1, 1)
+    # re-factor the data
+    u, s, v = svd(X)
+    # only take n-components
+    u = u[:, :n_components]
+    v = v.T[:, :n_components]
+    # calc. the new variance using projection
+    p = s**2 / np.sum(s**2)
+    p = p[:n_components]
+    s = s[:n_components]
+    # save the loadings
+    feature_loading = pd.DataFrame(v, index=tids,
+                                   columns=rename_cols)
+    sample_loading = pd.DataFrame(u, index=table.ids(),
+                                  columns=rename_cols)
+    # % var explained
+    proportion_explained = pd.Series(p, index=rename_cols)
+    # get eigenvalues
+    eigvals = pd.Series(s, index=rename_cols)
+
+    # if the n_components is two add PC3 of zeros
+    # this is referenced as in issue in
+    # <https://github.com/biocore/emperor/commit
+    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
+    # discussed in DEICODE -- PR#29
+    if n_components == 2:
+        feature_loading['PC3'] = [0] * len(feature_loading.index)
+        sample_loading['PC3'] = [0] * len(sample_loading.index)
+        eigvals.loc['PC3'] = 0
+        proportion_explained.loc['PC3'] = 0
+
+    # save ordination results
+    short_method_name = 'rpca_biplot'
+    long_method_name = '(Robust Aitchison) RPCA Biplot'
+    ord_res = skbio.OrdinationResults(
+        short_method_name,
+        long_method_name,
+        eigvals.copy(),
+        samples=sample_loading.copy(),
+        features=feature_loading.copy(),
+        proportion_explained=proportion_explained.copy())
+    # save distance matrix
+    dist_res = skbio.stats.distance.DistanceMatrix(
+        opt.distance, ids=sample_loading.index)
+
+    return ord_res, dist_res