Merge branch 'master' of github.com:biocore/mds-approximations into t…

…est_data_file_cleanup
biocore · Oct 3, 2019 · 9aa2c90 · 9aa2c90
2 parents 53b35c8 + 6f52697
commit 9aa2c90
Show file tree

Hide file tree

Showing 10 changed files with 198 additions and 104 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+bayesian-regression/
+
 # OS generated files
 .DS_Store
 .DS_Store?
@@ -53,4 +55,4 @@ pip-log.txt
 # Generated PCoA output of algorithms
 out/*
 
-.env
+.env
diff --git a/.travis.yml b/.travis.yml
@@ -3,23 +3,23 @@
 sudo: false
 language: python
 env:
-  - PYTHON_VERSION=3
+  - PYTHON_VERSION=3.6
 before_install:
-  - wget http://repo.continuum.io/miniconda/Miniconda3-3.7.3-Linux-x86_64.sh -O miniconda.sh
+  - wget http://repo.continuum.io/miniconda/Miniconda3-4.6.14-Linux-x86_64.sh -O miniconda.sh
   - chmod +x miniconda.sh
   - ./miniconda.sh -b
   - export PATH=/home/travis/miniconda3/bin:$PATH
   # Update conda itself
   - conda update --yes conda
 install:
-  - conda env create -n mds-approximations -f environment.yml
-  - conda install --yes cython nose pep8 flake8 pip
-  #- if [ ${USE_CYTHON} ]; then conda install --yes -n mds-approximations cython; fi
+  - conda env create -n mds-approximations -f environment.yml python=3.6
   - source activate mds-approximations
+  - conda install --yes cython nose pep8 flake8 pip
+  # - if [ ${USE_CYTHON} ]; then conda install --yes cython; fi
   - pip install .
   - pip install coveralls
 script:
   - nosetests --with-coverage --cover-package=mdsa
   - flake8 setup.py mdsa
 after_success:
-  - coveralls
+  - coveralls
diff --git a/bayesian_regression-0.1-py3-none-any.whl b/bayesian_regression-0.1-py3-none-any.whl
diff --git a/environment.yml b/environment.yml
@@ -6,12 +6,12 @@ channels:
 - bioconda
 - biocore
 dependencies:
-- scikit-bio=0.4.2
-- click=6.7
-- python=3
+- scikit-bio
+- click
 - numpy
-- scikit-learn=0.19.1
+- scikit-learn
 - scipy
 - pandas
 - jupyter
 - matplotlib
+- python=3.7
diff --git a/mdsa/algorithms/fsvd.py b/mdsa/algorithms/fsvd.py
@@ -1,6 +1,5 @@
-from numpy import dot, hstack
-from numpy.linalg import qr, svd
-from numpy.random import standard_normal
+from skbio import DistanceMatrix
+from skbio.stats.ordination import pcoa as skbio_pcoa
 
 from mdsa.algorithm import Algorithm
 
@@ -47,80 +46,7 @@ def run(self, distance_matrix, num_dimensions_out=10,
         """
         super(FSVD, self).run(distance_matrix, num_dimensions_out)
 
-        m, n = distance_matrix.shape
-
-        # Note: this transpose is removed for performance, since we
-        # only expect square matrices.
-        # Take (conjugate) transpose if necessary, because it makes H smaller,
-        # leading to faster computations
-        # if m < n:
-        #     distance_matrix = distance_matrix.transpose()
-        #     m, n = distance_matrix.shape
-        if m != n:
-            raise ValueError('FSVD.run(...) expects square distance matrix')
-
-        k = num_dimensions_out + 2
-
-        # Form a real nxl matrix G whose entries are independent,
-        # identically distributed
-        # Gaussian random variables of
-        # zero mean and unit variance
-        G = standard_normal(size=(n, k))
-
-        if use_power_method:
-            # use only the given exponent
-            H = dot(distance_matrix, G)
-
-            for x in range(2, num_levels + 2):
-                # enhance decay of singular values
-                # note: distance_matrix is no longer transposed, saves work
-                # since we're expecting symmetric, square matrices anyway
-                # (Daniel McDonald's changes)
-                H = dot(distance_matrix, dot(distance_matrix, H))
-
-        else:
-            # compute the m x l matrices H^{(0)}, ..., H^{(i)}
-            # Note that this is done implicitly in each iteration below.
-            H = dot(distance_matrix, G)
-            # Again, removed transpose: dot(distance_matrix.transpose(), H)
-            # to enhance performance
-            H = hstack(
-                (H, dot(distance_matrix, dot(distance_matrix, H))))
-            for x in range(3, num_levels + 2):
-                # Removed this transpose: dot(distance_matrix.transpose(), H)
-                tmp = dot(distance_matrix, dot(distance_matrix, H))
-
-                # Removed this transpose: dot(distance_matrix.transpose(), tmp)
-                H = hstack(
-                    (H, dot(distance_matrix, dot(distance_matrix, tmp))))
-
-        # Using the pivoted QR-decomposiion, form a real m * ((i+1)l) matrix Q
-        # whose columns are orthonormal, s.t. there exists a real
-        # ((i+1)l) * ((i+1)l) matrix R for which H = QR
-        Q, R = qr(H)
-
-        # Compute the n * ((i+1)l) product matrix T = A^T Q
-        # Removed transpose of distance_matrix for performance
-        T = dot(distance_matrix, Q)  # step 3
-
-        # Form an SVD of T
-        Vt, St, W = svd(T, full_matrices=False)
-        W = W.transpose()
-
-        # Compute the m * ((i+1)l) product matrix
-        Ut = dot(Q, W)
-
-        if m < n:
-            # V_fsvd = Ut[:, :num_dimensions_out] # unused
-            U_fsvd = Vt[:, :num_dimensions_out]
-        else:
-            # V_fsvd = Vt[:, :num_dimensions_out] # unused
-            U_fsvd = Ut[:, :num_dimensions_out]
-
-        S = St[:num_dimensions_out] ** 2
-
-        # drop imaginary component, if we got one
-        eigenvalues = S.real
-        eigenvectors = U_fsvd.real
-
-        return eigenvectors, eigenvalues
+        results = skbio_pcoa(DistanceMatrix(distance_matrix),
+                             method='fsvd',
+                             number_of_dimensions=num_dimensions_out)
+        return results.samples.values, results.eigvals
diff --git a/mdsa/algorithms/nystrom.py b/mdsa/algorithms/nystrom.py
@@ -40,15 +40,15 @@
 
         /  E  |  F \
     D = |-----|----|
-        \ F.t |  G /
+        \\ F.t |  G /
 
 
 The correspondong association matrix or centered inner-product
 matrix K is:
 
         /  A  |  B \
     K = |-----|----|
-        \ B.t |  C /
+        \\ B.t |  C /
 
 where A and B are computed as follows
 

diff --git a/mdsa/algorithms/scmds.py b/mdsa/algorithms/scmds.py
@@ -220,8 +220,8 @@ def affine_mapping(self, matrix_x, matrix_y):
 
         Affine mapping function:
         Y = UX + kron(b,ones(1,n)), UU' = I
-        X = [x_1, x_2, ... , x_n]; x_j \in R^m
-        Y = [y_1, y_2, ... , y_n]; y_j \in R^m
+        X = [x_1, x_2, ... , x_n]; x_j \\in R^m
+        Y = [y_1, y_2, ... , y_n]; y_j \\in R^m
 
         From Tzeng 2008:
         `The projection of xi,2 from q2 dimension to q1 dimension

diff --git a/mdsa/pcoa.py b/mdsa/pcoa.py
@@ -64,7 +64,7 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
        If the distance is not euclidean (for example if it is a
        semimetric and the triangle inequality doesn't hold),
        negative eigenvalues can appear. There are different ways
-       to deal with that problem (see Legendre & Legendre 1998, \S
+       to deal with that problem (see Legendre & Legendre 1998, \\S
        9.2.3), but none are currently implemented here.
        However, a warning is raised whenever negative eigenvalues
        appear, allowing the user to decide if they can be safely
@@ -90,10 +90,6 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
     eigenvectors = np.array(eigenvectors)
     eigenvalues = np.array(eigenvalues)
 
-    # Ensure eigenvectors are normalized
-    eigenvectors = np.apply_along_axis(lambda vec: vec / np.linalg.norm(vec),
-                                       axis=1, arr=eigenvectors)
-
     # Generate axis labels for output
     axis_labels = ['PC%d' % i for i in range(1, len(eigenvectors) + 1)]
     axis_labels = axis_labels[:num_dimensions_out]
@@ -170,6 +166,13 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
         # operation.
         eigenvectors = eigenvectors * np.sqrt(eigenvalues)
 
+        # Calculate the array of proportion of variance explained
+        # proportion_explained = eigenvalues / eigenvalues.sum()
+        sum_eigenvalues = np.trace(centered_dm)
+
+        # Calculate the array of proportion of variance explained
+        proportion_explained = eigenvalues / sum_eigenvalues
+
         # Now remove the dimensions with the least information
         # Only select k (num_dimensions_out) first eigenvectors
         # and their corresponding eigenvalues from the sorted array
@@ -178,9 +181,7 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
         if len(eigenvalues) > num_dimensions_out:
             eigenvectors = eigenvectors[:, :num_dimensions_out]
             eigenvalues = eigenvalues[:num_dimensions_out]
-
-        # Calculate the array of proportion of variance explained
-        proportion_explained = eigenvalues / eigenvalues.sum()
+            proportion_explained = proportion_explained[:num_dimensions_out]
 
         return OrdinationResults(
             short_method_name='PCoA',

diff --git a/scripts/randdm b/scripts/randdm
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+
+import os
+
+import click
+import numpy as np
+from skbio import DistanceMatrix
+from skbio.stats.distance import randdm
+from bayesian_regression.util.generators import band_table, block_table
+from skbio.diversity import beta_diversity
+
+
+@click.command()
+@click.option('--structure', 'structure', type=str,
+              help='Optional. Generate structure in the random data that '
+                   'models distributions of microbial communities. '
+                   'Options: "band" or "block". If not specified, then '
+                   'draws from gaussian distribution to generate distance '
+                   'matrices.', default=None)
+@click.option('--structure-distance-metric', 'distance_metric', type=str,
+              help='Distance metric to use when generating distance matrix'
+                   ' using the --structure flag. "braycurtis" by default',
+              default='braycurtis')
+@click.option('--dim', 'dimensions', type=int, multiple=True,
+              help='Dimension of the input matrix/matrices to generate')
+@click.option('--seed', 'seed', type=int,
+              help='Random number generator seed value.')
+@click.option('--sub', 'subsample_dims', type=str, multiple=True,
+              help='Generate subsampled distance matrix with given'
+                   'dimension (must be smaller than original dimension)'
+                   ' for each randomly generated distance matrix.'
+                   ' Can specify as int or percentage')
+@click.option('--overwrite', 'overwrite', type=bool, default=False,
+              help='Overwrite output directory if it already exists. Do not '
+                   'overwrite, by default.')
+@click.argument('output_dir', type=click.Path(), default=None)
+def generate(dimensions, output_dir, seed, distance_metric, subsample_dims,
+             structure, overwrite):
+    """
+    Generate random distance matrix. By default, generates a random distance
+    matrix drawing from a gaussian probability distribution. Since realistic
+    microbiome OTU tables and distance matrices typically do not follow such
+    a probability distribution, as they often contain "structure", i.e.
+    discrete block-like patterns or patterns along a gradient, another
+    option is provided to generate more realistic microbiome data that mimics
+    block or band-like structure, e.g. block-like could represent one grou of
+    microbes associated with a diseased state and another set associated with a
+    healthy state.
+
+    Parameters
+    ----------
+    dimensions : number
+        Dimension of the input matrix/matrices to generate.
+    output_dir : str
+        Path to the directory to output results to.
+    seed : number
+        Random seed for generating distance matrices. Optional.
+    distance_metric : str
+        Distance metric to use when generating distance matrix
+        using the --structure flag. "braycurtis" by default.
+        See skbio.diversity.beta_diversity for all accepted parameters.
+    subsample_dims : list of str
+        Generate subsampled distance matrix with given dimension
+        (must be smaller than original dimension) for each randomly
+        generated distance matrix. Can specify as int denoting specific
+        dimension or as percentage of, for example '25%', denoting percentage
+        of each main dimension specified in the 'dimensions' parameter.
+    structure : bool
+        Optional. Generate structure in the random data that
+        models distributions of microbial communities.
+        Options: "band" or "block". If not specified, then
+        draws from gaussian distribution to generate distance
+        matrices.
+    overwrite : bool
+        False by default. If True, overwrites output_dir if it already exists.
+
+    """
+
+    if seed is not None:
+        np.random.seed(seed)
+
+    if (not overwrite and os.path.exists(output_dir) and not click.confirm(
+            'The output directory %s exists. '
+            'Do you want to overwrite?' % output_dir)):
+        click.echo('Aborting.')
+        return
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    for dim in dimensions:
+        outpath = os.path.join(output_dir, 'randdm-{}.txt'.format(dim))
+
+        click.echo('Generating random distance matrix: %s' % outpath)
+
+        # Generate random distance matrix
+        if structure is not None:
+            click.echo('Generating synthetic OTU table with "{}"-like '
+                       'microbial distribution...'.format(structure))
+            if structure == 'band':
+                biom_table = band_table(dim, dim, seed=seed)[0]
+            elif structure == 'block':
+                biom_table = block_table(dim, dim, seed=seed)[0]
+            else:
+                raise ValueError('Invalid value for --structure parameter.')
+
+            otu_table = biom_table.matrix_data.todense()
+
+            click.echo('Generating distance matrix from OTU table...')
+            distance_matrix = beta_diversity(distance_metric, otu_table,
+                                             validate=False)
+
+        else:
+            click.echo('Generating random distance matrix '
+                       '(gaussian distribution)')
+            distance_matrix = randdm(dim, constructor=DistanceMatrix)
+
+        # Serialize distance matrix
+        distance_matrix.write(outpath)
+
+        # Subsampling
+        for subsample_dim in subsample_dims:
+            # Parse parameter values into integers
+            try:
+                if '%' in subsample_dim:
+                    percent = float(subsample_dim[:-1]) / 100
+                    subsample_dim = int(percent * dim)
+                else:
+                    subsample_dim = int(subsample_dim)
+            except ValueError:
+                click.echo('Format for --sub parameter is incorrect. Please '
+                           're-run with --help for instructions on accepted '
+                           'formats for this parameter.')
+                return
+
+            if subsample_dim <= 0:
+                raise ValueError('Subsample dimension must be greater than 0')
+
+            if subsample_dim >= dim:
+                raise ValueError('Subsample dimension %d must be smaller than'
+                                 'original matrix dimension %d' % (
+                                     subsample_dim,
+                                     dim))
+            subsampled_outpath = os.path.join(output_dir,
+                                              'randdm-{}-sub-{}.txt'
+                                              .format(dim, subsample_dim))
+
+            click.echo(
+                'Subsampling original randomly generated distance matrix '
+                'with subsample dimension %d: %s' % (subsample_dim,
+                                                     subsampled_outpath))
+
+            ids = distance_matrix.ids
+            # Subsample without replacement
+            subsampled_ids = np.random.choice(ids, subsample_dim,
+                                              replace=False)
+            subsampled_matrix = distance_matrix.filter(subsampled_ids)
+            subsampled_matrix.write(subsampled_outpath)
+
+    click.echo('Done.')
+
+
+if __name__ == '__main__':
+    generate()