PCoA updates (#46)
* Remove normalization of eigenvectors

* Fix proportions explained to use eigenvalues computed via matrix trace

* Update miniconda, python, skbio to support new fsvd

* Fix flake8 warnings

* Update environment

* Update environment again

* Fix Travis (#48)
HannesHolste authored and antgonza committed May 13, 2019
1 parent 5911f9c commit 6f52697
Showing 6 changed files with 28 additions and 109 deletions.
12 changes: 6 additions & 6 deletions .travis.yml
@@ -3,23 +3,23 @@
 sudo: false
 language: python
 env:
-- PYTHON_VERSION=3
+- PYTHON_VERSION=3.6
 before_install:
-- wget http://repo.continuum.io/miniconda/Miniconda3-3.7.3-Linux-x86_64.sh -O miniconda.sh
+- wget http://repo.continuum.io/miniconda/Miniconda3-4.6.14-Linux-x86_64.sh -O miniconda.sh
 - chmod +x miniconda.sh
 - ./miniconda.sh -b
 - export PATH=/home/travis/miniconda3/bin:$PATH
 # Update conda itself
 - conda update --yes conda
 install:
-- conda env create -n mds-approximations -f environment.yml
-- conda install --yes cython nose pep8 flake8 pip
-#- if [ ${USE_CYTHON} ]; then conda install --yes -n mds-approximations cython; fi
+- conda env create -n mds-approximations -f environment.yml python=3.6
+- source activate mds-approximations
+- conda install --yes cython nose pep8 flake8 pip
+# - if [ ${USE_CYTHON} ]; then conda install --yes cython; fi
 - pip install .
 - pip install coveralls
 script:
 - nosetests --with-coverage --cover-package=mdsa
 - flake8 setup.py mdsa
 after_success:
-- coveralls
\ No newline at end of file
+- coveralls
14 changes: 3 additions & 11 deletions environment.yml
@@ -5,21 +5,13 @@ channels:
 - defaults
 - bioconda
 - biocore
-- anaconda
 dependencies:
 - scikit-bio
-- click=6.7 # install from anaconda channel
-- python=3.5
+- click
 - numpy
-- scikit-learn=0.19.1
+- scikit-learn
 - scipy
 - pandas
 - jupyter
 - matplotlib
-- gneiss=0.4.2
-- pytz
-- tensorflow
-- pip
-# also need to install bayesian-regression:
-- pip:
-  - bayesian_regression-0.1-py3-none-any.whl
+- python=3.7
86 changes: 6 additions & 80 deletions mdsa/algorithms/fsvd.py
@@ -1,6 +1,5 @@
-from numpy import dot, hstack
-from numpy.linalg import qr, svd
-from numpy.random import standard_normal
+from skbio import DistanceMatrix
+from skbio.stats.ordination import pcoa as skbio_pcoa
 
 from mdsa.algorithm import Algorithm
 
@@ -47,80 +46,7 @@ def run(self, distance_matrix, num_dimensions_out=10,
         """
         super(FSVD, self).run(distance_matrix, num_dimensions_out)
 
-        m, n = distance_matrix.shape
-
-        # Note: this transpose is removed for performance, since we
-        # only expect square matrices.
-        # Take (conjugate) transpose if necessary, because it makes H smaller,
-        # leading to faster computations
-        # if m < n:
-        #     distance_matrix = distance_matrix.transpose()
-        #     m, n = distance_matrix.shape
-        if m != n:
-            raise ValueError('FSVD.run(...) expects square distance matrix')
-
-        k = num_dimensions_out + 2
-
-        # Form a real n x l matrix G whose entries are independent,
-        # identically distributed Gaussian random variables of
-        # zero mean and unit variance
-        G = standard_normal(size=(n, k))
-
-        if use_power_method:
-            # use only the given exponent
-            H = dot(distance_matrix, G)
-
-            for x in range(2, num_levels + 2):
-                # enhance decay of singular values
-                # note: distance_matrix is no longer transposed, saves work
-                # since we're expecting symmetric, square matrices anyway
-                # (Daniel McDonald's changes)
-                H = dot(distance_matrix, dot(distance_matrix, H))
-
-        else:
-            # compute the m x l matrices H^{(0)}, ..., H^{(i)}
-            # Note that this is done implicitly in each iteration below.
-            H = dot(distance_matrix, G)
-            # Again, removed transpose: dot(distance_matrix.transpose(), H)
-            # to enhance performance
-            H = hstack(
-                (H, dot(distance_matrix, dot(distance_matrix, H))))
-            for x in range(3, num_levels + 2):
-                # Removed this transpose: dot(distance_matrix.transpose(), H)
-                tmp = dot(distance_matrix, dot(distance_matrix, H))
-
-                # Removed this transpose: dot(distance_matrix.transpose(), tmp)
-                H = hstack(
-                    (H, dot(distance_matrix, dot(distance_matrix, tmp))))
-
-        # Using the pivoted QR-decomposition, form a real m * ((i+1)l) matrix Q
-        # whose columns are orthonormal, s.t. there exists a real
-        # ((i+1)l) * ((i+1)l) matrix R for which H = QR
-        Q, R = qr(H)
-
-        # Compute the n * ((i+1)l) product matrix T = A^T Q
-        # Removed transpose of distance_matrix for performance
-        T = dot(distance_matrix, Q)  # step 3
-
-        # Form an SVD of T
-        Vt, St, W = svd(T, full_matrices=False)
-        W = W.transpose()
-
-        # Compute the m * ((i+1)l) product matrix
-        Ut = dot(Q, W)
-
-        if m < n:
-            # V_fsvd = Ut[:, :num_dimensions_out]  # unused
-            U_fsvd = Vt[:, :num_dimensions_out]
-        else:
-            # V_fsvd = Vt[:, :num_dimensions_out]  # unused
-            U_fsvd = Ut[:, :num_dimensions_out]
-
-        S = St[:num_dimensions_out] ** 2
-
-        # drop imaginary component, if we got one
-        eigenvalues = S.real
-        eigenvectors = U_fsvd.real
-
-        return eigenvectors, eigenvalues
+        results = skbio_pcoa(DistanceMatrix(distance_matrix),
+                             method='fsvd',
+                             number_of_dimensions=num_dimensions_out)
+        return results.samples.values, results.eigvals
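
For context, this is roughly how the new code path is exercised. A minimal usage sketch, assuming scikit-bio 0.5.3 or later (where pcoa() accepts method='fsvd'); the toy matrix and variable names are illustrative, not from this repository:

import numpy as np
from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa

# A small symmetric, hollow distance matrix, as FSVD.run() expects.
dists = np.array([[0.0, 1.0, 2.0],
                  [1.0, 0.0, 1.5],
                  [2.0, 1.5, 0.0]])

results = pcoa(DistanceMatrix(dists), method='fsvd', number_of_dimensions=2)
coordinates = results.samples.values  # one row of coordinates per sample
eigenvalues = results.eigvals         # eigenvalues approximated by fsvd

Delegating to scikit-bio replaces the hand-rolled randomized SVD removed above, so the approximation logic is maintained upstream rather than in mdsa.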
4 changes: 2 additions & 2 deletions mdsa/algorithms/nystrom.py
@@ -40,15 +40,15 @@
     / E | F \
 D = |-----|----|
-    \ F.t | G /
+    \\ F.t | G /
 
 The corresponding association matrix or centered inner-product
 matrix K is:
 
     / A | B \
 K = |-----|----|
-    \ B.t | C /
+    \\ B.t | C /
 
 where A and B are computed as follows
4 changes: 2 additions & 2 deletions mdsa/algorithms/scmds.py
@@ -220,8 +220,8 @@ def affine_mapping(self, matrix_x, matrix_y):
 Affine mapping function:
     Y = UX + kron(b,ones(1,n)), UU' = I
-    X = [x_1, x_2, ... , x_n]; x_j \in R^m
-    Y = [y_1, y_2, ... , y_n]; y_j \in R^m
+    X = [x_1, x_2, ... , x_n]; x_j \\in R^m
+    Y = [y_1, y_2, ... , y_n]; y_j \\in R^m
 
 From Tzeng 2008:
 `The projection of xi,2 from q2 dimension to q1 dimension
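
These docstring edits (here and in nystrom.py above) double the backslashes because flake8 reports W605 (invalid escape sequence) when a plain string contains backslash sequences Python does not recognize, such as \S, \in, or a backslash before a space. A short illustration of the behavior, not code from this repository:

# '\S' is not a recognized escape: flake8 flags it as W605 and newer
# Pythons emit a DeprecationWarning. Doubling the backslash (or using
# a raw string) stores a literal backslash instead.
plain = "see Legendre & Legendre 1998, \\S 9.2.3"  # stored text: ...\S 9.2.3
raw = r"see Legendre & Legendre 1998, \S 9.2.3"    # same stored text
assert plain == raw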
17 changes: 9 additions & 8 deletions mdsa/pcoa.py
@@ -64,7 +64,7 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
     If the distance is not euclidean (for example if it is a
     semimetric and the triangle inequality doesn't hold),
     negative eigenvalues can appear. There are different ways
-    to deal with that problem (see Legendre & Legendre 1998, \S
+    to deal with that problem (see Legendre & Legendre 1998, \\S
     9.2.3), but none are currently implemented here.
     However, a warning is raised whenever negative eigenvalues
     appear, allowing the user to decide if they can be safely
@@ -90,10 +90,6 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
     eigenvectors = np.array(eigenvectors)
     eigenvalues = np.array(eigenvalues)
 
-    # Ensure eigenvectors are normalized
-    eigenvectors = np.apply_along_axis(lambda vec: vec / np.linalg.norm(vec),
-                                       axis=1, arr=eigenvectors)
-
     # Generate axis labels for output
     axis_labels = ['PC%d' % i for i in range(1, len(eigenvectors) + 1)]
     axis_labels = axis_labels[:num_dimensions_out]
@@ -170,6 +166,13 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
     # operation.
     eigenvectors = eigenvectors * np.sqrt(eigenvalues)
 
+    # Sum of all eigenvalues, computed via the trace of the centered
+    # distance matrix rather than by summing only the computed ones:
+    # proportion_explained = eigenvalues / eigenvalues.sum()
+    sum_eigenvalues = np.trace(centered_dm)
+
+    # Calculate the array of proportion of variance explained
+    proportion_explained = eigenvalues / sum_eigenvalues
+
     # Now remove the dimensions with the least information
     # Only select k (num_dimensions_out) first eigenvectors
     # and their corresponding eigenvalues from the sorted array
@@ -178,9 +181,7 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
     if len(eigenvalues) > num_dimensions_out:
         eigenvectors = eigenvectors[:, :num_dimensions_out]
         eigenvalues = eigenvalues[:num_dimensions_out]
-
-    # Calculate the array of proportion of variance explained
-    proportion_explained = eigenvalues / eigenvalues.sum()
+        proportion_explained = proportion_explained[:num_dimensions_out]
 
     return OrdinationResults(
         short_method_name='PCoA',
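
The proportion-explained fix matters because eigenvalues.sum() only sums the eigenvalues an algorithm actually returned; approximate methods such as FSVD compute just the top k, which inflated every proportion. The trace of the double-centered matrix equals the sum of all its eigenvalues, so it is the correct denominator no matter how many were computed. (The dropped re-normalization above was presumably removed for a related reason: it conflicted with the sqrt(eigenvalue) scaling applied to the coordinates.) A quick check of the trace identity, as an illustrative sketch assuming the usual Gower centering; dists and the helper names are not from this repository:

import numpy as np

dists = np.array([[0.0, 1.0, 2.0],
                  [1.0, 0.0, 1.5],
                  [2.0, 1.5, 0.0]])

# Gower double-centering: -0.5 * J @ D**2 @ J, with J = I - (1/n) * ones
n = dists.shape[0]
J = np.eye(n) - np.ones((n, n)) / n
centered_dm = -0.5 * J @ (dists ** 2) @ J

eigenvalues = np.linalg.eigvalsh(centered_dm)
# The trace equals the sum of all eigenvalues (up to floating-point error),
# so proportions computed against it stay valid after truncating to k axes.
assert np.isclose(np.trace(centered_dm), eigenvalues.sum())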
