Skip to content

Commit

Permalink
Merge branch 'master' of github.com:biocore/mds-approximations into t…
Browse files Browse the repository at this point in the history
…est_data_file_cleanup
  • Loading branch information
HannesHolste committed Oct 3, 2019
2 parents 53b35c8 + 6f52697 commit 9aa2c90
Show file tree
Hide file tree
Showing 10 changed files with 198 additions and 104 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
bayesian-regression/

# OS generated files
.DS_Store
.DS_Store?
Expand Down Expand Up @@ -53,4 +55,4 @@ pip-log.txt
# Generated PCoA output of algorithms
out/*

.env
.env
12 changes: 6 additions & 6 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,23 @@
sudo: false
language: python
env:
- PYTHON_VERSION=3
- PYTHON_VERSION=3.6
before_install:
- wget http://repo.continuum.io/miniconda/Miniconda3-3.7.3-Linux-x86_64.sh -O miniconda.sh
- wget http://repo.continuum.io/miniconda/Miniconda3-4.6.14-Linux-x86_64.sh -O miniconda.sh
- chmod +x miniconda.sh
- ./miniconda.sh -b
- export PATH=/home/travis/miniconda3/bin:$PATH
# Update conda itself
- conda update --yes conda
install:
- conda env create -n mds-approximations -f environment.yml
- conda install --yes cython nose pep8 flake8 pip
#- if [ ${USE_CYTHON} ]; then conda install --yes -n mds-approximations cython; fi
- conda env create -n mds-approximations -f environment.yml python=3.6
- source activate mds-approximations
- conda install --yes cython nose pep8 flake8 pip
# - if [ ${USE_CYTHON} ]; then conda install --yes cython; fi
- pip install .
- pip install coveralls
script:
- nosetests --with-coverage --cover-package=mdsa
- flake8 setup.py mdsa
after_success:
- coveralls
- coveralls
Binary file added bayesian_regression-0.1-py3-none-any.whl
Binary file not shown.
8 changes: 4 additions & 4 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ channels:
- bioconda
- biocore
dependencies:
- scikit-bio=0.4.2
- click=6.7
- python=3
- scikit-bio
- click
- numpy
- scikit-learn=0.19.1
- scikit-learn
- scipy
- pandas
- jupyter
- matplotlib
- python=3.7
86 changes: 6 additions & 80 deletions mdsa/algorithms/fsvd.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from numpy import dot, hstack
from numpy.linalg import qr, svd
from numpy.random import standard_normal
from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa as skbio_pcoa

from mdsa.algorithm import Algorithm

Expand Down Expand Up @@ -47,80 +46,7 @@ def run(self, distance_matrix, num_dimensions_out=10,
"""
super(FSVD, self).run(distance_matrix, num_dimensions_out)

m, n = distance_matrix.shape

# Note: this transpose is removed for performance, since we
# only expect square matrices.
# Take (conjugate) transpose if necessary, because it makes H smaller,
# leading to faster computations
# if m < n:
# distance_matrix = distance_matrix.transpose()
# m, n = distance_matrix.shape
if m != n:
raise ValueError('FSVD.run(...) expects square distance matrix')

k = num_dimensions_out + 2

# Form a real nxl matrix G whose entries are independent,
# identically distributed
# Gaussian random variables of
# zero mean and unit variance
G = standard_normal(size=(n, k))

if use_power_method:
# use only the given exponent
H = dot(distance_matrix, G)

for x in range(2, num_levels + 2):
# enhance decay of singular values
# note: distance_matrix is no longer transposed, saves work
# since we're expecting symmetric, square matrices anyway
# (Daniel McDonald's changes)
H = dot(distance_matrix, dot(distance_matrix, H))

else:
# compute the m x l matrices H^{(0)}, ..., H^{(i)}
# Note that this is done implicitly in each iteration below.
H = dot(distance_matrix, G)
# Again, removed transpose: dot(distance_matrix.transpose(), H)
# to enhance performance
H = hstack(
(H, dot(distance_matrix, dot(distance_matrix, H))))
for x in range(3, num_levels + 2):
# Removed this transpose: dot(distance_matrix.transpose(), H)
tmp = dot(distance_matrix, dot(distance_matrix, H))

# Removed this transpose: dot(distance_matrix.transpose(), tmp)
H = hstack(
(H, dot(distance_matrix, dot(distance_matrix, tmp))))

# Using the pivoted QR-decomposiion, form a real m * ((i+1)l) matrix Q
# whose columns are orthonormal, s.t. there exists a real
# ((i+1)l) * ((i+1)l) matrix R for which H = QR
Q, R = qr(H)

# Compute the n * ((i+1)l) product matrix T = A^T Q
# Removed transpose of distance_matrix for performance
T = dot(distance_matrix, Q) # step 3

# Form an SVD of T
Vt, St, W = svd(T, full_matrices=False)
W = W.transpose()

# Compute the m * ((i+1)l) product matrix
Ut = dot(Q, W)

if m < n:
# V_fsvd = Ut[:, :num_dimensions_out] # unused
U_fsvd = Vt[:, :num_dimensions_out]
else:
# V_fsvd = Vt[:, :num_dimensions_out] # unused
U_fsvd = Ut[:, :num_dimensions_out]

S = St[:num_dimensions_out] ** 2

# drop imaginary component, if we got one
eigenvalues = S.real
eigenvectors = U_fsvd.real

return eigenvectors, eigenvalues
results = skbio_pcoa(DistanceMatrix(distance_matrix),
method='fsvd',
number_of_dimensions=num_dimensions_out)
return results.samples.values, results.eigvals
4 changes: 2 additions & 2 deletions mdsa/algorithms/nystrom.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@
/ E | F \
D = |-----|----|
\ F.t | G /
\\ F.t | G /
The correspondong association matrix or centered inner-product
matrix K is:
/ A | B \
K = |-----|----|
\ B.t | C /
\\ B.t | C /
where A and B are computed as follows
Expand Down
4 changes: 2 additions & 2 deletions mdsa/algorithms/scmds.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,8 @@ def affine_mapping(self, matrix_x, matrix_y):
Affine mapping function:
Y = UX + kron(b,ones(1,n)), UU' = I
X = [x_1, x_2, ... , x_n]; x_j \in R^m
Y = [y_1, y_2, ... , y_n]; y_j \in R^m
X = [x_1, x_2, ... , x_n]; x_j \\in R^m
Y = [y_1, y_2, ... , y_n]; y_j \\in R^m
From Tzeng 2008:
`The projection of xi,2 from q2 dimension to q1 dimension
Expand Down
17 changes: 9 additions & 8 deletions mdsa/pcoa.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
If the distance is not euclidean (for example if it is a
semimetric and the triangle inequality doesn't hold),
negative eigenvalues can appear. There are different ways
to deal with that problem (see Legendre & Legendre 1998, \S
to deal with that problem (see Legendre & Legendre 1998, \\S
9.2.3), but none are currently implemented here.
However, a warning is raised whenever negative eigenvalues
appear, allowing the user to decide if they can be safely
Expand All @@ -90,10 +90,6 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
eigenvectors = np.array(eigenvectors)
eigenvalues = np.array(eigenvalues)

# Ensure eigenvectors are normalized
eigenvectors = np.apply_along_axis(lambda vec: vec / np.linalg.norm(vec),
axis=1, arr=eigenvectors)

# Generate axis labels for output
axis_labels = ['PC%d' % i for i in range(1, len(eigenvectors) + 1)]
axis_labels = axis_labels[:num_dimensions_out]
Expand Down Expand Up @@ -170,6 +166,13 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
# operation.
eigenvectors = eigenvectors * np.sqrt(eigenvalues)

# Calculate the array of proportion of variance explained
# proportion_explained = eigenvalues / eigenvalues.sum()
sum_eigenvalues = np.trace(centered_dm)

# Calculate the array of proportion of variance explained
proportion_explained = eigenvalues / sum_eigenvalues

# Now remove the dimensions with the least information
# Only select k (num_dimensions_out) first eigenvectors
# and their corresponding eigenvalues from the sorted array
Expand All @@ -178,9 +181,7 @@ def pcoa(distance_matrix, algorithm, num_dimensions_out=10):
if len(eigenvalues) > num_dimensions_out:
eigenvectors = eigenvectors[:, :num_dimensions_out]
eigenvalues = eigenvalues[:num_dimensions_out]

# Calculate the array of proportion of variance explained
proportion_explained = eigenvalues / eigenvalues.sum()
proportion_explained = proportion_explained[:num_dimensions_out]

return OrdinationResults(
short_method_name='PCoA',
Expand Down
164 changes: 164 additions & 0 deletions scripts/randdm
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#!/usr/bin/env python

import os

import click
import numpy as np
from skbio import DistanceMatrix
from skbio.stats.distance import randdm
from bayesian_regression.util.generators import band_table, block_table
from skbio.diversity import beta_diversity


@click.command()
@click.option('--structure', 'structure', type=str,
help='Optional. Generate structure in the random data that '
'models distributions of microbial communities. '
'Options: "band" or "block". If not specified, then '
'draws from gaussian distribution to generate distance '
'matrices.', default=None)
@click.option('--structure-distance-metric', 'distance_metric', type=str,
help='Distance metric to use when generating distance matrix'
' using the --structure flag. "braycurtis" by default',
default='braycurtis')
@click.option('--dim', 'dimensions', type=int, multiple=True,
help='Dimension of the input matrix/matrices to generate')
@click.option('--seed', 'seed', type=int,
help='Random number generator seed value.')
@click.option('--sub', 'subsample_dims', type=str, multiple=True,
help='Generate subsampled distance matrix with given'
'dimension (must be smaller than original dimension)'
' for each randomly generated distance matrix.'
' Can specify as int or percentage')
@click.option('--overwrite', 'overwrite', type=bool, default=False,
help='Overwrite output directory if it already exists. Do not '
'overwrite, by default.')
@click.argument('output_dir', type=click.Path(), default=None)
def generate(dimensions, output_dir, seed, distance_metric, subsample_dims,
structure, overwrite):
"""
Generate random distance matrix. By default, generates a random distance
matrix drawing from a gaussian probability distribution. Since realistic
microbiome OTU tables and distance matrices typically do not follow such
a probability distribution, as they often contain "structure", i.e.
discrete block-like patterns or patterns along a gradient, another
option is provided to generate more realistic microbiome data that mimics
block or band-like structure, e.g. block-like could represent one grou of
microbes associated with a diseased state and another set associated with a
healthy state.
Parameters
----------
dimensions : number
Dimension of the input matrix/matrices to generate.
output_dir : str
Path to the directory to output results to.
seed : number
Random seed for generating distance matrices. Optional.
distance_metric : str
Distance metric to use when generating distance matrix
using the --structure flag. "braycurtis" by default.
See skbio.diversity.beta_diversity for all accepted parameters.
subsample_dims : list of str
Generate subsampled distance matrix with given dimension
(must be smaller than original dimension) for each randomly
generated distance matrix. Can specify as int denoting specific
dimension or as percentage of, for example '25%', denoting percentage
of each main dimension specified in the 'dimensions' parameter.
structure : bool
Optional. Generate structure in the random data that
models distributions of microbial communities.
Options: "band" or "block". If not specified, then
draws from gaussian distribution to generate distance
matrices.
overwrite : bool
False by default. If True, overwrites output_dir if it already exists.
"""

if seed is not None:
np.random.seed(seed)

if (not overwrite and os.path.exists(output_dir) and not click.confirm(
'The output directory %s exists. '
'Do you want to overwrite?' % output_dir)):
click.echo('Aborting.')
return

if not os.path.exists(output_dir):
os.makedirs(output_dir)

for dim in dimensions:
outpath = os.path.join(output_dir, 'randdm-{}.txt'.format(dim))

click.echo('Generating random distance matrix: %s' % outpath)

# Generate random distance matrix
if structure is not None:
click.echo('Generating synthetic OTU table with "{}"-like '
'microbial distribution...'.format(structure))
if structure == 'band':
biom_table = band_table(dim, dim, seed=seed)[0]
elif structure == 'block':
biom_table = block_table(dim, dim, seed=seed)[0]
else:
raise ValueError('Invalid value for --structure parameter.')

otu_table = biom_table.matrix_data.todense()

click.echo('Generating distance matrix from OTU table...')
distance_matrix = beta_diversity(distance_metric, otu_table,
validate=False)

else:
click.echo('Generating random distance matrix '
'(gaussian distribution)')
distance_matrix = randdm(dim, constructor=DistanceMatrix)

# Serialize distance matrix
distance_matrix.write(outpath)

# Subsampling
for subsample_dim in subsample_dims:
# Parse parameter values into integers
try:
if '%' in subsample_dim:
percent = float(subsample_dim[:-1]) / 100
subsample_dim = int(percent * dim)
else:
subsample_dim = int(subsample_dim)
except ValueError:
click.echo('Format for --sub parameter is incorrect. Please '
're-run with --help for instructions on accepted '
'formats for this parameter.')
return

if subsample_dim <= 0:
raise ValueError('Subsample dimension must be greater than 0')

if subsample_dim >= dim:
raise ValueError('Subsample dimension %d must be smaller than'
'original matrix dimension %d' % (
subsample_dim,
dim))
subsampled_outpath = os.path.join(output_dir,
'randdm-{}-sub-{}.txt'
.format(dim, subsample_dim))

click.echo(
'Subsampling original randomly generated distance matrix '
'with subsample dimension %d: %s' % (subsample_dim,
subsampled_outpath))

ids = distance_matrix.ids
# Subsample without replacement
subsampled_ids = np.random.choice(ids, subsample_dim,
replace=False)
subsampled_matrix = distance_matrix.filter(subsampled_ids)
subsampled_matrix.write(subsampled_outpath)

click.echo('Done.')


if __name__ == '__main__':
generate()

0 comments on commit 9aa2c90

Please sign in to comment.