-
Notifications
You must be signed in to change notification settings - Fork 17
/
optspace.py
209 lines (156 loc) · 6.9 KB
/
optspace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import numpy as np
from deicode._optspace import optspace
from .base import _BaseImpute
from scipy.spatial import distance
import warnings
class OptSpace(_BaseImpute):
def __init__(self, rank=2, iteration=5, tol=1e-5):
"""
OptSpace is a matrix completion algorithm based on a singular value
decomposition (SVD) optimized on a local manifold. It has been shown to
be quite robust to noise in low rank datasets (1).
The objective function that it is trying to optimize
over is given by:
min(P|(Y-U*S*V^{T})|_{2}^{2}
U and V are matrices that are trying to be estimated and S
is analogous to a matrix of eigenvalues. Y are the
observed values and P is a function such that
the errors between Y and USV are only computed
on the nonzero entries.
Parameters
----------
X: numpy.ndarray - a rclr preprocessed matrix of shape (M,N)
N = Features (i.e. OTUs, metabolites)
M = Samples
rank: int, optional : Default is 2
The underlying rank of the default set
to 2 as the default to prevent overfitting.
iteration: int, optional : Default is 5
The number of convex iterations to optimize the solution
If iteration is not specified, then the default iteration is 5.
Which redcues to a satisfactory error threshold.
tol: float, optional : Default is 1e-5
Error reduction break, if the error reduced is
less than this value it will return the solution
Returns
-------
U: numpy.ndarray - "Sample Loadings" or the unitary matrix
having left singular vectors as columns. Of shape (M,rank)
s: numpy.ndarray - The singular values,
sorted in non-increasing order. Of shape (rank,rank).
V: numpy.ndarray - "Feature Loadings" or Unitary matrix
having right singular vectors as rows. Of shape (N,rank)
solution: numpy.ndarray - (U*S*V.transpose()) of shape (M,N)
distance: numpy.ndarray - Distance between each
pair of the two collections of inputs. Of shape (M,M)
Raises
------
ValueError
Raises an error if given rank is less than 2
`ValueError: rank must be at least 2`.
Raises an error iterations is less than one
`ValueError: iterations must be at least 1`.
Raises an error if input is not either dataframe or np.ndarray
`ValueError: Input data is should be type numpy.ndarray`.
Raises an error if input data does not contain any nans or zeros
`ValueError: Data-table contains no missing
data in the format np.nan or 0`.
Raises an error if input data contains infs
`ValueError: Data-table contains either np.inf or -np.inf`.
Raises an error if input data and rank violates min(M,N)<rank
`ValueError: The rank must be significantly less than the
minimum shape of the input table`.
Raises an error if rank*10> M(Samples)
`ValueError: There are not sufficient samples to run
must have rank*10 samples in the table`.
References
----------
.. [1] Keshavan RH, Oh S, Montanari A. 2009. Matrix completion
from a few entries (2009_ IEEE International
Symposium on Information Theory
Examples
--------
>>> from deicode.optspace import OptSpace
>>> from deicode.preprocessing import rclr
>>> import numpy as np
rclr preprocessing
data is numpy.ndarray
- a array of counts (samples,features)
(with shape (M,N) where N>M)
>>> data=np.array([[3, 3, 0], [0, 4, 2], [3, 0, 1]])
>>> table_rclr=rclr().fit_transform(data)
OptSpace (RPCA)
>>> opt=OptSpace().fit(table_rclr)
numpy.ndarray - "Sample Loadings"
>>> U=opt.sample_weights
numpy.ndarray - "Feature Loadings"
>>> V=opt.feature_weights
numpy.ndarray - The singular values
>>> s=opt.s
numpy.ndarray - (U*S*V.transpose()) of shape (M,N)
>>> result=opt.solution
or
>>> U,s,V=OptSpace().fit_transform(table_rclr)
numpy.ndarray - fully dense (no zeros) of shape (M,N)
>>> result=np.dot(np.dot(U,s),V.T)
"""
self.rank = rank
self.iteration = iteration
self.tol = tol
if self.rank < 2:
raise ValueError("rank must be at least 2")
if self.iteration < 1:
raise ValueError("iterations must be at least 1")
return
def fit(self, X):
"""
Fit the model to X_sparse
"""
X_sparse = X.copy().astype(np.float64)
self.X_sparse = X_sparse
self._fit()
return self
def _fit(self):
# make copy for imputation, check type
X_sparse = self.X_sparse
if not isinstance(X_sparse, np.ndarray):
X_sparse = np.array(X_sparse)
if not isinstance(X_sparse, np.ndarray):
raise ValueError('Input data is should be type numpy.ndarray')
if (np.count_nonzero(X_sparse) == 0 and
np.count_nonzero(~np.isnan(X_sparse)) == 0):
raise ValueError('No missing data in the format np.nan or 0')
if np.count_nonzero(np.isinf(X_sparse)) != 0:
raise ValueError('Contains either np.inf or -np.inf')
if self.rank > np.min(X_sparse.shape):
raise ValueError('rank must be less than the minimum shape')
if self.rank * 10 > np.min(X_sparse.shape):
warnings.warn(
'Insufficient samples, must have rank*10 samples in the table')
# return solved matrix
self.U, self.s, self.V, _ = optspace(X_sparse, r=self.rank,
niter=self.iteration,
tol=self.tol)
# save the solution (of the imputation)
self.solution = self.U.dot(self.s).dot(self.V.T)
self.eigenvalues = np.diag(self.s)
self.explained_variance_ratio = list(
self.eigenvalues / self.eigenvalues.sum())
self.distance = distance.cdist(self.U, self.U)
self.feature_weights = self.V
self.sample_weights = self.U
def fit_transform(self, X):
"""
Returns the final SVD of
U: numpy.ndarray - "Sample Loadings" or the
unitary matrix having left singular
vectors as columns. Of shape (M,rank)
s: numpy.ndarray - The singular values,
sorted in non-increasing order. Of shape (rank,rank).
V: numpy.ndarray - "Feature Loadings" or Unitary matrix
having right singular vectors as rows. Of shape (N,rank)
"""
X_sparse = X.copy().astype(np.float64)
self.X_sparse = X_sparse
self._fit()
return self.sample_weights, self.s, self.feature_weights