/
matrix_completion.py
169 lines (131 loc) · 5.63 KB
/
matrix_completion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# ----------------------------------------------------------------------------
# Copyright (c) 2019--, gemelli development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
import numpy as np
from gemelli.optspace import OptSpace
from .base import _BaseImpute
from scipy.spatial import distance
class MatrixCompletion(_BaseImpute):
def __init__(self, n_components=2, max_iterations=5, tol=1e-5):
"""
This form of matrix completion uses OptSpace (1). Furthermore,
here we directly interpret the loadings generated from matrix
completion as a dimensionality reduction.
Parameters
----------
X: numpy.ndarray - a rclr preprocessed matrix of shape (M,N)
N = Features (i.e. OTUs, metabolites)
M = Samples
n_components: int, optional : Default is 2
The underlying rank of the default set
to 2 as the default to prevent overfitting.
max_iterations: int, optional : Default is 5
The number of convex iterations to optimize the solution
If iteration is not specified, then the default iteration is 5.
Which redcues to a satisfactory error threshold.
tol: float, optional : Default is 1e-5
Error reduction break, if the error reduced is
less than this value it will return the solution
Returns
-------
U: numpy.ndarray - "Sample Loadings" or the unitary matrix
having left singular vectors as columns. Of shape (M,n_components)
s: numpy.ndarray - The singular values,
sorted in non-increasing order. Of shape (n_components,n_components).
V: numpy.ndarray - "Feature Loadings" or Unitary matrix
having right singular vectors as rows. Of shape (N,n_components)
solution: numpy.ndarray - (U*S*V.transpose()) of shape (M,N)
distance: numpy.ndarray - Distance between each
pair of the two collections of inputs. Of shape (M,M)
Raises
------
ValueError
ValueError
`ValueError: n_components must be at least 2`.
ValueError
`ValueError: max_iterations must be at least 1`.
ValueError
`ValueError: Data-table contains either np.inf or -np.inf`.
ValueError
`ValueError: The n_components must be less
than the minimum shape of the input table`.
References
----------
.. [1] Keshavan RH, Oh S, Montanari A. 2009. Matrix completion
from a few entries (2009_ IEEE International
Symposium on Information Theory
Examples
--------
TODO
"""
self.n_components = n_components
self.max_iterations = max_iterations
self.tol = tol
return
def fit(self, X, y=None):
"""
Fit the model to X_sparse
"""
X_sparse = X.copy().astype(np.float64)
self.X_sparse = X_sparse
self._fit()
return self
def _fit(self):
# make copy for imputation, check type
X_sparse = self.X_sparse
n, m = X_sparse.shape
# make sure the data is sparse (otherwise why?)
if np.count_nonzero(np.isinf(X_sparse)) != 0:
raise ValueError('Contains either np.inf or -np.inf')
# test n-iter
if self.max_iterations < 1:
raise ValueError("max_iterations must be at least 1")
# check the settings for n_components
if isinstance(self.n_components, str) and \
self.n_components.lower() == 'auto':
# estimate the rank of the matrix
self.n_components = 'auto'
# check hardset values
elif isinstance(self.n_components, int):
if self.n_components > (min(n, m) - 1):
raise ValueError("n-components must be at most"
" 1 minus the min. shape of the"
" input matrix.")
if self.n_components < 2:
raise ValueError("n-components must "
"be at least 2")
# otherwise rase an error.
else:
raise ValueError("n-components must be "
"an interger or 'auto'")
# return solved matrix
self.U, self.s, self.V = OptSpace(n_components=self.n_components,
max_iterations=self.max_iterations,
tol=self.tol).solve(X_sparse)
# save the solution (of the imputation)
self.solution = self.U.dot(self.s).dot(self.V.T)
self.eigenvalues = np.diag(self.s)
self.explained_variance_ratio = list(
self.eigenvalues / self.eigenvalues.sum())
self.distance = distance.cdist(self.U, self.U)
self.feature_weights = self.V
self.sample_weights = self.U
def fit_transform(self, X, y=None):
"""
Returns the final SVD of
U: numpy.ndarray - "Sample Loadings" or the
unitary matrix having left singular
vectors as columns. Of shape (M,n_components)
s: numpy.ndarray - The singular values,
sorted in non-increasing order. Of shape (n_components,n_components).
V: numpy.ndarray - "Feature Loadings" or Unitary matrix
having right singular vectors as rows. Of shape (N,n_components)
"""
X_sparse = X.copy().astype(np.float64)
self.X_sparse = X_sparse
self._fit()
return self.sample_weights