import sys
import numbers
import warnings
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y
from sklearn.exceptions import DataConversionWarning
from .pybind_cabess import pywrap_GLM
from .utilities import categorical_to_dummy
class bess_base(BaseEstimator):
r"""
Parameters
----------
path_type : {"seq", "gs"}, optional, default="seq"
The method to be used to select the optimal support size.
- For path_type = "seq", we solve the best subset selection
problem for each size in support_size.
- For path_type = "gs", we solve the best subset selection
problem with support size ranged in (s_min, s_max), where the
specific support size to be considered is
determined by golden section.
support_size : array-like, optional
default=range(min(p, int(n / (log(log(n)) * log(p))))).
An integer vector representing the alternative support sizes.
Only used when path_type = "seq".
s_min : int, optional, default=0
The lower bound of the golden-section search for sparsity.
s_max : int, optional, default=min(p, int(n / (log(log(n)) * log(p)))).
The upper bound of the golden-section search for sparsity.
group : int, optional, default=np.ones(p)
The group index for each variable.
alpha : float, optional, default=0
Constant that multiplies the L2 term in the loss function, controlling
regularization strength. It should be non-negative.
- If alpha = 0, it indicates ordinary least squares.
fit_intercept : bool, optional, default=True
Whether to consider intercept in the model. We assume that the data
has been centered if fit_intercept=False.
ic_type : {'aic', 'bic', 'gic', 'ebic', 'loss'}, optional, default='ebic'
The type of criterion for choosing the support size if `cv=1`.
The full name of each option:
- 'aic': Akaike information criterion
- 'bic': Bayesian information criterion
- 'gic': Generalized information criterion (see [2-4]); it corresponds to the special information criterion (SIC) in [1].
- 'ebic': Extended Bayesian information criterion [5]
- 'loss': Loss value
ic_coef : float, optional, default=1.0
Constant that controls the regularization strength
on chosen information criterion.
cv : int, optional, default=1
The number of folds used for cross-validation.
- If cv=1, cross-validation is not used.
- If cv>1, the support size is chosen by the CV test loss
instead of an information criterion.
cv_score : {'test_loss', ...}, optional, default='test_loss'
The score used on test data for CV.
- All methods support {'test_loss'}.
- LogisticRegression also supports {'roc_auc'}.
- MultinomialRegression also supports {'roc_auc_ovo', 'roc_auc_ovr'},
which use the "one-vs-one" and "one-vs-rest" strategies, respectively.
thread : int, optional, default=1
Maximum number of threads to use.
- If thread = 0, the maximum number of threads supported by
the device will be used.
A_init : array-like, optional, default=None
Initial active set before the first splicing.
always_select : array-like, optional, default=None
An array containing the indexes of variables that are always
kept in the model. For group selection,
it should contain group indexes (starting from 0).
max_iter : int, optional, default=20
Maximum number of iterations allowed for the
splicing algorithm.
Because each splicing step reduces the loss, the algorithm is
guaranteed to converge; the iteration cap merely simplifies
the implementation.
is_warm_start : bool, optional, default=True
When tuning the optimal parameter combination,
whether to use the last solution
as a warm start to accelerate the iterative
convergence of the splicing algorithm.
screening_size : int, optional, default=-1
The number of variables remaining after screening.
It should be a non-negative number smaller than p,
but larger than any value in support_size.
- If screening_size=-1, screening will not be used.
- If screening_size=0, screening_size will be set as
:math:`\\min(p, int(n / (\\log(\\log(n))\\log(p))))`.
primary_model_fit_max_iter : int, optional, default=10
The maximum number of iterations for primary_model_fit.
primary_model_fit_epsilon : float, optional, default=1e-08
The convergence threshold (epsilon) for primary_model_fit.
Attributes
----------
coef_ : array-like, shape(p_features, ) or (p_features, M_responses)
Estimated coefficients for the best subset selection problem.
intercept_ : float or array-like, shape(M_responses,)
The intercept in the model when fit_intercept=True.
train_loss_ : float
The loss on training data.
eval_loss_ : float
- If cv=1, it stores the score under chosen information criterion.
- If cv>1, it stores the test loss under cross-validation.
References
----------
- [1] Zhu, J., Wen, C., Zhu, J., Zhang, H., & Wang, X. (2020). A polynomial algorithm for best-subset selection problem. Proceedings of the National Academy of Sciences, 117(52), 33117-33123.
- [2] Tang, B., Zhu, J., Zhu, J., Wang, X., & Zhang, H. (2023). A Consistent and Scalable Algorithm for Best Subset Selection in Single Index Models. arXiv preprint arXiv:2309.06230.
- [3] Zhu, J., Zhu, J., Tang, B., Chen, X., Lin, H., & Wang, X. (2023). Best-subset selection in generalized linear models: A fast and consistent algorithm via splicing technique. arXiv preprint arXiv:2308.00251.
- [4] Zhang, Y., Zhu, J., Zhu, J., & Wang, X. (2023). A splicing approach to best subset of groups selection. INFORMS Journal on Computing, 35(1), 104-119.
- [5] Chen, J., & Chen, Z. (2008). Extended Bayesian information criteria for model selection with large model spaces. Biometrika, 95(3), 759-771.
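Examples
--------
A minimal usage sketch. ``LinearRegression`` below stands for any concrete
subclass that fixes ``algorithm_type``, ``model_type`` and ``normalize_type``;
the class name is illustrative only and is not defined in this module.
>>> import numpy as np
>>> np.random.seed(0)
>>> X = np.random.randn(100, 10)
>>> y = X[:, 0] - 2 * X[:, 1] + 0.1 * np.random.randn(100)
>>> model = LinearRegression(support_size=range(5))  # hypothetical subclass
>>> model.fit(X, y)  # doctest: +SKIP
>>> model.coef_  # doctest: +SKIP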
"""
# attributes
coef_ = None
intercept_ = None
train_loss_ = 0
eval_loss_ = 0
def __init__(
self,
algorithm_type,
model_type,
normalize_type,
path_type="seq",
support_size=None,
s_min=None,
s_max=None,
group=None,
alpha=None,
fit_intercept=True,
ic_type="ebic",
ic_coef=1.0,
cv=1,
cv_score="test_loss",
thread=1,
A_init=None,
always_select=None,
max_iter=20,
exchange_num=5,
is_warm_start=True,
splicing_type=0,
important_search=0,
screening_size=-1,
primary_model_fit_max_iter=10,
primary_model_fit_epsilon=1e-8,
approximate_Newton=False,
covariance_update=False,
# lambda_min=None, lambda_max=None,
# early_stop=False, n_lambda=100,
baseline_model=None,
_estimator_type=None
):
self.algorithm_type = algorithm_type
self.model_type = model_type
self.normalize_type = normalize_type
self.path_type = path_type
self.max_iter = max_iter
self.exchange_num = exchange_num
self.is_warm_start = is_warm_start
self.support_size = support_size
self.alpha = alpha
self.fit_intercept = fit_intercept
self.n_features_in_: int
self.n_iter_: int
self.s_min = s_min
self.s_max = s_max
self.A_init = A_init
self.group = group
# self.lambda_min = None
# self.lambda_max = None
# self.n_lambda = 100
self.ic_type = ic_type
self.ic_coef = ic_coef
self.cv = cv
self.cv_score = cv_score
self.screening_size = screening_size
self.always_select = always_select
self.primary_model_fit_max_iter = primary_model_fit_max_iter
self.primary_model_fit_epsilon = primary_model_fit_epsilon
# self.early_stop = False
self.approximate_Newton = approximate_Newton
self.thread = thread
self.covariance_update = covariance_update
self.splicing_type = splicing_type
self.important_search = important_search
self.baseline_model = baseline_model
self._estimator_type = _estimator_type
self.classes_: np.ndarray
def fit(self,
X=None,
y=None,
is_normal=True,
sample_weight=None,
cv_fold_id=None,
sparse_matrix=False,
beta_low=None,
beta_high=None):
r"""
The fit function passes the training data to the solver
and returns the fitted estimator.
Parameters
----------
X : array-like of shape(n_samples, p_features)
Training data matrix. It should be a numpy array.
y : array-like of shape(n_samples,) or (n_samples, M_responses)
Training response values. It should be a numpy array.
- For regression problems, the elements of y should be floats.
- For classification problems,
the elements of y should be either 0 or 1.
In multinomial regression, y may be given either as dummy
(one-hot) columns or as a single label column, which is
expanded into dummy variables internally.
- For survival data, y should be an :math:`n \times 2` array,
whose columns record the survival "time" and the
"censoring" indicator, respectively.
is_normal : bool, optional, default=True
Whether to normalize the variables
before fitting the algorithm.
sample_weight : array-like, shape (n_samples,), optional
Individual weights for each sample. Samples with zero weight
are ignored. Default=np.ones(n).
cv_fold_id : array-like, shape (n_samples,), optional, default=None
An array indicating which CV fold each sample belongs to.
Samples in the same fold should be given the same number.
sparse_matrix : bool, optional, default=False
Set to True to treat X as a sparse matrix during fitting.
It is automatically set to True when X is a sparse matrix
type defined in scipy.sparse.
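beta_low : float, optional, default=None
Presumed lower bound on the estimated coefficients passed to the
solver; None means unbounded (-sys.float_info.max is used).
beta_high : float, optional, default=None
Presumed upper bound on the estimated coefficients passed to the
solver; None means unbounded (sys.float_info.max is used).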
"""
# Check that X and y have correct shape
X, y = check_X_y(X,
y,
accept_sparse=True,
multi_output=True,
# y_numeric=True,
dtype='numeric')
# Input check & init:
if isinstance(X, (list, np.ndarray, np.matrix, pd.DataFrame,
coo_matrix, csr_matrix)):
if isinstance(X, (coo_matrix, csr_matrix)):
sparse_matrix = True
# Sort for Cox
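# Note: the first column of y (the survival time) is used to sort the samples
# in ascending order; the second column (the event/censoring indicator) then
# becomes the working response passed to the solver.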
if self.model_type == "Cox":
X = X[y[:, 0].argsort()]
y = y[y[:, 0].argsort()]
time = y[:, 0].reshape(-1)
y = y[:, 1].reshape(-1)
# Dummy y & classes
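# categorical_to_dummy is expected to return a one-hot (dummy) matrix together
# with the array of distinct class labels. For binary logistic regression only
# the column of the second class is kept, so y becomes a single 0/1 vector.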
if self.model_type == "Logistic":
y, self.classes_ = categorical_to_dummy(y.squeeze())
if self.classes_.size > 2:
raise ValueError("Up to 2 classes can be given in y.")
if self.classes_.size == 1:
y = np.zeros(X.shape[0])
else:
y = y[:, 1]
elif (self.model_type in ("Multinomial", "Ordinal")
and (len(y.shape) == 1 or y.shape[1] == 1)):
y, self.classes_ = categorical_to_dummy(y.squeeze())
if self.classes_.size == 1:
# add a useless label
y = np.hstack((np.zeros((X.shape[0], 1)), y))
self.classes_ = np.insert(self.classes_, 0, 0)
# multi_output warning
if self.model_type in (
'Lm', 'Logistic', 'Poisson', 'Gamma'):
if len(y.shape) > 1:
warnings.warn(
"A column-vector y was passed "
"when a 1d array was expected",
DataConversionWarning)
y = y.reshape(-1)
# Init
n = X.shape[0]
p = X.shape[1]
self.n_features_in_ = p
if y.ndim == 1:
M = 1
y = y.reshape(len(y), 1)
else:
M = y.shape[1]
else:
raise ValueError("X should be a matrix or sparse matrix.")
# Algorithm_type: abess
if self.algorithm_type == "abess":
algorithm_type_int = 6
else:
raise ValueError("algorithm_type should not be " +
str(self.algorithm_type))
# Model_type: lm, logit, poiss, cox, multi-gaussian, multi-nomial
if self.model_type == "Lm":
model_type_int = 1
elif self.model_type == "Logistic":
model_type_int = 2
elif self.model_type == "Poisson":
model_type_int = 3
elif self.model_type == "Cox":
model_type_int = 4
elif self.model_type == "Multigaussian":
model_type_int = 5
elif self.model_type == "Multinomial":
model_type_int = 6
elif self.model_type == 'Gamma':
model_type_int = 8
elif self.model_type == 'Ordinal':
model_type_int = 9
else:
raise ValueError("model_type should not be " +
str(self.model_type))
# Path_type: seq, gs
if self.path_type == "seq":
path_type_int = 1
elif self.path_type == "gs":
path_type_int = 2
else:
raise ValueError("path_type should be \'seq\' or \'gs\'")
# cv
if (not isinstance(self.cv, int) or self.cv <= 0):
raise ValueError("cv should be a positive integer.")
if self.cv > n:
raise ValueError("cv should not be greater than n.")
# Ic_type: aic, bic, gic, ebic
# cv_score: test_loss, roc_auc
if self.cv == 1:
if self.ic_type == "loss":
eval_type_int = 0
elif self.ic_type == "aic":
eval_type_int = 1
elif self.ic_type == "bic":
eval_type_int = 2
elif self.ic_type == "gic":
eval_type_int = 3
elif self.ic_type == "ebic":
eval_type_int = 4
elif self.ic_type == "hic":
eval_type_int = 5
else:
raise ValueError(
"ic_type should be \"loss\", \"aic\", \"bic\","
" \"ebic\", \"gic\" or \"hic\".")
else:
if self.cv_score == "test_loss":
eval_type_int = 0
elif self.cv_score == "roc_auc" and self.model_type == "Logistic":
eval_type_int = 1
elif (self.cv_score == "roc_auc_ovo" and
self.model_type == "Multinomial"):
eval_type_int = 2
elif (self.cv_score == "roc_auc_ovr" and
self.model_type == "Multinomial"):
eval_type_int = 3
else:
raise ValueError(
"cv_score should be \"test_loss\", "
"\"roc_auc\"(for logistic), "
"\"roc_auc_ovo\"(for multinomial), or "
"\"roc_auc_ovr\"(for multinomial).")
# cv_fold_id
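# Illustration: with cv=3 and 6 samples, cv_fold_id=[0, 0, 1, 1, 2, 2] assigns
# two samples to each of the three folds.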
if cv_fold_id is None:
cv_fold_id = np.array([], dtype="int32")
else:
cv_fold_id = np.array(cv_fold_id, dtype="int32")
if cv_fold_id.ndim > 1:
raise ValueError(
"cv_fold_id should be a 1D array of integers.")
if cv_fold_id.size != n:
raise ValueError(
"The length of cv_fold_id should be equal to X.shape[0].")
if len(set(cv_fold_id)) != self.cv:
raise ValueError(
"The number of different masks should be equal to `cv`.")
# A_init
if self.A_init is None:
A_init_list = np.array([], dtype="int32")
else:
A_init_list = np.array(self.A_init, dtype="int32")
if A_init_list.ndim > 1:
raise ValueError("The initial active set should be "
"a 1D array of integers.")
if (A_init_list.min() < 0 or A_init_list.max() >= p):
raise ValueError("A_init contains out-of-range index.")
# Group:
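# g_index records where each group starts once the group labels are taken in
# ascending order, e.g. group=[0, 0, 1, 1, 1, 2] gives g_index=[0, 2, 5];
# with the default group=None, every variable forms its own group.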
if self.group is None:
g_index = list(range(p))
else:
g = np.array(self.group)
if g.ndim > 1:
raise ValueError("group should be a 1D array of integers.")
if g.size != p:
raise ValueError(
"The length of group should be equal to X.shape[1].")
group_set = sorted(set(g))
g.sort()
g_index = []
j = 0
for i in group_set:
while g[j] != i:
j += 1
g_index.append(j)
# sample_weight:
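# Samples with zero weight contribute nothing to the weighted loss, so they
# are dropped below and the sample size n is updated accordingly.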
if sample_weight is None:
sample_weight = np.ones(n)
else:
sample_weight = np.array(sample_weight, dtype="float")
if sample_weight.ndim > 1:
raise ValueError("sample_weight should be a 1-D array.")
if sample_weight.size != n:
raise ValueError(
"X.shape[0] should be equal to sample_weight.size")
useful_index = list()
for i, w in enumerate(sample_weight):
if w > 0:
useful_index.append(i)
if len(useful_index) < n:
X = X[useful_index, :]
y = y[useful_index, :] if len(y.shape) > 1 else y[useful_index]
sample_weight = sample_weight[useful_index]
n = len(useful_index)
# Path parameters
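# For the sequential path, when support_size is not supplied the default grid
# of support sizes is 0, 1, ..., min(p, n / (log(log(n)) * log(p))) - 1,
# as computed below.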
if path_type_int == 1: # seq
if self.support_size is None:
if (n == 1 or p == 1):
support_sizes = [0, 1]
else:
support_sizes = list(
range(0, max(min(
p,
int(n / (np.log(np.log(n)) * np.log(p)))
), 1)))
else:
if isinstance(self.support_size,
(numbers.Real, numbers.Integral)):
support_sizes = np.empty(1, dtype=int)
support_sizes[0] = self.support_size
elif (np.any(np.array(self.support_size) > p)
or np.any(np.array(self.support_size) < 0)):
raise ValueError(
"All support_size should be between 0 and X.shape[1]")
else:
support_sizes = self.support_size
if self.alpha is None:
alphas = [0]
else:
if isinstance(self.alpha, (numbers.Real, numbers.Integral)):
alphas = np.empty(1, dtype=float)
alphas[0] = self.alpha
else:
alphas = self.alpha
# unused
new_s_min = 0
new_s_max = 0
new_lambda_min = 0
new_lambda_max = 0
elif path_type_int == 2: # gs
new_s_min = 0 \
if self.s_min is None else self.s_min
new_s_max = min(p, int(n / (np.log(np.log(n)) * np.log(p)))) \
if self.s_max is None else self.s_max
new_lambda_min = 0 # \
# if self.lambda_min is None else self.lambda_min
new_lambda_max = 0 # \
# if self.lambda_max is None else self.lambda_max
if new_s_max < new_s_min:
raise ValueError("s_max should be larger than s_min")
# if new_lambda_max < new_lambda_min:
# raise ValueError(
# "lambda_max should be larger than lambda_min.")
# unused
support_sizes = [0]
alphas = [0]
support_sizes = np.array(support_sizes, dtype='int32')
# Exchange_num
if (not isinstance(self.exchange_num, int) or self.exchange_num <= 0):
raise ValueError("exchange_num should be a positive integer.")
# elif (self.exchange_num > min(support_sizes)):
# print("[Warning] exchange_num may be larger than sparsity, "
# "and it would be set up to sparsity.")
# screening
if self.screening_size != -1:
if self.screening_size == 0:
self.screening_size = min(
p, int(n / (np.log(np.log(n)) * np.log(p))))
elif self.screening_size > p:
raise ValueError(
"screening_size should not be larger than X.shape[1].")
elif self.screening_size < max(support_sizes):
raise ValueError(
"screening_size should be no less than max(support_size).")
# Primary fit parameters
if (not isinstance(self.primary_model_fit_max_iter, int)
or self.primary_model_fit_max_iter <= 0):
raise ValueError(
"primary_model_fit_max_iter should be a positive integer.")
if self.primary_model_fit_epsilon < 0:
raise ValueError(
"primary_model_fit_epsilon should be non-negative.")
# Thread
if (not isinstance(self.thread, int) or self.thread < 0):
raise ValueError("thread should be a positive integer or 0"
" (maximum supported by your device).")
# Splicing type
if self.splicing_type not in (0, 1):
raise ValueError("splicing type should be 0 or 1.")
# Important_search
if (not isinstance(self.important_search, int)
or self.important_search < 0):
raise ValueError(
"important_search should be a non-negative integer.")
# Sparse X
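# X is converted to triplet form for the solver: each row of the converted
# array holds (value, row index, column index) for one nonzero entry of X.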
if sparse_matrix:
if not isinstance(X, (coo_matrix)):
# print("sparse matrix 1")
nonzero = 0
tmp = np.zeros([X.shape[0] * X.shape[1], 3])
for j in range(X.shape[1]):
for i in range(X.shape[0]):
if X[i, j] != 0.:
tmp[nonzero, :] = np.array([X[i, j], i, j])
nonzero += 1
X = tmp[:nonzero, :]
else:
# print("sparse matrix 2")
tmp = np.zeros([len(X.data), 3])
tmp[:, 1] = X.row
tmp[:, 2] = X.col
tmp[:, 0] = X.data
ind = np.lexsort((tmp[:, 2], tmp[:, 1]))
X = tmp[ind, :]
# normalize
normalize = 0
if is_normal:
normalize = self.normalize_type
# always_select
if self.always_select is None:
always_select_list = np.zeros(0, dtype="int32")
else:
always_select_list = np.array(self.always_select, dtype="int32")
# beta range
if beta_low is None:
beta_low = -sys.float_info.max
if beta_high is None:
beta_high = sys.float_info.max
if beta_low > beta_high:
raise ValueError(
"Please make sure beta_low <= beta_high.")
# unused
n_lambda = 100
early_stop = False
self.n_iter_ = self.max_iter
# wrap with cpp
# print("wrap enter.")#///
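# pywrap_GLM is expected to return, in order: the coefficient matrix (p x M),
# the intercept(s), the training loss, the CV test loss and the information
# criterion value; these are unpacked into the fitted attributes below.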
if n == 1:
# with only one sample, nothing to be estimated
result = [np.zeros((p, M)), np.zeros(M), 0, 0, 0]
else:
result = pywrap_GLM(
X, y, sample_weight, n, p, normalize, algorithm_type_int,
model_type_int,
self.max_iter, self.exchange_num, path_type_int,
self.is_warm_start, eval_type_int, self.ic_coef, self.cv,
g_index,
support_sizes, alphas, cv_fold_id, new_s_min, new_s_max,
new_lambda_min, new_lambda_max, n_lambda, self.screening_size,
always_select_list, self.primary_model_fit_max_iter,
self.primary_model_fit_epsilon, early_stop,
self.approximate_Newton, self.thread, self.covariance_update,
sparse_matrix, self.splicing_type, self.important_search,
A_init_list, self.fit_intercept, beta_low, beta_high)
self.coef_ = result[0].squeeze()
self.intercept_ = result[1].squeeze()
self.train_loss_ = result[2]
# self.test_loss_ = result[3]
# self.ic_ = result[4]
self.eval_loss_ = result[3] if (self.cv > 1) else result[4]
if self.model_type == "Cox":
self.baseline_model.fit(np.dot(X, self.coef_), y, time)
if self.model_type == "Ordinal" and self.coef_.ndim > 1:
self.coef_ = self.coef_[:, 0]
return self