#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 29 11:14:05 2018
@author: jeremiasknoblauch
Description: Implements the variational Bayes (VB) approximation for Density
Power Divergence (DPD) based parameter inference in the NIG model inside BOCPDMS
"""
import numpy as np
from scipy import special
from scipy import linalg
from scipy import stats
import scipy
from probability_model import ProbabilityModel
from nearestPD import NPD
from BVAR_NIG import BVARNIG
class BVARNIGDPD(BVARNIG):
"""Implements the constructor using the superclass BVARNIG"""
def __init__(self,
prior_a,
prior_b,
S1,
S2,
alpha_param = 0.5,
prior_mean_beta=None,
prior_var_beta=None,
prior_mean_scale=0,
prior_var_scale=100,
nbh_sequence=None,
restriction_sequence = None,
intercept_grouping = None,
general_nbh_sequence = None,
general_nbh_restriction_sequence = None,
exo_selection = None,
padding = 'overall_mean',
auto_prior_update=False,
hyperparameter_optimization = "online",
general_nbh_coupling = "strong coupling",
non_spd_alerts =False,
VB_window_size = 25,
full_opt_thinning = None,
full_opt_thinning_schedule = None,
SGD_batch_size = 3,
anchor_batch_size_SCSG = 30,
anchor_batch_size_SVRG = None,
first_full_opt = 3
):
"""Take inputs, relabel them"""
prior_a = prior_a
prior_b = prior_b
S1, S2 = S1,S2
prior_mean_beta=prior_mean_beta
prior_var_beta=prior_var_beta
prior_mean_scale=prior_mean_scale
prior_var_scale=prior_var_scale
nbh_sequence=nbh_sequence
restriction_sequence = restriction_sequence
intercept_grouping = intercept_grouping
general_nbh_sequence = general_nbh_sequence
general_nbh_restriction_sequence = general_nbh_restriction_sequence
exo_selection = exo_selection
padding = padding
auto_prior_update=auto_prior_update
hyperparameter_optimization = hyperparameter_optimization
general_nbh_coupling = general_nbh_coupling
non_spd_alerts =non_spd_alerts
"""instantiate object"""
super(BVARNIGDPD, self).__init__(
prior_a = prior_a ,
prior_b = prior_b ,
S1 = S1, S2 = S2,
prior_mean_beta=prior_mean_beta,
prior_var_beta=prior_var_beta,
prior_mean_scale=prior_mean_scale,
prior_var_scale=prior_var_scale,
nbh_sequence=nbh_sequence,
restriction_sequence = restriction_sequence,
intercept_grouping = intercept_grouping,
general_nbh_sequence = general_nbh_sequence,
general_nbh_restriction_sequence = general_nbh_restriction_sequence,
exo_selection = exo_selection,
padding = padding,
auto_prior_update=auto_prior_update,
hyperparameter_optimization = hyperparameter_optimization,
general_nbh_coupling = general_nbh_coupling,
non_spd_alerts =non_spd_alerts)
"""add features to object that are unique to DPD version"""
self.VB_window_size = VB_window_size
if VB_window_size is None: #in this case, we have W = max(RL)
self.VB_window_size = 101 #extend/prune in units of 100
self.dynamic_VB_window_size = True
self.max_rl = 1
else:
self.dynamic_VB_window_size = False
self.max_rl = 1
self.alpha_param_learning = False #initialized via Detector init
self.alpha_param = alpha_param
self.full_opt_thinning = full_opt_thinning
"""If we want to have a different opt. schedule"""
#if full_opt_thinning_schedule is not None:
self.full_opt_thinning_schedule = full_opt_thinning_schedule
self.SGD_approx_goodness = SGD_batch_size
self.first_full_opt = first_full_opt
self.failed_opt_count = 0
self.opt_count = 0
#This ensures that the trimmer only trims the _p_eps/_m_eps quantities
#if they are ever created.
self.a_rt_p_eps = None
#Similarly to a_rt_p_eps being initialized as None, we use this for
#the trimmer only
self.predictive_variance_log_det = None
self.alpha_param_list = []
self.N_j = None
if anchor_batch_size_SCSG is None:
if self.dynamic_VB_window_size == False:
self.anchor_batch_size_SCSG = self.VB_window_size
else:
self.anchor_batch_size_SCSG = 100
else:
self.anchor_batch_size_SCSG = anchor_batch_size_SCSG
if anchor_batch_size_SVRG is None:
if self.dynamic_VB_window_size == False:
self.anchor_batch_size_SVRG = self.VB_window_size
else:
self.anchor_batch_size_SVRG = 100
else:
self.anchor_batch_size_SVRG = anchor_batch_size_SVRG
#bounds for optimization
#initialization needs to initialize different objects to be tracked,
#e.g. L (the Cholesky decomposition of Sigma), and a storage for the X'X
#instead of a storage for the M and so on. Note that big parts of
#the BVARNIG object can be copied 1:1
def initialization(self, X_endo, X_exo, Y_2, X_exo_2, cp_model, model_prior,
padding_columns_computeXX = None, padding_column_get_x_new = None):
"""Initialize the model (i.e. t=lag length) with some inputs from the
containing Detector object. The padding_column arguments are only
needed for the demo Csurf object. This method differs from object
instantiation/creation, as it processes the very first (collection of)
observation(s), thus creating the objects and quantities we will trace
through time.
NOTE I: The exo_selection list is applied inside detector, so X_exo
will already contain everything relevant
NOTE II: The tag #QR ADAPTION means that the lines following the tag
could/can be adapted to QR-updating (rather than Woodbury)
X_endo: S1xS2x(L+1) numpy array, float:
is the S1xS2x(L+1) array of the last L observations before t
as well as the observation at t at position L.
Y_2: S1xS2 np array, float:
will be endogenous regressors at time t+1, which means Y_t.
X_exo: S1xS2xnum_exo np array, float:
will contain exogenous variables at time t (NOT IMPLEMENTED)
X_exo_2: S1xS2xnum_exo np array, float:
will contain exogenous variables at time t+1 (NOT IMPLEMENTED)
cp_model: CpModel object:
gives the hazard function inside an object
model_prior: float:
Passes the prior of the Detector object into the model object
padding_columns_computeXX, padding_column_get_x_new:
deprecated, leave None.
"""
print("Initializing BVAR DPD object")
"""STEP 1: Take the data-stream that was partitioned appropriately
inside the Detector object and reshape/rename it for further processing
Y1 = Y_t, Y2 = Y_{t+1}, X1_endo = Y_1:t-1, with t = L-1."""
Y1 = X_endo[-1,:].flatten()
Y2 = Y_2.flatten()
X1_endo = X_endo[:self.lag_length,:].reshape(self.lag_length,
self.S1, self.S2)
"""In case there are no exogeneous variables in this model, take
the relevant precautions."""
if self.exo_bool:
#RESHAPE will not corr. to real dims of exo vars
X1_exo = (X_exo[-1,:,:].reshape(
self.num_exo_regressors, self.S1, self.S2))
else:
X1_exo = None
"""STEP 2: Format the quantities we wish to trace through time (i.e.
typically sufficient statistics), and correctly compute them using
neighbourhood structure"""
"""STEP 2.1: Quantities for time point t, i.e. dimension does not
depend on how many run-lengths we retain.
Quantities will hold:
XX
Y_1:t-1'Y_1:t-1, i.e. the cross-product of regressors at time t.
XY
Y_1:t-1'Y_t, i.e. the cross-product of regressors and obs. at t
X_t
Y_1:t-1, i.e. regressors at time t
X_tp1
Y_2:t, i.e. regressors at time t+1 ('t plus (p) 1')
YY
Y_t'Y_t, i.e. observation cross-product
"""
self.XX = np.zeros(shape=(self.num_regressors,self.num_regressors))
self.XY = np.zeros(self.num_regressors)
self.X_t = np.zeros(shape=(self.S1*self.S2, self.num_regressors))
self.X_tp1 = np.zeros(shape=(self.S1*self.S2, self.num_regressors))
self.YY = np.inner(Y1, Y1)
"""STEP 2.2: Create structures holding the last VB_window_size
cross-prods XX and YY. Also create structures holding the variational
parameter estimates.
Quantities will hold:
XX_t
At time t, r-th entry holds the cross-product of X_{t-r}'X_{t-r}
for any r < VB_window_size, so dimension is
VB_window_size x num_regressors x num_regressors
NOTE: VERY DIFFERENT from the XX_rt in super class BVAR_NIG, which
contains the SUMS over cross-products from t-r to t!
YY_t
As XX_t, but with Y'Y, so its dimension is VB_window_size x 1
NOTE: VERY DIFFERENT from YY_rt in BVAR_NIG, same reason as above
"""
self.XX_t = np.zeros(shape=(self.VB_window_size,
self.num_regressors, self.num_regressors))
self.YY_t = np.zeros(self.VB_window_size)
self.XY_t = np.zeros((self.VB_window_size, self.num_regressors))
# self.LBFGSB_anchor = np.zeros((self.VB_window_size,
# int(self.num_regressors * (self.num_regressors + 1) * 0.5) +
# self.num_regressors + 2))
self.XX_t[0,:,:] = self.XX
self.XY_t[0,:] = self.XY
self.YY_t[0] = self.YY
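#Illustration of the rolling-window layout (small example, assuming
#VB_window_size = 3): after processing the observation at time t,
#XX_t[0] = X_t'X_t, XX_t[1] = X_{t-1}'X_{t-1}, XX_t[2] = X_{t-2}'X_{t-2},
#and analogously for XY_t and YY_t. No summation over the run-length is
#performed here, in contrast to XX_rt in the superclass BVARNIG.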
"""STEP 2.3: Inverse and Determinant related quantities
#DEBUG: We will want to implement efficient updating for the inv/det
# of (X'X* alpha + Sigma^-1) when Sigma^-1 changes!
# should be doable via Woodbury or something, but I will brute
# force it for the time being.
# The gain would be that one needs to have an S1*S2 x S1*S2
# inversion for the update, instead of pxp (which presumably
# will be larger)
"""
#DEBUG: FILL IN IF THERE IS A COMPUTATIONALLY EFFICIENT WAY
"""STEP 2.4: Variational parameters at time point t and for run-
length r, i.e. dimension depends on how many run-lengths one retains
Quantities will hold:
beta_rt
beta_rt[r,:] stores the variational beta param, being the
MAP-estimate of the approximate posterior at time t, run-length r.
Dimension is num_run_lengths x num_regressors
L_rt
L_[r,:,:] stores the Cholesky decomposition of the inverse variance
matrix, i.e. L[r,:,;]L[r,:,:]' = Sigma^{-1}. Here Sigma corresponds
to the variational posterior covariance matrix estimate at time t
for run-length r.
Dimension is num_run_lengths x num_regressors x num_regressors
a_rt
a_rt[r] stores the variational param for the Inverse Gamma (IG) a
b_rt
b_rt[r] stores the variational param for the IG b
"""
self.beta_rt = np.zeros(shape=(2,self.num_regressors))
self.L_rt = np.zeros(shape = (2, self.num_regressors,
self.num_regressors))
self.a_rt = np.zeros(2)
self.b_rt = np.zeros(2)
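#Note: the leading dimension 2 corresponds to the two run-lengths kept at
#initialization, r=0 and r>0 (see STEP 2.5 below); the r>0 entries are
#copied from the r=0 entries once the first optimization has run (STEP 5).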
"""Initialize a_1, b_1, L_1, beta_1 with the priors. This
means taking them as starting values for optimization"""
self.a_rt[0], self.b_rt[0] = self.a, self.b
self.beta_rt[0,:] = self.prior_mean_beta
self.successful_terminations = np.array([False, False])
"""STEP 2.5: Retained run lengths, storing which run-lengths you retain
at time t. Careful with this, as retained_run_lengths[i] = j means that
the i-th longest run-length you retain is j"""
self.retained_run_lengths = np.array([0,0])
"""STEP 3: Compute prior- and data-dependent quantities:
Computation of X_t, X_tp1, X'X, X'Y, and Y'Y from scratch."""
"""STEP 3.1: Gives X_t, X'X, Y'Y"""
self.compute_X_XX_XY_YY( Y1, X1_endo, X1_exo,
padding_columns_computeXX,
compute_XY = True)
"""STEP 3.2: Gives X_{t+1}"""
self.X_tp1 = self.get_x_new(Y2, X_exo_2 ,1,padding_column_get_x_new)
"""STEP 3.3: Fill the first entry of XX_t, YY_t"""
self.XX_t[0,:,:], self.YY_t[0] = self.XX, self.YY
self.XY_t[0,:] = self.XY
"""STEP 4: Using the results of STEP 3, compute some computationally
burdensome results, like XX_rt's inverses and prior inv + det"""
"""STEP 4.1: Computation of the prior inverse, which will be needed
at each iteration to inform the changepoint probabilities"""
self.L_0_inv = np.linalg.cholesky(self.prior_var_beta)
#syntax: the 1 stands for 'lower triangular', since the np cholesky
# decomposition returns lower triangular matrices
self.L_0 = scipy.linalg.lapack.clapack.dtrtri(self.L_0_inv, 1)[0]
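#Portability note (assumption, not from the original code): the
#scipy.linalg.lapack.clapack wrappers were removed in later SciPy
#releases; there, scipy.linalg.lapack.dtrtri(self.L_0_inv, lower=1)
#should be the equivalent triangular-inverse call (and similarly for the
#dtrtri calls further below).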
self.prior_var_beta_inv = np.matmul(np.transpose(self.L_0),
(self.L_0))
self.L_rt[0,:,:] = self.L_0
"""STEP 5: Get L_1, b_1, a_1, beta_1 using our optimization. Set the
parameters for run-length >0 equal to those for run-length = 0"""
#NOTE: Wrong cp_prob, but basically amounts to prior_weight = 1 for SGD
self.anchor_params = [0] # EXTENDED in the first iteration
#self.SVRG_anchor = []
self.SVRG_anchor_gradient_indices = [np.array([0])] #NOT extended in the first iteration
self.SVRG_anchor_sum = [np.zeros(int(self.num_regressors*
(self.num_regressors+1)*0.5) + self.num_regressors + 2)] #NOT extended in the first iteration
self.SVRG_anchor_gradients = [np.array([0])] #NOT extended in the first iteration
self.full_VB_optimizer(r = 0, num_obs = 1, t=self.lag_length + 1,
cp_prob = 1.0, alpha = self.alpha_param)
"""Take care of r=t-1 & r>t-1 manually (usually done in the loops
later on)"""
self.anchor_params[-1] = self.anchor_params[0].copy() #r=t-1 & r>t-1
self.SVRG_anchor_gradient_indices[0] = (
self.SVRG_anchor_gradient_indices[-1].copy())
self.SVRG_anchor_sum[-1] = self.SVRG_anchor_sum[0].copy()
self.SVRG_anchor_gradients[-1] = self.SVRG_anchor_gradients[0].copy()
#print("LBFGSB anchor", self.LBFGSB_anchor)
self.a_rt[1], self.b_rt[1] = self.a_rt[0], self.b_rt[0]
self.beta_rt[1,:] = self.beta_rt[0,:]
self.L_rt[1,:,:] = self.L_rt[0,:,:]
#DEBUG: We also need to initialize VB params for alpha +/- eps!
"""STEP 6: Use the parameters to get the value of the predictive
distribution for RL = 1"""
"""COMPUTATIONAL NOTE:
PossiblyInefficient, probably: Can we not use that we have the Cholesky
decomposition of V^-1 and (I + XVX')^-1 = I - X(V^-1 + X'X)^-1X'
to somehow update (V^-1 + X'X)^-1 using V^-1's cholesky decomp
plus some updating?
See here: https://stackoverflow.com/questions/8636518/
dense-cholesky-update-in-python?rq=1
and this c-based module: https://github.com/modusdatascience/choldate
NOTE: Will be O(S * p^2), so probably better to use direct inversion,
since we can either invert directly O(S^3) or via Woodbury (O(p^3))"""
L_inv = scipy.linalg.lapack.clapack.dtrtri(self.L_rt[0,:,:], 1)[0]
XL = np.matmul(self.X_t, L_inv)
"""Note: Q_0 and R_0 are the decomposition of the posterior predictive
variance (without the scaling factor a/b)"""
Q_0, R_0 = np.linalg.qr(
np.identity(self.S1*self.S2)
+ np.matmul(
XL, np.transpose(XL)
)
)
#syntax: the 0 stands for 'upper triangular', since the QR
# decomposition returns upper triangular matrices
R_0_inv = scipy.linalg.lapack.clapack.dtrtri(R_0, 0)[0]
C_0_inv = (self.a_rt[0]/self.b_rt[0]) * (
#inverse of QR = R^-1 * Q^T, as Q orthogonal
np.matmul(R_0_inv, np.transpose(Q_0))
)
"""LINEAR ALGEBRA NOTE:
Q is unitary, so |det(Q)| = 1. Also, we have a positive definite
matrix, so det(QR) > 0. Note also that we want the log determinant of
covariance matrix, not of its inverse!"""
C_0_log_det = ((self.S1*self.S2) *
(np.log(self.b_rt[0]) - np.log(self.a_rt[0]))
+ abs(np.sum(np.log(np.diag(np.abs(R_0))))))
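#Worked identity behind the computation above (sketch): the predictive
#covariance is (b/a) * (I + X V X'), with V the variational posterior
#covariance of beta, so
# log det C_0 = S1*S2 * log(b/a) + log det(I + X V X'),
#and since I + X V X' = Q_0 R_0 with |det(Q_0)| = 1 and R_0 triangular,
# log det(I + X V X') = sum_i log |R_0[i,i]|.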
"""STEP 6.1: This step ensures that we center the MVT at zero, which
makes the computations inside mvt_log_density easier"""
resid = Y1 - np.matmul(self.X_t, self.beta_rt[0,:])
"""STEP 6.2: For the first observation, the predictive probability and
model evidence are equivalent, as the model evidence is computed under
posterior beliefs (captured by VB parameters) only."""
self.model_log_evidence = ( np.log(model_prior) +
BVARNIG.mvt_log_density(resid, C_0_inv, C_0_log_det,
2*self.a_rt[0],
self.non_spd_alerts))
"""STEP 9.2: Multiply the model evidence by the hazard rate/cp prior
as well as the model prior to get the joint log probs for run-length
equalling 0 or being >0 (i.e., first CP occured before first obs)"""
"""Numerical stability: Ensure that we do not get np.log(0)=np.inf
by perturbation"""
if cp_model.pmf_0(1) == 0:
epsilon = 0.000000000001
else:
epsilon = 0
"""get log-probs for r_1=0 or r_1>0. Typically, we assume that the
first observation corresponds to a CP (i.e. P(r_1 = 0) = 1),
but this need not be the case in general."""
r_equal_0 = (self.model_log_evidence +
np.log(cp_model.pmf_0(0) + epsilon))
r_larger_0 = (self.model_log_evidence +
np.log(cp_model.pmf_0(1)+ epsilon))
self.joint_log_probabilities = np.array([r_equal_0, r_larger_0])
"""STEP 8.3: Get the derivative of the log probs, too, just
initialize to 1 (since log(1) = 0), initialize with 2 columns
(for 2 hyperparams: a,b). We may wish to extend this to more params"""
self.model_specific_joint_log_probabilities_derivative = np.ones((2,2))
self.model_specific_joint_log_probabilities_derivative_sign = np.ones(
(2,2))
"""STEP 8.4: Similar to 8.3, but for alpha-optimization. Hence we only
do this if we have to"""
if self.alpha_rld_learning:
self.log_alpha_derivatives_joint_probabilities = None #np.ones(3)
self.log_alpha_derivatives_joint_probabilities_sign = None #np.ones(3)
def full_VB_optimizer(self, r, num_obs, t, cp_prob, alpha = None):
"""run_length gives you the run-length of the parameters we want to
optimize. I.e., if run_length = r, then we optimize using the last r+1
entries of self.XX_t.
Note: The num_obs variable gives the VALUE of the run- length, whiile
r gives its INDEX in all our parameters. (E.g., we could have discarded
run-length 2, in which case r=2 will refer to run-length 3, thus the
corresponding num_obs = 3.
NOTE: This function will overwrite a_rt, b_rt, beta_rt, L_rt, for
finite-difference based hyperparameter optimization for alpha_param,
you will need to save the old values for a, b, beta, L first!
"""
# """STEP 1: Get the current values of the variational params"""
# self.old_a, self.old_b, self.old_beta, self.old_L = (
# self.a_rt[r], self.b_rt[r], self.beta_rt[r,:], self.L_rt[r,:,:])
"""STEP 2: For each variable to be optimized, take a step into the
current direction along the gradient until we are either
close enough to the optimum or have taken max_iter steps. Inside
each of the optimizer functions, we compute:
(1) new step sizes (unless count = 0, in which case we take some
default step size that is not dependent on old gradients and
old values)
(2) new gradient values, stored in self.old_gradient_a,
self.old_gradient_b, self.old_gradient_beta,
self.old_gradient_L
Note: max_count = maximum iterations if LOCAL optimizer used
count = current iteration count if LOCAL optimizer used
increment = current increment if LOCAL optimizer used
eps = min. amount of change we need to make another
iterations if LOCAL optimizer used
local_opt = whether to use LOCAL optimizer (NOT RECOMMENDED)
LBFGSB = whether to use L-BFGS-B as implemented in python
(RECOMMENDED)
"""
if alpha is None:
alpha = self.alpha_param
"""STEP 2.2: Set up quantities that will be needed in the calls to
gradient functions"""
p = self.num_regressors
eps = pow(10,-5)
self.bounds = ([( 1+ eps, self.a + int(num_obs*0.75))] +
[(1+ eps, None)] + [(None, None)]*(p + int(p*(p+1)*0.5)))
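#Layout of the optimization vector (and hence of self.bounds): entry 0 is
#the IG parameter a (bounded below by 1+eps and above via num_obs), entry
#1 the IG parameter b (bounded below by 1+eps), entries 2:(p+2) the
#regression coefficients beta, and the remaining p*(p+1)/2 entries the
#lower-triangular part of L (Cholesky factor of the inverse variational
#covariance), all unconstrained.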
# print("r", r)
# print("SVRG gradients", self.SVRG_anchor_gradients)
"""STEP 2.2: Use the setup to run the optimization as required"""
need_full_opt = False
if self.full_opt_thinning_schedule is not None:
need_full_opt = (
(self.retained_run_lengths[r] == 0)
or
((self.retained_run_lengths[r] in
self.full_opt_thinning_schedule) and
(self.retained_run_lengths[r] < self.VB_window_size)))
elif self.full_opt_thinning is not None:
need_full_opt = (
(self.retained_run_lengths[r] % self.full_opt_thinning == 0)
and (self.retained_run_lengths[r] < self.VB_window_size))
if self.retained_run_lengths[r] == self.first_full_opt:
need_full_opt = True
#if ((self.retained_run_lengths[r] % self.full_opt_thinning == 0 or
# self.retained_run_lengths[r] == self.first_full_opt) and
# self.retained_run_lengths[r] < self.VB_window_size):
if need_full_opt:
# or self.retained_run_lengths[r]==0):
#Note: If r=0, then we want global opt.
"""Uses python's internal optimization (from scipy library)"""
self.precomputations_VB_optimizer(r, num_obs, alpha, True)
"""max_iter = max number of iterations, max_ls_iter = max number
of iterations in line search part, maxcor = maximum number of terms
used for storing the approximation to the Hessian"""
#print("LBFGSB, r = ", r)
max_iter, max_ls_iter, maxcor = 15000, 20, 10 #default values
self.LBFGSB_optimizer(r=r,num_obs=num_obs,alpha = alpha, t=t,
#params = old_params, *args,
max_iter=max_iter, max_ls_iter=max_ls_iter, maxcor=maxcor)
#elif ((self.retained_run_lengths[r]) % self.full_opt_thinning != 0
# and self.retained_run_lengths[r] <= self.VB_window_size):
elif (self.retained_run_lengths[r] <= self.VB_window_size):
self.SGD_VB_optimizer(r, num_obs, t, cp_prob, alpha)
#if (r) % self.full_opt_thinning == 0 or r ==1:
# print("SVRG, r = ", r)
#DEBUG: This still does not do the right thing with the indices!
# self.SVRG_VB_optimizer(r,num_obs,t,cp_prob,alpha,
# anchor_approx_goodness=min(
# self.VB_window_size-1,
# self.anchor_batch_size_SVRG),
# SCSG = False)
else: #((self.retained_run_lengths[r]) % self.full_opt_thinning != 0
# and self.retained_run_lengths[r]+1 > self.VB_window_size):
#self.SGD_VB_optimizer(r, num_obs, t, cp_prob, alpha)
#DEBUG: We need r+1 because the entire SVRG function is coded up
# to basically take r-1
"""NOTE: Unclear if this should be called before or after opt"""
# print("r", r)
# self.SVRG_anchor_gradients = [
# e[np.where(e < self.VB_window_size)] for e in
# self.SVRG_anchor_gradients ]
# self.SVRG_anchor_gradient_indices = [
# e[np.where(e < self.VB_window_size)] for e in
# self.SVRG_anchor_gradient_indices ]
#DEBUG: We need to also prune the gradients themselves!
# print("VB windo size", self.VB_window_size)
#print("self.SVRG_anchor_gradient_indices", self.SVRG_anchor_gradient_indices)
#print("max", np.max(np.array([np.max(e) for e in self.SVRG_anchor_gradient_indices])))
self.SVRG_VB_optimizer(r,num_obs,t,cp_prob,alpha,
anchor_approx_goodness=min(
self.VB_window_size-1,
self.anchor_batch_size_SCSG),
SCSG = True)
#now we need to make sure that the gradient indices < window size
if self.retained_run_lengths[r] == 0 and self.N_j is not None:
# print("N_j", self.N_j)
# print("r", r)
# print("retained rls", self.retained_run_lengths)
rl_num = len(self.retained_run_lengths)
# if self.retained_run_lengths[-1] == self.retained_run_lengths[-2]:
# rl_num = rl_num-1
if len(self.N_j) < rl_num:
self.N_j = np.insert(self.N_j,0, 0)
#self.SCSG_VB_optimizer(r,num_obs,t,cp_prob,alpha)
def SVRG_VB_optimizer(self, r, num_obs, t, cp_prob,alpha,
#params, *args,
anchor_approx_goodness = None,
SCSG = False):
"""Implements the variance-reduction SGD modification proposed by
Johnson & Zhang (NIPS, 2013) for the first stage of optimization where
we use the full VB optimizer as an anchoring point/variance reduction
point
Note: anchor_approx_goodness gives the number of terms you use for
your 'anchor', i.e. for the gradient approximation that keeps
SGD from exploring too much of our space, thus reducing its
variance
"""
if SCSG == False:
r_ = r -1
elif SCSG == True:
r_ = r #Note: SCSG is only entered for r > window_size!
run_length = self.retained_run_lengths[r_]
#+1
p = self.num_regressors
#REMARK: If SCSG = True, we create N_j!
if SCSG == True and self.N_j is None:
run_length_num = len(self.retained_run_lengths)
if self.retained_run_lengths[-1] == self.retained_run_lengths[-2]:
self.N_j = np.zeros(run_length_num)
else:
self.N_j = np.zeros(run_length_num+1)
# if SCSG:
# print("N_j", self.N_j)
# print("r_", r_)
# print("retained rls", self.retained_run_lengths)
# elif (SCSG == True and
# r_ == self.VB_window_size-1):
# self.N_j = np.insert(self.N_j, 0, 0)
#print(self.N_j)
# elif SCSG == True and self.N_j is not None and r_ == 0:
# #add a new entry
# self.N_j = np.insert(self.N_j, 0, 0)
#REMARK: Extend the objects for r=0!
"""STEP 1: If this is the first step after a full optimization, you
need to do some extra work"""
#REMARK: Unchanged under SCSG!
#REMARK: Unclear if we need to change this under SCSG (depends on
# how/when we set our new 'maximum'! If we again set it at
# run_length % thinning == 0 (and do nothing else there), this
# block can remain unchanged.)
#REMARK: If we work with N_j[r], then this will simply be entered if
# we have that N_j[r] == 0!
last_full_opt = False
if self.full_opt_thinning_schedule is not None:
last_full_opt = ((run_length - 1) in
self.full_opt_thinning_schedule)
elif self.full_opt_thinning is not None:
last_full_opt = (
(run_length - 1) % self.full_opt_thinning == 0)
#if self.retained_run_lengths[r] == self.first_full_opt:
# last_full_opt = True
if ( #((run_length - 1) % self.full_opt_thinning == 0)
last_full_opt or
(SCSG == True and self.N_j[r_] == 0)):
"""STEP 1.1: Draw a sample from the retained observations,
typically using the entire window size.
Note: We fill XX_t, XY_t, ... from the front
"""
#REMARK: For SCSG, we would need the anchor_approx_goodness here
# s.t. we can just sample our batch size for the variance
# reduction.
# if SCSG:
# print("reanchoring entered")
if anchor_approx_goodness is None:
rl_ = min(run_length, self.VB_window_size-1)
indices = np.linspace(0,rl_, rl_+1, dtype = int)
else:
num_ind = min(min(anchor_approx_goodness, run_length),
self.VB_window_size-1)
rl_ = min(run_length, self.VB_window_size-1)
indices = np.random.choice(rl_, size = num_ind,
replace = False)
#DEBUG: If we want to do original SVRG, we have to
#use the entire sample, but there are modifications men-
#tioned by M. Jordan using only a subsample.
"""STEP 1.2: Store the drawn indices to make sure that you can
still access them for STEP 2 at later SVRG iterations.
Note: We will have to shift these indices backwards for each new
observation, so as to retain the correct data point for an index"""
#DEBUG: I need this to be a list with at most window_size entries
# if run_length == 1:
# self.SVRG_anchor_gradient_indices.insert(0,indices)
# else:
#print("indices shape", indices.shape)
# print("self.SVRG_anchor_gradient_indices[r_] init BEFORE", self.SVRG_anchor_gradient_indices[r_])
# print("self.SVRG_anchor_gradient_indices init BEFORE", self.SVRG_anchor_gradient_indices)
self.SVRG_anchor_gradient_indices[r_] = indices
# print("self.SVRG_anchor_gradient_indices[r_] init AFTER", self.SVRG_anchor_gradient_indices[r_])
# print("self.SVRG_anchor_gradient_indices init AFTER", self.SVRG_anchor_gradient_indices)
"""STEP 1.3: Store the gradients for that sample & compute
the average of the stored gradients (if you store the entire window
this doesn't need to be computed, you can just retrieve it from
the optimization object created in LBFGSB_optimizer)"""
"""STEP 1.3.1: Repackage params for the grad_ELBO function"""
# lower_ind = np.tril_indices(p,0)
# params = np.zeros(int(p*(p+1)*0.5) + p + 2)
# params[0] = max(1.00001, self.a_rt[r])
# params[1] = max(1.00001, self.b_rt[r] )#+
# params[2:(p+2)] = self.beta_rt[r,:]
# params[(p+2):] = self.L_rt[r,:,:][lower_ind]
#REMARK: Maybe we should change the name from LBFGSB_anchor to just
# 'anchor', and then change the anchor within SVRG later on
# in the iterations
params = self.anchor_params[r_]
"""STEP 1.3.2: Package remaining arguments in args"""
prior_weight = 1.0/np.size(self.SVRG_anchor_gradient_indices[r_])
args = (self.a, self.b, self.prior_mean_beta, self.prior_var_beta_inv,
alpha, len(self.SVRG_anchor_gradient_indices[r_]), p,
self.K, self.E3, self.Rinv, self.E2_m_ba,
self.digamma_diff_1, self.digamma_diff_2, self.gamma_ratio_2,
self.downweight1, self.downweight2,
self.XX_t, self.XY_t, self.YY_t,
self.L_0, self.S1*self.S2, prior_weight,
self.SVRG_anchor_gradient_indices[r_], False)
#false = we do not want the sum!
"""STEP 1.3.3: Perform the call to the negative ELBO gradient and
store the individual gradients as well as their sum"""
grads = BVARNIGDPD.get_grad_neg_ELBO_full(
params,*args)
a_sum = (np.sum(grads,axis=0)/np.size(
self.SVRG_anchor_gradient_indices[r_]))
self.SVRG_anchor_gradients[r_] = grads
self.SVRG_anchor_sum[r_] = a_sum
"""STEP 2: Take SGD steps with variance reduction as in Johnson & Zhang
(NIPS, 2013) with the minor modification that we always use the
most recent observation.
NOTE: We may want to take multiple SGD steps per time period, in which
case we have to specify another input at BVARNIGDPD level.
"""
#REMARK: Under SCSG, this has to be b_j, so it can/should still be
# constant. Given their recommendations, probably b_j = 2 or 3
# would be best (s.t. you don't have complete ordering)
num_SGs = min(self.SGD_approx_goodness,
min(run_length, self.VB_window_size))
num_steps = min(run_length, min(self.VB_window_size, 1)) #DEBUG: Let this potentially be another input!
#DEBUG: Choose this smart, as in the papers
#REMARK: should be b_j/B_j ^ -(2/3)
#if SCSG == True:
step_size = (pow(num_SGs/anchor_approx_goodness, 2.0/3.0))#*
#np.ones(int(p*(p+1)*0.5)+2+p))
#step_size[0] = ab_scaling*(1/beta_l_downscaling)* step_size[0]
#step_size[1] = ab_scaling*(1/beta_l_downscaling)* step_size[1]
#elif SCSG == False:
# step_size = pow(num_SGs/self.VB_window_size, 2.0/3.0) #unclear what I should go for here
lower_ind = np.tril_indices(p,0)
#REMARK: If we want to have step_count > 1, then for SCSG, we have the
# additional challenge of splitting up the steps between
# different observations. Probably easiest would be to set an
# object-internal counter (for each run-length) which is de-cre
# mented for each SGD step taken s.t. you can extend it over
# multiple observations
if SCSG == True:
if self.N_j[r_] == 0:
self.N_j[r_] = (np.random.geometric( #p = 1/100,
p = anchor_approx_goodness/(anchor_approx_goodness + num_SGs),
size= 1))
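#With the success probability used above, p = B/(B+b) for anchor batch
#size B (anchor_approx_goodness) and mini-batch size b (num_SGs), the
#expected inner-loop length is E[N_j] = 1/p = (B+b)/B. The SCSG scheme
#likewise draws the inner-loop length from a geometric distribution tied
#to the batch-size ratio (this mapping is our reading of the code, not
#stated by the author).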
num_steps_this_cycle = int(min(self.N_j[r_], num_steps))
elif SCSG == False:
num_steps_this_cycle = num_steps #SVRG
#if num_steps > self.N_j:
#if num_steps_this_cycle == self.N_j
for step_count in range(0, num_steps_this_cycle):
"""STEP 2.0: Decrement N_j by one"""
if SCSG == True:
self.N_j[r_] = self.N_j[r_] - 1
"""STEP 2.1: Get the indices & gradients of SGs used next step"""
#if SCSG == False:
# max_ind = min(run_length,self.VB_window_size)
#elif SCSG == True:
max_ind = self.SVRG_anchor_gradients[r_].shape[0]#min(run_length,
#min(self.VB_window_size-1, max(anchor_approx_goodness, 1)))
indices = np.random.choice(max_ind, size = min(num_SGs, max_ind),
replace = False) #random draw of size num_SGs + most recent one
"""STEP 2.2: do the num_SG computations for parameter update"""
#DEBUG: Unclear if we should take this step
"""STEP 2.2.2: Update mu (i.e. the anchor sum) by evaluating
the new gradient on the anchor optimum, stored in LBFGSB_anchor
Note: Only needs to be done in the very first SGD batch
Note: 1 gradient eval.
"""
if step_count == 0:
indices[0] = 0 #i.e. for the first SGD batch, we always want
#the most recent obs. to be included.
"""STEP 2.2.2.1: Compute the gradients of new observation
and store them in the SVRG anchor gradients.
Note: Since we invoke precomputations, the args will be
different after 2.2.2.1, so we have to declare them
here. They will also be different from those under
2.2.3 onwards.
"""
"""Note that we will want to keep the prior weight of the
average gradient (the SVRG_anchor_sum) at 1 throughout"""
grad_size = np.size(self.SVRG_anchor_gradients[r_])
w = 1/(grad_size + 1)
prior_weight = w #as we have a full prior in mu already
args_new_obs = (self.a, self.b, self.prior_mean_beta,
self.prior_var_beta_inv,
alpha, 1, p,
self.K, self.E3, self.Rinv, self.E2_m_ba,
self.digamma_diff_1, self.digamma_diff_2,
self.gamma_ratio_2,
self.downweight1, self.downweight2,
self.XX_t, self.XY_t, self.YY_t,
self.L_0, self.S1*self.S2, prior_weight,
np.array([0]), False)
"""Note: because num_obs = 1 (i.e. specified_indices =
np.array([0])), this will be the gradient for the last obs
+ 1/(n+1) * prior gradient of last obs"""
new_obs_grad = BVARNIGDPD.get_grad_neg_ELBO_full(
self.anchor_params[r_], *args_new_obs).flatten()
"""STEP 2.2.2.2: Update mu (SVRG_anchor_sum)"""
self.SVRG_anchor_sum[r_] = (
(grad_size*w)*self.SVRG_anchor_sum[r_]
+ w * new_obs_grad
)
#print("self.SVRG_anchor_sum[r]", self.SVRG_anchor_sum[r])
"""STEP 2.2.2.3: Update the SVRG_anchor_gradients with the
new gradient term """
#print("new obs grad", new_obs_grad)
self.SVRG_anchor_gradients[r_] = np.insert(
self.SVRG_anchor_gradients[r_],
0,
new_obs_grad,
axis=0
)
#print("self.SVRG_anchor_gradients[r_]", self.SVRG_anchor_gradients[r_])
#print("AFTER grads:", self.SVRG_anchor_gradients[r_])
"""Note that 0 is just the newest index at this time
point (which we have added in this very step 2.2.2),
implying in particular that the index of all other
gradients is shifted forward by one"""
self.SVRG_anchor_gradient_indices[r_] = np.insert(
self.SVRG_anchor_gradient_indices[r_] + 1,
0,
0)
"""STEP 2.2.3: Get NEW gradients corresponding to indices
we randomly drew (stored in indices)
Note: num_SG gradient evaluations
"""
"""STEP 2.2.3.1: Get the params and args for gradient call"""
p = self.num_regressors
#lower_ind = np.tril_indices(p,0)
params = np.zeros(int(p*(p+1)*0.5) + p + 2)
params[0] = max(1.00001, self.a_rt[r])
params[1] = max(1.00001, self.b_rt[r] )#+
params[2:(p+2)] = self.beta_rt[r,:]
params[(p+2):] = self.L_rt[r,:,:][lower_ind]
#print("indices", indices)
C = 1/(1 - (1/(1 + cp_prob))) # = 1/(1-p)
prior_weight = (1.0/C) * pow(1/(1 + cp_prob), run_length+1)
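#Algebra of the two lines above: writing q = 1/(1 + cp_prob), we have
#C = 1/(1-q), so prior_weight = (1-q) * q^(run_length+1); the weight given
#to the prior thus decays geometrically in the run-length.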
args_new_grad = (self.a, self.b, self.prior_mean_beta,
self.prior_var_beta_inv,
alpha, np.size(indices), p,
self.K, self.E3, self.Rinv, self.E2_m_ba,
self.digamma_diff_1, self.digamma_diff_2,
self.gamma_ratio_2,
self.downweight1, self.downweight2,
self.XX_t, self.XY_t, self.YY_t,
self.L_0, self.S1*self.S2, prior_weight,
indices, True)
"""STEP 2.2.3.2: Perform gradient call & average result"""
#DEBUG: Recomputes the precomputation!
new_batch_grad = ((1/np.size(indices))
*BVARNIGDPD.get_grad_neg_ELBO_full(params, *args_new_grad))
"""STEP 2.2.4: Get OLD gradients corresponding to indices
we randomly drew, including the new one under 2.2.2
Note: NO evaluations, get them from memory!
"""
# if SCSG:
# print("ind before old batch", indices)
# print("r_ at old batch", r_)
# print("rl", self.retained_run_lengths[r_])
# print("all rls", self.retained_run_lengths)
# print("SVRG anchor grads at r_", self.SVRG_anchor_gradients[r_])
# print("all SVRG anchor grads", self.SVRG_anchor_gradients)
try:
old_batch_grad = np.mean(
self.SVRG_anchor_gradients[r_][indices], axis=0)
except IndexError:
print("ind before old batch", indices)
print("r_ at old batch", r_)
print("rl", self.retained_run_lengths[r_])
print("all rls", self.retained_run_lengths)
print("SVRG anchor grads at r_", self.SVRG_anchor_gradients[r_])
#print("all SVRG anchor grads", self.SVRG_anchor_gradients)
raise
"""STEP 2.2.5: Get the increment by subtracting old and adding
new gradient averages + anchor sum"""
increment = step_size * (new_batch_grad - old_batch_grad +
self.SVRG_anchor_sum[r_])
#scale a,b, beta, L differently
ab_scaling = 2.5
beta_scaling = 0.5
L_scaling = 1
#increment[ = beta_l_scaling * increment
increment[0] = ab_scaling*increment[0]
increment[1] = ab_scaling*increment[1]
increment[2:(p+2)] = beta_scaling * increment[2:(p+2)]
increment[(p+2):] = L_scaling * increment[(p+2):]
#print(increment)
"""Make sure that everything is within bounds when we take
the step"""
# print("old", old_batch_grad)
# print("new", new_batch_grad)
# #print("self.SVRG_anchor_sum[r]", self.SVRG_anchor_sum[r])
# print("increment", increment)
# print("bounds", self.bounds[0][0])
if self.a_rt[r] - increment[0] > self.a + int(0.75*
self.retained_run_lengths[r]):
"""get ratio of how much a-gradient result would be more efficient
than KL + modify gradients by downweighting accordinly"""
ratio = (self.a_rt[r] + increment[0])/(self.a +
int(0.5*self.retained_run_lengths[r]))
increment[0] = increment[0] * (1.0/ratio)
increment[1] = increment[1] * (1.0/ratio)
if self.a_rt[r] - increment[0] < self.bounds[0][0]:
ratio = (self.a_rt[r] + increment[0])/self.bounds[0][0]
increment[0] = increment[0] * (1.0/ratio)
increment[1] = increment[1] * (1.0/ratio)
if self.b_rt[r] - increment[1] < self.bounds[1][0]:
increment[1] = self.bounds[1][0] - self.b_rt[r]
"""STEP 2.2.6: Perform the update, i.e. take a step and re-
convert everything into matrix/vector objects"""
if np.isnan(increment).any():
print("We have a nan increment at position", np.where(np.isnan(increment)))
self.a_rt[r] = max(self.a_rt[r] - increment[0], 1.0001)
self.b_rt[r] = max(self.b_rt[r] - increment[1], 1.0001)
self.beta_rt[r,:] = self.beta_rt[r,:] - increment[2:(p+2)]
self.L_rt[r,:][lower_ind] = (self.L_rt[r,:][lower_ind] -
increment[(p+2):])
if step_count == 0 and SCSG == True:
"""We need to prune the too big values for SCSG"""
#if SCSG == True:
#DEBUG: incorrect.
#print("before trimming:", self.SVRG_anchor_gradients[r_])
l = len(self.SVRG_anchor_gradient_indices)
self.SVRG_anchor_gradients = [
self.SVRG_anchor_gradients[i][
np.where(self.SVRG_anchor_gradient_indices[i] <
self.VB_window_size)] for i in range(0,l)
]
# self.SVRG_anchor_gradients = [
# e[np.where(e < self.VB_window_size)] for e in
# self.SVRG_anchor_gradients ]
self.SVRG_anchor_gradient_indices = [
e[np.where(e < self.VB_window_size)] for e in
self.SVRG_anchor_gradient_indices ]
#print("after trimming:", self.SVRG_anchor_gradients[r_])
#REMARK: For SCSG, we would need to have another step here,
# namely the updating of our 'best estimate' every
# N_j steps (where N_j is random).
if SCSG == True and self.N_j[r_] == 0:
"""update the anchor. Gradient for this anchor will be computed
at next iteration!"""
self.anchor_params[r_][0] = self.a_rt[r_]
self.anchor_params[r_][1] = self.b_rt[r_]
self.anchor_params[r_][2:(p+2)] = self.beta_rt[r_,:]
self.anchor_params[r_][(p+2):] = self.L_rt[r_,:,:][lower_ind]
def LBFGSB_optimizer(self, r, num_obs, alpha, t,
#params, *args,
max_iter=15000, max_ls_iter=20, maxcor=10):
"""Use python's LBFGSB function to optimize while r<window_size"""
#ELBO needs maximization!
"""STEP 1: Get relevant arguments, put VB params in vector"""