# Early Stopping VI Tutorial

As an example, we demonstrate the use of our Early Stopping VI methodology on the following simple linear system used in LazyVI (Gao et al. 2022):

$$f(x) = 1.5x_1 + 1.2x_2 + x_3 + \epsilon$$

where $\epsilon \sim N(0,0.1)$ and $X \sim N(0, \Sigma_{6 \times 6})$. The response depends only on the first three of the six variables. All variables are independent except for $x_1$ and $x_2$, whose correlation is $\rho$.  

We drop $x_1$; the true VI under negative MSE is then $VI_1 = (1.5)^2(1-\rho^2)$. Below we show how to apply our proposed early-stopping warm-start framework, 
using a neural network and a GBDT, to estimate $VI_1$. 

## Neural Network

In [21]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from nn_utils import *

import time
from sklearn.model_selection import KFold


from itertools import chain, combinations
import scipy.special

#### generate data

In [22]:
# Simulation settings for the linear system in the introduction:
# rho is passed to the generator as `corr`, and beta holds the six linear
# coefficients (only the first three are non-zero).
rho = 0.5
beta = np.array([1.5, 1.2, 1.0, 0.0, 0.0, 0.0])

# generate_linear_data is not defined in this notebook; presumably it comes
# from the `nn_utils` star import above — TODO confirm.
X, Y = generate_linear_data(
    beta=beta,
    sigma=0.1,
    corr=rho,
)


#### train full network

In [23]:

# Index of the covariate whose importance is being estimated.
drop_i = 0

# Layer widths handed to LazyNet; presumably [input, hidden, output] — TODO confirm
# against nn_utils.
m = 128
p = beta.shape[0]
widths = [p, m, 1]

# Seed Python, NumPy and PyTorch before weight initialisation and training so that
# re-running this cell on the same X, Y reproduces the same fitted network.
# (The splits below are already deterministic through random_state=1.)
pl.seed_everything(1)

# Hold out a test set, then split the remainder into training and validation sets.
X_fit, X_test, y_fit, y_test = train_test_split(X, Y, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_fit, y_fit, random_state=1)

# Copy of the fitting data with column drop_i overwritten by that column's mean.
# NOTE(review): no later cell visible in this notebook reads X_fit_drop — confirm
# whether it is still needed.
X_fit_drop = X_fit.clone()
X_fit_drop[:, drop_i] = torch.mean(X_fit_drop[:, drop_i])

# X_fit.clone() above only works on tensors, so the splits are tensors already.
# torch.as_tensor casts to float32 without the copy-construct UserWarning that
# torch.tensor(<existing tensor>) emits (hidden in this notebook by
# warnings.filterwarnings('ignore')).
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_val = torch.as_tensor(X_val, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.float32)
y_val = torch.as_tensor(y_val, dtype=torch.float32)
dm = FlexDataModule(X_train, y_train, X_val, y_val)

# Train the full model on all p covariates. The EarlyStopping callback monitors
# 'val_loss' with min_delta=1e-3; max_epochs caps training at 800 epochs.
lr = 0.1

full_nn = LazyNet(widths, lr=lr)
full_nn.reset_parameters()
early_stopping = EarlyStopping('val_loss', min_delta=1e-3)
cb = MetricTracker()
trainer = pl.Trainer(
    callbacks=[cb, early_stopping],
    max_epochs=800,
    enable_progress_bar=False,
    enable_model_summary=False,
)
trainer.fit(full_nn, dm)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


#### train early stopping network

In [24]:

# Column indices handed to dropdata; index 0 matches drop_i in the previous cell.
dropS = [0]
X_train_drop, X_val_drop, X_test_drop = dropdata(X_train, X_val, X_test, dropS)

# Reduced network with the same widths as the full one. init_parameters receives
# the trained full network; presumably it copies its weights as the warm start —
# TODO confirm against nn_utils.
lazy_nn = LazyNet(widths, lr=0.1)
lazy_nn.init_parameters(full_nn)

# Same targets as before, but features come from the dropdata outputs.
dm_lazy = FlexDataModule(X_train_drop, y_train, X_val_drop, y_val)
early_stopping = EarlyStopping('val_loss', min_delta=1e-3)
cb = MetricTracker()

# Same trainer configuration as for the full model, with a smaller epoch cap.
trainer_options = dict(
    callbacks=[cb, early_stopping],
    max_epochs=100,
    enable_progress_bar=False,
    enable_model_summary=False,
)
trainer = pl.Trainer(**trainer_options)
trainer.fit(lazy_nn, dm_lazy)

# Epoch counter reported by the trainer after fitting has finished.
print(trainer.current_epoch)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


4


In [25]:
# Evaluation only: no gradients are needed, so run the forward passes under
# torch.no_grad() instead of recording autograd graphs.
with torch.no_grad():
    # Test MSE of the full network on the original covariates. Computed once and
    # used as the baseline for both estimates below.
    mse_full = torch.mean((full_nn(X_test) - y_test)**2)
    # Early-stopping estimate: MSE of the warm-started network on X_test_drop
    # minus the full network's MSE.
    vi_est = torch.mean((lazy_nn(X_test_drop) - y_test)**2) - mse_full
    # Baseline without retraining: the full network evaluated on X_test_drop.
    vi_drop = torch.mean((full_nn(X_test_drop) - y_test)**2) - mse_full

print('es nn vi ',vi_est.item())
print('drop nn vi',vi_drop.item())
# Closed-form value quoted in the introduction.
print('true vi', 1.5**2*(1-rho**2))

es nn vi  1.6991448402404785
drop nn vi 2.2758593559265137
true vi 1.6875


## GBDT

In [26]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from utils_gdbt import *

import time
from sklearn.model_selection import KFold


from itertools import chain, combinations
import scipy.special

#### generate data

In [52]:
# Same simulation settings as in the neural-network section.
rho = 0.5
beta = np.array([1.5, 1.2, 1.0, 0.0, 0.0, 0.0])

# NOTE(review): after `from utils_gdbt import *` this name presumably resolves to
# the utils_gdbt version of generate_linear_data, not the nn_utils one — confirm.
X, Y = generate_linear_data(
    beta=beta,
    sigma=0.1,
    corr=rho,
)

#### train full GBDT

In [53]:

# Index of the covariate whose importance is being estimated.
drop_i = 0

# Hold out a test set, then split the remainder into training and validation sets.
X_fit, X_test, y_fit, y_test = train_test_split(X, Y, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_fit, y_fit, random_state=1)

# Copy of the fitting data with column drop_i overwritten by that column's mean.
# NOTE(review): no later cell visible in this notebook reads X_fit_drop — confirm
# whether it is still needed.
X_fit_drop = np.copy(X_fit)
X_fit_drop[:, drop_i] = np.mean(X_fit_drop[:, drop_i])

# Boosting settings. es_lr is the learning rate used by the early-stopping model
# in the next cell; plot is forwarded to CatBoost's fit().
max_iter = 1000
depth = 3
lr = 0.1
es_lr = 0.1
plot = False

train_pool = Pool(X_train, y_train)
model_full = CatBoostRegressor(
    iterations=max_iter,
    depth=depth,
    learning_rate=lr,
    random_strength=10000,
    loss_function='RMSE',
    verbose=False,
    # `random_seed` is the documented CatBoost name for the training seed
    # (alias `random_state`); the former `seed=` keyword is not part of the
    # documented constructor signature.
    random_seed=1,
    feature_border_type='Median',
    score_function='L2',
)
# The validation set drives early stopping (10 rounds).
model_full.fit(train_pool, eval_set=(X_val, y_val), early_stopping_rounds=10, plot=plot)

# Test MSE of the full model; reused as the baseline in the following cells.
pre_full = np.mean((model_full.predict(X_test) - y_test)**2)

        

#### train early stopping GBDT

In [56]:
# Column indices handed to dropdata; index 0 matches drop_i in the previous cell.
dropS = [0]
X_train_drop, X_val_drop, X_test_drop = dropdata(X_train, X_val, X_test, dropS)

# Baseline without retraining: test MSE of the full model on X_test_drop minus
# its test MSE on the original covariates (pre_full).
mse_full_on_drop = np.mean((model_full.predict(X_test_drop) - y_test)**2)
vi_drop = mse_full_on_drop - pre_full

train_pool_red = Pool(X_train_drop, y_train)

# Same settings as the full model, except that the learning rate is es_lr and
# no seed is passed.
es_params = dict(
    iterations=max_iter,
    depth=depth,
    learning_rate=es_lr,
    random_strength=10000,
    loss_function='RMSE',
    verbose=False,
    feature_border_type='Median',
    score_function='L2',
)
model_es = CatBoostRegressor(**es_params)

# Warm start: boosting continues from model_full (init_model) on the reduced data,
# with early stopping evaluated on the reduced validation set.
model_es.fit(
    train_pool_red,
    eval_set=(X_val_drop, y_val),
    init_model=model_full,
    early_stopping_rounds=10,
    plot=plot,
)

     

<catboost.core.CatBoostRegressor at 0x7f8f2234e4f0>

In [57]:
# Early-stopping estimate: test MSE of the warm-started model on X_test_drop
# minus the full model's test MSE (pre_full).
mse_es = np.mean((model_es.predict(X_test_drop) - y_test)**2)
vi_est = mse_es - pre_full

print('es GBDT vi ',vi_est)
print('drop GBDT vi',vi_drop)
# Closed-form value quoted in the introduction.
print('true vi', 1.5**2*(1-rho**2))

es GBDT vi  1.6533495699790286
drop GBDT vi 2.0786511029884336
true vi 1.6875
