In [1]:
import sys
sys.path.append('../')
sys.path.append('../skimfa')  # TODO: replace once code is a python package

import math
from skimfa.kernels import PairwiseSKIMFABasisKernel
from feature_maps import LinearFeatureMap
from fit import *

In [2]:
# Fix PyTorch's global RNG seed so the random draws in this notebook are repeatable
SEED = 32312
torch.manual_seed(SEED)

<torch._C.Generator at 0x11d5740f0>

# Generate synthetic data:

- p = 500 covariates
- N = 500 training datapoints 
- First 5 covariates have main / interaction effects w/ response; remaining 495 covariates have no influence on the response
- Linear main and interaction effects
- The signal variance / total variance (i.e., R^2) equals .8

In [3]:
### Generate Covariates ###
# Problem dimensions: number of covariates, then number of rows in each data split
p = 500
N_train, N_test, N_valid = 500, 100, 100

# Standard-normal design matrices of shape (rows, p).
# The splits are drawn in train -> test -> valid order so the seeded random
# stream is consumed in the same sequence every run.
X_train, X_test, X_valid = (
    torch.normal(mean=0., std=1., size=(n_rows, p))
    for n_rows in (N_train, N_test, N_valid)
)

In [4]:
### Generate Main and Interaction Effects ###
K = 5 # First K covariates influence response
main_effects = dict()       # covariate index -> main-effect coefficient
interaction_effects = dict()  # (covariate index, covariate index) -> interaction coefficient

# Generate main effects: one draw from N(mean=1, std=1) per relevant covariate.
# The loop runs over range(K) (not a hard-coded 5) so that K stays the single
# source of truth for how many covariates carry signal.
for cov_ix in range(K):
    main_effects[cov_ix] = torch.normal(mean=1., std=1., size=(1, )).item()

# Generate K - 1 pairwise interaction effects between adjacent relevant covariates:
# (0, 1), (1, 2), ..., (K - 2, K - 1). For K = 5 these are exactly the 4 pairs
# (0, 1), (1, 2), (2, 3), (3, 4).
for cov_ix1, cov_ix2 in zip(range(K - 1), range(1, K)):
    interaction_effects[(cov_ix1, cov_ix2)] = torch.normal(mean=1., std=1., size=(1, )).item()

In [5]:
### Generate Response ###
def generate_noiseless_response(X, main_effects, interaction_effects):
    """Compute the noise-free response as a sum of main and pairwise interaction terms.

    For every row of ``X`` this evaluates

        sum_j  main_effects[j] * X[:, j]
        + sum_(j, k)  interaction_effects[(j, k)] * X[:, j] * X[:, k]

    Args:
        X: 2-D tensor indexed as ``X[:, column]``; ``X.shape[0]`` is the number of rows.
        main_effects: dict mapping a column index to its coefficient.
        interaction_effects: dict mapping a ``(column, column)`` pair to its coefficient.

    Returns:
        1-D tensor of length ``X.shape[0]`` holding the signal for each row. It lives
        on ``X``'s device and its dtype is the promotion of ``X.dtype`` with the
        default float dtype (so float32 input -> float32, float64 input -> float64).
    """
    # Allocate the accumulator to match X. A plain torch.zeros(n) is always
    # default-dtype on CPU, which silently downcasts float64 inputs on the
    # in-place adds below and fails outright for tensors on another device.
    acc_dtype = torch.promote_types(X.dtype, torch.get_default_dtype())
    Y_signal = torch.zeros(X.shape[0], dtype=acc_dtype, device=X.device)

    for cov_ix, effect in main_effects.items():
        Y_signal += effect * X[:, cov_ix]

    for (cov_ix1, cov_ix2), effect in interaction_effects.items():
        Y_signal += effect * X[:, cov_ix1] * X[:, cov_ix2]

    return Y_signal

# Noise-free signal for each split, computed with the same effect dictionaries
Y_train_noiseless, Y_test_noiseless, Y_valid_noiseless = (
    generate_noiseless_response(X_split, main_effects, interaction_effects)
    for X_split in (X_train, X_test, X_valid)
)

# Add noise so that R^2 = .8
# Since R^2 = signal_var / (signal_var + noise_var), solving for noise_var gives
# (1 - R^2) * signal_var / R^2. The signal variance is approximated by the
# sample variance of the noise-free training response.
R2 = .8
approx_signal_var = Y_train_noiseless.var().item()
noise_var = (1 - R2) * approx_signal_var / R2
noise_std = math.sqrt(noise_var)

# Independent standard-normal noise scaled by noise_std, drawn in train -> test -> valid order
Y_train = Y_train_noiseless + noise_std * torch.normal(mean=0., std=1., size=(N_train, ))
Y_test = Y_test_noiseless + noise_std * torch.normal(mean=0., std=1., size=(N_test, ))
Y_valid = Y_valid_noiseless + noise_std * torch.normal(mean=0., std=1., size=(N_valid, ))

# Fit SKIM-FA Model
- Includes all main and pairwise interaction effects (linear)
- Performs variable selection
- Estimates effects (ANOVA decomposition)

In [6]:
# Step 1: Build the feature map.
# In the linear-interaction case this just standardizes each covariate to zero mean and
# unit variance. The means and variances are estimated from the training data and stored
# for future use, e.g., to standardize new test data.
covariate_dims = [*range(p)]
# Covariate types are irrelevant for now (in the future the selected feature map
# will depend on the covariate type).
covariate_types = p * ['continuous']
linfeatmap = LinearFeatureMap(covariate_dims, covariate_types)
linfeatmap.make_feature_map(X_train)

# Step 2: Kernel configuration
kernel_config = {
    'uncorrected': True,
    'rescale': 1.,
    'feat_map': linfeatmap,
    'cache': True,
    'Q': 2,  # all main and pairwise interaction effects
}

# Step 3: Optimization configuration
optimization_config = {
    'T': 2000,  # 2000 total gradient steps
    'M': 100,  # size of cross-validation random sample
    'param_save_freq': 100,  # save model weights every 100 iterations
    'valid_report_freq': 100,  # how often to report MSE on validation set
    'lr': .1,
    'train_noise': False,
    'noise_var_init': Y_train.var().detach().item(),  # start from the variance of the training response
    'truncScheduler': adaptive_cutoff_scheduler,
}

In [7]:
# Fit SKIM-FA
# Collect the training and validation splits into the single dict handed to the fit call
train_valid_data = {
    'X_train': X_train,
    'Y_train': Y_train,
    'X_valid': X_valid,
    'Y_valid': Y_valid,
}

# Workaround kept from the original author: on their computer a matrix had to be
# inverted at this point to avoid a very strange "segmentation fault: 11" error.
# The matrix itself is throwaway and the result is not used by the model.
import numpy as np
X_weird = np.random.normal(size=(500, 100))
np.linalg.inv(X_weird.T.dot(X_weird))

array([[ 2.49272743e-03, -1.73268019e-04,  4.81579738e-05, ...,
         3.53559434e-05, -2.12599787e-05, -2.56288452e-05],
       [-1.73268019e-04,  2.31965807e-03,  1.36295995e-04, ...,
        -7.69477693e-05, -1.38438000e-04, -8.93018936e-05],
       [ 4.81579738e-05,  1.36295995e-04,  2.61839413e-03, ...,
        -4.85305395e-06, -1.05111608e-04,  1.83291719e-04],
       ...,
       [ 3.53559434e-05, -7.69477693e-05, -4.85305395e-06, ...,
         2.46330827e-03, -3.96408811e-05, -9.03358800e-05],
       [-2.12599787e-05, -1.38438000e-04, -1.05111608e-04, ...,
        -3.96408811e-05,  2.40038823e-03,  1.15984093e-04],
       [-2.56288452e-05, -8.93018936e-05,  1.83291719e-04, ...,
        -9.03358800e-05,  1.15984093e-04,  2.52778102e-03]])

In [8]:
# Create the SKIM-FA estimator and fit it on the bundled train/validation data, passing the
# pairwise basis kernel class together with the kernel and optimization configuration dicts.
# NOTE(review): `SKIMFA` is not imported by name in this notebook; presumably it comes
# from `from fit import *` -- confirm.
skimfit = SKIMFA()
skimfit.fit(train_valid_data, PairwiseSKIMFABasisKernel, kernel_config, optimization_config)

  0%|          | 7/2000 [00:00<01:06, 29.92it/s]

Mean-Squared Prediction Error on Validation (Iteration=0): 10.053
Number Covariates Selected=500


  5%|▌         | 107/2000 [00:03<00:54, 35.05it/s]

Mean-Squared Prediction Error on Validation (Iteration=100): 9.957
Number Covariates Selected=500


 10%|█         | 207/2000 [00:05<00:51, 34.52it/s]

Mean-Squared Prediction Error on Validation (Iteration=200): 7.17
Number Covariates Selected=500


 15%|█▌        | 307/2000 [00:08<00:48, 35.04it/s]

Mean-Squared Prediction Error on Validation (Iteration=300): 7.432
Number Covariates Selected=500


 20%|██        | 407/2000 [00:11<00:46, 34.01it/s]

Mean-Squared Prediction Error on Validation (Iteration=400): 7.489
Number Covariates Selected=500


 25%|██▌       | 507/2000 [00:14<00:43, 34.53it/s]

Mean-Squared Prediction Error on Validation (Iteration=500): 7.836
Number Covariates Selected=500


 30%|███       | 607/2000 [00:17<00:41, 33.48it/s]

Mean-Squared Prediction Error on Validation (Iteration=600): 6.041
Number Covariates Selected=172


 35%|███▌      | 707/2000 [00:20<00:37, 34.10it/s]

Mean-Squared Prediction Error on Validation (Iteration=700): 3.357
Number Covariates Selected=60


 40%|████      | 807/2000 [00:23<00:35, 33.71it/s]

Mean-Squared Prediction Error on Validation (Iteration=800): 2.391
Number Covariates Selected=7


 45%|████▌     | 907/2000 [00:26<00:32, 33.30it/s]

Mean-Squared Prediction Error on Validation (Iteration=900): 2.398
Number Covariates Selected=7


 50%|█████     | 1007/2000 [00:29<00:29, 33.93it/s]

Mean-Squared Prediction Error on Validation (Iteration=1000): 2.307
Number Covariates Selected=7


 55%|█████▌    | 1107/2000 [00:32<00:26, 33.67it/s]

Mean-Squared Prediction Error on Validation (Iteration=1100): 2.143
Number Covariates Selected=6


 60%|██████    | 1207/2000 [00:35<00:23, 34.18it/s]

Mean-Squared Prediction Error on Validation (Iteration=1200): 2.308
Number Covariates Selected=6


 65%|██████▌   | 1307/2000 [00:38<00:20, 33.77it/s]

Mean-Squared Prediction Error on Validation (Iteration=1300): 2.26
Number Covariates Selected=6


 70%|███████   | 1407/2000 [00:41<00:18, 32.53it/s]

Mean-Squared Prediction Error on Validation (Iteration=1400): 2.245
Number Covariates Selected=6


 75%|███████▌  | 1507/2000 [00:44<00:14, 33.71it/s]

Mean-Squared Prediction Error on Validation (Iteration=1500): 2.287
Number Covariates Selected=6


 80%|████████  | 1607/2000 [00:47<00:11, 33.69it/s]

Mean-Squared Prediction Error on Validation (Iteration=1600): 2.231
Number Covariates Selected=6


 85%|████████▌ | 1707/2000 [00:50<00:08, 33.67it/s]

Mean-Squared Prediction Error on Validation (Iteration=1700): 2.417
Number Covariates Selected=6


 90%|█████████ | 1807/2000 [00:53<00:05, 32.78it/s]

Mean-Squared Prediction Error on Validation (Iteration=1800): 2.21
Number Covariates Selected=6


 95%|█████████▌| 1907/2000 [00:56<00:02, 32.28it/s]

Mean-Squared Prediction Error on Validation (Iteration=1900): 2.212
Number Covariates Selected=6


100%|██████████| 2000/2000 [00:59<00:00, 33.61it/s]

Mean-Squared Prediction Error on Validation (Iteration=1999): 2.316
Number Covariates Selected=6





# See how well SKIM-FA did in terms of variable selection and estimation

In [18]:
# Variable selection
# Indices of the covariates the fitted model selected, each unwrapped with .item()
selected_covs = {selected.item() for selected in skimfit.get_selected_covariates()}
# Ground truth: the first K covariates are the ones that influence the response
correct_covs = set(range(K))

# Truly relevant covariates that were also selected by the model
selected_covs.intersection(correct_covs)