In [1]:
import os, sys
import numpy as np
sys.path.append('..')
from sklearn.linear_model.coordinate_descent import _alpha_grid
from sklearn.linear_model import Lasso, LassoLars
from utils import gen_covariance, gen_beta2, gen_data, selection_accuracy
from pyuoi.linear_model import UoI_Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model.coordinate_descent import _alpha_grid
from sklearn.linear_model import lasso_path, lars_path

### Benchmark Lasso with coordinate descent vs. LARS

In [2]:
# Build the first benchmark problem with the project-local `utils` helpers:
# a covariance matrix, a coefficient vector, and train/test data drawn from them.
# gen_data is called as (n_samples, n_features, ...), i.e. 300 samples x 100 features.
sigma = gen_covariance(100, 0.25, 100, 5, 0.15)
beta = gen_beta2(n_features=100, block_size=100, sparsity=1)
X, X_test, y, y_test, _ = gen_data(
    300,
    100,
    covariance=sigma,
    beta=beta,
)

In [3]:
# Regularization strengths UoI would sweep for this (X, y): a 50-point grid
# produced by scikit-learn's private `_alpha_grid` helper.
alphas = _alpha_grid(X, y, n_alphas=50)

# Bare expression so the notebook displays the grid.
alphas

array([141.31204485, 122.7311182 , 106.59337207,  92.57755601,
        80.40466035,  69.83236202,  60.65020067,  52.6753891 ,
        45.74917455,  39.73367844,  34.50915164,  29.9715907 ,
        26.03066741,  22.60793071,  19.63524496,  17.05343358,
        14.81110102,  12.8636097 ,  11.17219133,   9.70317524,
         8.42731806,   7.31922158,   6.35682719,   5.52097672,
         4.79503107,   4.16453902,   3.61694951,   3.14136179,
         2.72830844,   2.36956691,   2.05799581,   1.78739277,
         1.55237095,   1.34825183,   1.17097205,   1.01700254,
         0.88327827,   0.76713723,   0.66626741,   0.57866082,
         0.5025735 ,   0.4364908 ,   0.37909723,   0.32925025,
         0.28595759,   0.24835742,   0.21570124,   0.18733898,
         0.16270604,   0.14131204])

In [6]:
%timeit r = lasso_path(X, y, alphas = alphas)

346 ms ± 12.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%timeit lars_path(X, y.ravel(), method = 'lasso')

17.8 ms ± 220 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
# Results are very favorable for the 100 feature problem. How about 1000 features?
n_features = 1000
n_samples = 3000

# Regenerate covariance, coefficients and data at the larger problem size.
sigma = gen_covariance(n_features, 0, n_features, 1, 0)
beta = gen_beta2(n_features = n_features, block_size = n_features, sparsity = 1)
X, X_test, y, y_test, _ = gen_data(n_samples, n_features, covariance = sigma, beta = beta)

# Rebuild the regularization grid for the new (X, y). Without this, the timing
# cell that follows would run lasso_path on the grid computed from the earlier
# 100-feature data set, which is not the range UoI would explore here.
alphas = _alpha_grid(X, y, n_alphas = 50)

In [9]:
%timeit r = lasso_path(X, y, alphas = alphas)
%timeit lars_path(X, y.ravel(), method = 'lasso')

10.4 s ± 433 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
812 ms ± 52 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# Time UoI Lasso with LARS for increasing p. Keep sparsity = 1 and introduce correlations to test the
# worst-case runtimes 

# Keep track of selection accuracy and r2 to make sure there is no drop-off in performance

n_features = np.array([100, 200, 300, 400, 500, 1000, 2000, 5000])
n_samples = 5 * n_features

r2 = np.zeros(n_features.size)
sa = np.zeros(n_features.size)
    
for i, nf in enumerate(n_features):
    sigma = gen_covariance(nf, 0, nf, 10, 0.25)
    beta = gen_beta2(n_features = nf, block_size = nf, sparsity = 1)
    X, X_test, y, y_test, _ = gen_data(n_samples[i], nf, covariance = sigma, beta = beta)
    
    uoil = UoI_Lasso()
    print('n_features: %d' % nf)
    %time uoil.fit(X, y.ravel())
    
    r2_score[i] = uoil.score(y_test, X_test)
    sa[i] = selection_accuracy(beta.ravel(), uoil.coef_.ravel())
    

n_features: 100
> /home/akumar/nse/PyUoI/pyuoi/linear_model/utils.py(122)intersection()
-> if selection_thresholds is None:
(Pdb) coefs
[array([[  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [174.34863825, 175.0113316 , 190.63860325, ..., 320.4379617 ,
        214.6633646 ,  76.23118463],
       [174.35293312, 175.02892766, 190.64056323, ..., 320.43293238,
        214.66298959,  76.24460901],
       [174.73640232, 176.24088417, 190.72268417, ..., 320.06934645,
        214.5541854 ,  77.21586773]]), array([[  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
 

(Pdb) coefs[0].shape
(103, 100)
(Pdb) coefs[1].shape
(101, 100)
(Pdb) coefs[2].shape
(105, 100)
(Pdb) coefs[3].shape
(103, 100)
(Pdb) len(coefs)
48
(Pdb) quit()


BdbQuit: 

NotFittedError: This UoI_Lasso instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

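### Need to determine how to perform the intersection over different LARS paths: each bootstrap produces a path with a different number of steps (e.g. 103, 101 and 105 rows above), so the paths cannot be compared index-by-index. Manually perform the bootstrapping and compare the paths generated by LARS: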

In [5]:
n_features = 100
n_samples = 300

# Uncorrelated design with 20% sparsity in beta, generated with the project-local helpers.
sigma = gen_covariance(n_features, 0, n_features, 1, 0)
beta = gen_beta2(n_features = n_features, block_size = n_features, sparsity = 0.2)
X, X_test, y, y_test, _ = gen_data(n_samples, n_features, covariance = sigma, beta = beta)

n_boots = 10
boots = []  # per resample: the [train_idx, test_idx] pair from train_test_split
coefs = []  # per resample: whatever lars_path returns (stored unmodified)

# Generate bootstrap samples
for n in range(n_boots):

    # Random 90/10 split of the row indices of X.
    boots.append(train_test_split(np.arange(X.shape[0]), test_size = 0.1))
    idxs_train, idxs_test = boots[n]

    # Only the training part is needed to fit a path. The held-out X_test / y_test
    # returned by gen_data are deliberately left alone: the previous code overwrote
    # them with rows of the training data that were never used.
    X_rep = X[idxs_train]
    y_rep = y[idxs_train]

    # NOTE(review): lars_path is called with its default `method` here, while the
    # timing cells pass method='lasso' -- confirm which path is intended.
    coef = lars_path(X_rep, y_rep.ravel())
    coefs.append(coef)

In [15]:
# Much denser regularization grid (10,000 points) for the current X, y.
alphas = _alpha_grid(X, y, n_alphas=10_000)

In [20]:
# Smallest regularization strength on the dense grid.
np.min(alphas)

0.011378201804767786

### Approach 1: Densely sample the solution path at regular intervals across all bootstraps

In [4]:
n_features = 500
n_samples = 1500

sigma = gen_covariance(n_features, 0, n_features, 1, 0)
beta = gen_beta2(n_features = n_features, block_size = n_features, sparsity = 1)
X, X_test, y, y_test, _ = gen_data(n_samples, n_features, covariance = sigma, beta = beta)

uoil = UoI_Lasso()
%time uoil.fit(X, y.ravel())

CPU times: user 18min 19s, sys: 3.05 s, total: 18min 22s
Wall time: 3min 10s


UoI_Lasso(comm=None, copy_X=True, eps=0.001, estimation_frac=0.9,
     estimation_score='r2', fit_intercept=True, max_iter=None,
     n_boots_est=48, n_boots_sel=48, n_lambdas=48, normalize=True,
     random_state=<module 'numpy.random' from '/home/akumar/anaconda3/envs/nse/lib/python3.6/site-packages/numpy/random/__init__.py'>,
     selection_frac=0.9, stability_selection=1.0, warm_start=None)

#### Seems to work. Next, time it against coordinate descent with warm start

In [4]:
n_features = 100
n_samples = 300

sigma = gen_covariance(n_features, 0, n_features, 1, 0)
beta = gen_beta2(n_features = n_features, block_size = n_features, sparsity = 1)
X, X_test, y, y_test, _ = gen_data(n_samples, n_features, covariance = sigma, beta = beta)

uoil = UoI_Lasso(n_lambdas = 48)
%time uoil.fit(X, y.ravel())

CPU times: user 31.8 s, sys: 19.5 ms, total: 31.8 s
Wall time: 5.47 s


UoI_Lasso(comm=None, copy_X=True, eps=0.001, estimation_frac=0.9,
     estimation_score='r2', fit_intercept=True, max_iter=None,
     n_boots_est=48, n_boots_sel=48, n_lambdas=48, normalize=True,
     random_state=<module 'numpy.random' from '/home/akumar/anaconda3/envs/nse/lib/python3.6/site-packages/numpy/random/__init__.py'>,
     selection_frac=0.9, stability_selection=1.0, warm_start=None)

In [7]:
# Compare the true coefficients with the fitted ones via the project-local
# `selection_accuracy` helper (presumably a support-recovery score -- TODO confirm).
selection_accuracy(np.ravel(beta), uoil.coef_)

array([0.845])

In [None]:
## Linear interpolation is time intensive? Or perhap