In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# %load ../../loaders/imports.py
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import time
import pdb

# Add the uoicorr directory to the path
sys.path.append('../../../uoicorr_run')

# Add the root directory of this repository
sys.path.append('../..')

from postprocess_utils import *
import pandas as pd
import sqlalchemy

In [3]:
from job_utils.idxpckl import Indexed_Pickle

In [6]:
root_dir = '/media/akumar/Data/nse'

In [8]:
lasso = pd.read_pickle('%s/finalfinal/lasso_concat_df.dat' % root_dir)
mcp = pd.read_pickle('%s/finalfinal/mcp_concat_df.dat' % root_dir)
scad = pd.read_pickle('%s/finalfinal/scad_concat_df.dat' % root_dir)
en = pd.read_pickle('%s/finalfinal/en_concat_df.dat' % root_dir)
uoi = pd.read_pickle('%s/finalfinal/uoi_concat_df.dat' % root_dir)

In [9]:
from plotting_utils import *

In [15]:
from utils import gen_data, gen_covariance, sparsify_beta, gen_beta2

### Expand alpha datalist to encompass all SNR/N/P ratios

In [10]:
import pickle

In [11]:
# Load precalculated ss and eigenvalue bounds
with open('eigenvalue_ss.dat', 'rb') as f:
    eigenvalue_bounds = pickle.load(f)
    ss = pickle.load(f)

In [12]:
sparsity = np.unique(uoi['sparsity'].values)

In [13]:
from expanded_ensemble import load_covariance

In [16]:
# Pre-sparsifty the beta for inverse exponential distribution and save away, as this is a time sink
sparse_beta = []
cov_idxs = np.arange(120)
for k, s in enumerate(sparsity):            
    for cov_idx in cov_idxs:
        _, cov_param = load_covariance(cov_idx)
        # take the minimum non-zero beta value
        beta = gen_beta2(500, 500, 
                         1, -1, seed=1234, distribution='normal')        
        # Sparsify beta
        beta = sparsify_beta(beta, cov_param['block_size'], s,
                             seed = cov_param['block_size'])
        sparse_beta.append({'sparsity': s, 'cov_idx': cov_idx, 'beta': beta})

In [17]:
cov_params = [] 
for cov_idx in np.arange(120):
    _, cp = load_covariance(cov_idx)
    cov_params.append(cp)
    

In [18]:
sparse_beta = pd.DataFrame(sparse_beta)

In [19]:
# Eigenvalue constant
def calc_alpha_sa(cov_indices, df, rho, ss_, flag, threshold=1):
    t0 = time.time()
    alphas = np.zeros(len(cov_indices))    
    sa = np.zeros((len(cov_indices)))
    for i, cov_idx in enumerate(cov_indices):
        df_ = apply_df_filters(df, cov_idx=cov_idx)        
        cov_param = cov_params[cov_idx]

        # Use pregenerated beta for inverse exp due to slow time for rejection sampling
        if df_.iloc[0]['betawidth'] == -1:
            sb = apply_df_filters(sparse_beta, cov_idx=cov_idx, sparsity=df.iloc[0]['sparsity'])
            beta = sb.iloc[0]['beta']
        else:
            # take the minimum non-zero beta value
            beta = gen_beta2(df_.iloc[0]['n_features'], df_.iloc[0]['n_features'], 1, df_.iloc[0]['betawidth'], seed=1234, distribution='normal')                
            # Sparsify beta
            beta = sparsify_beta(beta[np.newaxis, :], cov_param['block_size'], df_.iloc[0]['sparsity'], seed = cov_param['block_size'])

        beta=beta.ravel()

        alphas[i] = np.mean(rho[i] * np.min(np.abs(beta[np.nonzero(beta)[0]]))/ss_[i])

        # Just return the average selection accuracy
        if flag is None:
            sa[i] = np.mean(df_['sa'].values)

        if flag == 'threshold':
            sa[i] = np.count_nonzero(1 * df_.iloc[cov_indices[i]]['sa'].values > threshold)/len(cov_indices[i])
           
    return alphas, sa

In [25]:
alpha_datalist_u


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79])

In [89]:
# Plot either the average selection accuracy or the percent of runs that exceed a certain threshold as a 
# function of the parameter alpha. alpha = rho(Omega) sum(beta_min^2)/sigma^2

# Calculate across signal to noise ratios, n/p ratio = 4
kappa = np.unique(lasso['kappa'].values)
np_ratios = np.unique(lasso['np_ratio'].values)
betawidth = np.unique(lasso['betawidth'].values)
selection_methods = np.unique(lasso['selection_method'])
dframes = [uoi, lasso, mcp, scad, en]
#dframes = [uoi, en]
dframe_names = ['UoI Lasso', 'Lasso', 'MCP', 'SCAD', 'EN']
# Need to create a dataframe 
alpha_datalist_uoi = []

for i, dframe in enumerate(dframes):                
    for j, bw in enumerate(betawidth):
        t0 = time.time()
        for h, sm in enumerate(selection_methods):
            for k, s in enumerate(sparsity):            
                for ii, kappa_ in enumerate(kappa):
                    for jj, np_ratio_ in enumerate(np_ratios):
                        df = apply_df_filters(dframe, kappa=kappa_, np_ratio=np_ratio_, selection_method=sm,
                                              betawidth=bw, sparsity=s)
                        cov_indices = np.unique(df['cov_idx'].values) 
                        alpha_, sa_ = calc_alpha_sa(cov_indices, df, rho=eigenvalue_bounds[:, k],
                                                    ss_=ss[:, k, j], flag=None) 
                        alpha_datalist_uoi.append({'df_name' : dframe_names[i], 'betawidth': bw, 'sparsity' : s,
                                               'alpha': alpha_, 'sa': sa_, 'kappa' : kappa_, 
                                               'selection_method': sm, 'np_ratio': np_ratio_,
                                               'cov_indices': cov_indices})       
        print('Dataframe: %s, bw_idx: %d, time: %f' % (dframe_names[i], j, time.time() - t0))

Dataframe: UoI Lasso, bw_idx: 0, time: 811.305557
Dataframe: UoI Lasso, bw_idx: 1, time: 758.539385
Dataframe: UoI Lasso, bw_idx: 2, time: 757.437975
Dataframe: Lasso, bw_idx: 0, time: 1148.994747
Dataframe: Lasso, bw_idx: 1, time: 1048.645408
Dataframe: Lasso, bw_idx: 2, time: 1047.702638
Dataframe: MCP, bw_idx: 0, time: 1114.218609
Dataframe: MCP, bw_idx: 1, time: 1023.190513
Dataframe: MCP, bw_idx: 2, time: 1018.283912
Dataframe: SCAD, bw_idx: 0, time: 1089.721250
Dataframe: SCAD, bw_idx: 1, time: 1021.049241
Dataframe: SCAD, bw_idx: 2, time: 1017.992569
Dataframe: EN, bw_idx: 0, time: 1093.112985
Dataframe: EN, bw_idx: 1, time: 1025.993290
Dataframe: EN, bw_idx: 2, time: 1021.290996


In [23]:
# Save 
import pickle
with open('alpha_datalist_expanded.dat', 'wb') as f:
    f.write(pickle.dumps(alpha_datalist_uoi))

In [26]:
alpha_datalist = pd.DataFrame(alpha_datalist_uoi)

In [36]:
uoi_ = apply_df_filters(uoi, betawidth=-1, sparsity=alpha_datalist.iloc[0]['sparsity'], kappa=1, selection_method='AIC',
                        np_ratio=2)

In [35]:
alpha_datalist.iloc[0]

df_name                                                     UoI Lasso
betawidth                                                          -1
sparsity                                                         0.02
alpha               [0.00037392887190799127, 0.00793968761035703, ...
sa                  [0.10931532289733101, 0.17024821340990917, 0.1...
kappa                                                               1
selection_method                                                  AIC
np_ratio                                                            2
Name: 0, dtype: object

In [None]:
# Adopt the following scheme to simplify the metric presented:

# Across model densities:
# (1) Divide into 3 equal correlation strengths (alpha)
# (2) Divide into "high noise", "low data", "high noise, low data", "ideal" conditions
# (3) Keep betawidth together
# (4) Calculate E(FNR), E(FPR) and variance for each combination of selection method/algorithm

In [39]:
np_ratios

array([ 2,  4,  8, 16])

In [46]:
dframes = [uoi, lasso, mcp, en, scad]

In [87]:
def calc_(case_alpha_df, case_params):
    
    results_list =[]    
    
    # Narrow down dframes to case params
    case_dframes = [apply_df_filters(df, **case_params) for df in dframes]
    
    dframe_names = ['UoI Lasso', 'Lasso', 'MCP', 'EN', 'SCAD']
    selection_methods = ['AIC', 'BIC', 'CV', 'gMDL', 'empirical_bayes', 'oracle']
    
    for i, dframe in enumerate(case_dframes):
        for sm in selection_methods:
            # At this stage, only betawidths should be residual
            cadf = apply_df_filters(case_alpha_df, df_name=dframe_names[i], selection_method=sm)
            df_ = apply_df_filters(dframe, selection_method=sm)
            
            try:
                assert(cadf.shape[0] == 3)
            except:
                pdb.set_trace()
            alpha_sizes = [cadf.iloc[ii]['alpha'].size for ii in range(3)]
            assert(alpha_sizes[0] == alpha_sizes[1] and alpha_sizes[1] == alpha_sizes[2])
            
            # If so, just grab the alphas from the first betawidth in the 
            alphas = case_alpha_df.iloc[0]['alpha'].ravel()
            
            try:
                # Make sure that the number of unique cov_idxs match the number of alphas
                assert(np.unique(df_['cov_idx'].values).size == alphas.size)
            except:
                pdb.set_trace()
            # mask the alpha indices
            alpha_mask = np.ma.log(alphas)
            mask = np.invert(alpha_mask.mask)
            
            alpha_ordering= np.argsort(np.log(alphas[mask]))
            
            # group cov_idxs appropriately into 3 groups
            cov_idx_groups = np.array_split(np.arange(alphas.size)[mask][alpha_ordering], 3)

            for k, cig in enumerate(cov_idx_groups):
                
                df_cig = df_.loc[df_['cov_idx'].isin(cig)]
                # Take the mean and variance of the FPR and FNR
                EFPR = np.mean(df_cig['FPR'].values)
                EFNR = np.mean(df_cig['FNR'].values)
                stdFPR = np.std(df_cig['FPR'].values)
                stdFNR = np.std(df_cig['FNR'].values)

                
                # Append the 
                results = {'df_name': dframe_names[i], 'selection_method': sm,
                           'EFPR': EFPR, 'EFNR': EFNR, 'stdFPR': stdFPR, 'stdFNR': stdFNR,
                           'cidx_group': k, 'cov_indices': cig}
                results.update(case_params)
                
                results_list.append(results)
    return results_list

In [88]:
e_fpr_fnr = []
for i, s in enumerate(sparsity):
    
    adl = apply_df_filters(alpha_datalist, sparsity=s)
    
    # High noise, normal n/p ratio
    case1 = apply_df_filters(adl, kappa=1, np_ratio=4)

    e_fpr_fnr.append(calc_(case1, {'kappa' : 1, 'np_ratio': 4, 'sparsity': s}))
    
    # Normal noise, low data
    case2 = apply_df_filters(adl, kappa=5, np_ratio=2)
    
    e_fpr_fnr.append(calc_(case2, {'kappa' : 5, 'np_ratio': 2, 'sparsity': s}))
                     
    # ideal
    case3 = apply_df_filters(adl, kappa=10, np_ratio=16)
    
    e_fpr_fnr.append(calc_(case3, {'kappa' : 10, 'np_ratio': 16, 'sparsity': s}))


> <ipython-input-87-8e02f7137db7>(33)calc_()
-> alpha_mask = np.ma.log(alphas)
(Pdb) cadf.shape
(3, 8)
(Pdb) np.unique(df_['cov_idx'].values).size
80
(Pdb) alphas.size
79
(Pdb) df_['cov_idx'].values
array([54, 54, 54, ..., 57, 57, 57])
(Pdb) np.unique(df_['cov_idx'].values)
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79])
--KeyboardInterrupt--
(Pdb) quit()


BdbQuit: 

In [83]:
len(e_fpr_fnr)

45

In [84]:
len(e_fpr_fnr[0])

18

In [85]:
45 * 17

765

In [86]:
45 ( 18})

SyntaxError: invalid syntax (<ipython-input-86-4250ac754c83>, line 1)