In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# %load ../../loaders/imports.py
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import time
import pdb

# Add the uoicorr directory to the path
sys.path.append('../../../uoicorr_run')

# Add the root directory of this repository
sys.path.append('../..')

from postprocess_utils import *
import pandas as pd
import sqlalchemy

In [4]:
from utils import gen_data, gen_covariance, sparsify_beta, gen_beta2

In [17]:
import h5py, itertools

In [8]:
root_dir = '/media/akumar/Data/nse'

In [9]:
# Read the non-concatenated dataframes to ensure indices are properly preserved
lasso = pd.read_pickle('%s/finalfinal/lasso_df.dat' % root_dir)
mcp = pd.read_pickle('%s/finalfinal/mcp_df.dat' % root_dir)
scad = pd.read_pickle('%s/finalfinal/scad_df.dat' % root_dir)
en = pd.read_pickle('%s/finalfinal/en_df.dat' % root_dir)

In [10]:
# remove the parasitic index field
lasso = lasso.drop('index', axis=1)
mcp = mcp.drop('index', axis=1)
scad = scad.drop('index', axis=1)
en = en.drop('index', axis=1)

In [11]:
lasso.set_index(np.arange(lasso.shape[0]), inplace=True)
mcp.set_index(np.arange(mcp.shape[0]), inplace=True)
scad.set_index(np.arange(scad.shape[0]), inplace=True)
en.set_index(np.arange(en.shape[0]), inplace=True)

In [18]:
dframes = [lasso, mcp, scad, en]
dframe_names = ['Lasso', 'MCP', 'SCAD', 'EN']
sparsity = np.unique(lasso['sparsity'].values)
betawidth = np.unique(lasso['betawidth'].values)
selection_methods = np.unique(lasso['selection_method'].values)
kappa = 5
np_ratio = 4
cov_idxs = np.arange(80)

beta_fnames = ['%s/finalfinal/%s_pp_beta.h5' % (root_dir, dfname) for dfname in ['lasso', 'mcp', 'scad', 'en']]
beta_files = [h5py.File(beta_fname, 'r') for beta_fname in beta_fnames]

param_combos = list(itertools.product(sparsity, betawidth, selection_methods, cov_idxs))
print(len(param_combos))

bias_variance_list = []

for i, dframe in enumerate(dframes):
    print(i)
    counter = 0
    t0 = time.time()
    for param_combo in param_combos:
        s, bw, sm, cidx = param_combo
        df = apply_df_filters(dframe, sparsity=s, betawidth=bw, 
                                  selection_method=sm, cov_idx=cidx, kappa=kappa, np_ratio=np_ratio)
        if df.shape[0] == 0:
            continue
        else:
            try:
                assert(df.shape[0] == 20)
            except:
                pdb.set_trace()
            bias1, bias2, var = calc_bias_var(df, beta_files[i])
            bias_variance_list.append({'df_name': dframe_names[i], 'sparsity': s, 'betawidth' : bw,
                                      'selection_method' : sm, 'cov_idx': cidx, 'kappa': kappa,
                                      'np_ratio': np_ratio, 'total_bias': bias1, 'common_bias': bias2,
                                      'variance': var})            
            counter += 1
            if counter % 100 == 0:
                print('100 iteration time: %f' % (time.time() - t0))
                t0 = time.time()

28800
0
100 iteration time: 114.669367
> <ipython-input-18-d2159ce71d13>(22)<module>()
-> for param_combo in param_combos:
(Pdb) bias_variance_list[0]
{'df_name': 'Lasso', 'sparsity': 0.020000000000000004, 'betawidth': -1.0, 'selection_method': 'AIC', 'cov_idx': 0, 'kappa': 5, 'np_ratio': 4, 'total_bias': 12.857364, 'common_bias': 9.889462375640868, 'variance': 0.25116473}
(Pdb) bias_variance_list[1]
{'df_name': 'Lasso', 'sparsity': 0.020000000000000004, 'betawidth': -1.0, 'selection_method': 'AIC', 'cov_idx': 1, 'kappa': 5, 'np_ratio': 4, 'total_bias': 12.683029, 'common_bias': 9.580389595031738, 'variance': 0.25096348}
(Pdb) bias_variance[5]
*** NameError: name 'bias_variance' is not defined
(Pdb) bias_variance_list[85]
{'df_name': 'Lasso', 'sparsity': 0.020000000000000004, 'betawidth': -1.0, 'selection_method': 'AIC_ols', 'cov_idx': 44, 'kappa': 5, 'np_ratio': 4, 'total_bias': 23.276443, 'common_bias': 18.778165531158447, 'variance': 0.5550497}
(Pdb) quit()


BdbQuit: 

In [12]:
def calc_bias_var(df, beta_file):
    
    indices = list(df.index)
    # Take the indices
    beta = beta_file['beta'][indices, :]
    # Ensure all the betas are the same
    assert(np.isclose(beta, beta[0]).all())
    
    beta_hats = beta_file['beta_hat'][indices, :]
    
    # Total bias
    total_bias = np.mean(np.linalg.norm(beta - beta_hats, axis=1))
    
    common_support_bias = 0
    
    # Common support bias
    for i in range(len(indices)):
        common_support = list(set(np.nonzero(beta[i, :])[0]).intersection(set(np.nonzero(beta_hats[i, :])[0])))
        common_support_bias += 1/len(indices) * np.linalg.norm(beta[i, common_support] - beta_hats[i, common_support])
    variance = np.mean(np.var(beta_hats, axis = 0))

    return total_bias, common_support_bias, variance