In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# %load ../../loaders/imports.py
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import time
import pdb

# Add the uoicorr directory to the path
sys.path.append('../../../uoicorr_run')

# Add the root directory of this repository
sys.path.append('../..')

from postprocess_utils import *
import pandas as pd
import sqlalchemy

In [3]:
from utils import gen_data, gen_covariance, sparsify_beta, gen_beta2

In [4]:
root_dir = '/mnt/sdb1'

In [5]:
# Read the non-concatenated dataframes to ensure indices are properly preserved
lasso = pd.read_pickle('%s/finalfinal/lasso_df.dat' % root_dir)
mcp = pd.read_pickle('%s/finalfinal/mcp_df.dat' % root_dir)
scad = pd.read_pickle('%s/finalfinal/scad_df.dat' % root_dir)
en = pd.read_pickle('%s/finalfinal/en_df.dat' % root_dir)

In [6]:
# remove the parasitic index field
lasso = lasso.drop('index', axis=1)
mcp = mcp.drop('index', axis=1)
scad = scad.drop('index', axis=1)
en = en.drop('index', axis=1)

In [27]:
lasso.set_index(np.arange(lasso.shape[0]), inplace=True)

In [30]:
idx1 = list(lasso.loc[lasso['cov_idx'] == 0].index)

In [5]:
import h5py

In [6]:
# Iterate through each dataframe and isolate the sets of reps and calculate the bias and variance of estimates
# 1) along the entire vector, 2) along the properly selected coefficients

In [7]:
import itertools

In [23]:
param_combos[0:10]

[(0.020000000000000004, -1.0, 'AIC', 0),
 (0.020000000000000004, -1.0, 'AIC', 1),
 (0.020000000000000004, -1.0, 'AIC', 2),
 (0.020000000000000004, -1.0, 'AIC', 3),
 (0.020000000000000004, -1.0, 'AIC', 4),
 (0.020000000000000004, -1.0, 'AIC', 5),
 (0.020000000000000004, -1.0, 'AIC', 6),
 (0.020000000000000004, -1.0, 'AIC', 7),
 (0.020000000000000004, -1.0, 'AIC', 8),
 (0.020000000000000004, -1.0, 'AIC', 9)]

In [47]:
dframes = [lasso, mcp, scad, en]
dframe_names = ['Lasso', 'MCP', 'SCAD', 'EN']
sparsity = np.unique(lasso['sparsity'].values)
betawidth = np.unique(lasso['betawidth'].values)
selection_methods = np.unique(lasso['selection_method'].values)
kappa = 5
np_ratio = 4
cov_idxs = np.arange(80)

beta_fnames = ['%s/finalfinal/%s_pp_beta.h5' % (root_dir, dfname) for dfname in ['lasso', 'mcp', 'scad', 'en']]
beta_files = [h5py.File(beta_fname, 'r') for beta_fname in beta_fnames]

param_combos = list(itertools.product(sparsity, betawidth, selection_methods, cov_idxs))
print(len(param_combos))

bias_variance_list = []

for i, dframe in enumerate(dframes):
    print(i)
    counter = 0
    t0 = time.time()
    for param_combo in param_combos:
        s, bw, sm, cidx = param_combo
        df = apply_df_filters(dframe, sparsity=s, betawidth=bw, 
                                  selection_method=sm, cov_idx=cidx, kappa=kappa, np_ratio=np_ratio)
        if df.shape[0] == 0:
            continue
        else:
            try:
                assert(df.shape[0] == 20)
            except:
                pdb.set_trace()
            pdb.set_trace()
            bias1, bias2, var = calc_bias_var(df, beta_files[i])
            bias_variance_list.append({'df_name': dframe_names[i], 'sparsity': s, 'betawidth' : bw,
                                      'selection_method' : sm, 'cov_idx': cidx, 'kappa': kappa,
                                      'np_ratio': np_ratio, 'total_bias': bias1, 'common_bias': bias2,
                                      'variance': var})            
            counter += 1
            if counter % 100 == 0:
                print('100 iteration time: %f' % (time.time() - t0))
                t0 = time.time()

28800
0
> <ipython-input-47-b98722c4dbe9>(34)<module>()
-> bias1, bias2, var = calc_bias_var(df, beta_files[i])
(Pdb) df.index
Int64Index([  808,  1960,  3112,  4264,  5416,  6568,  7720,  8872, 10024,
            11176, 12328, 13480, 14632, 15784, 16936, 18088, 19240, 20392,
            21544, 22696],
           dtype='int64')
(Pdb) continue
> <ipython-input-47-b98722c4dbe9>(33)<module>()
-> pdb.set_trace()
(Pdb) df.index
Int64Index([  808,  1960,  3112,  4264,  5416,  6568,  7720,  8872, 10024,
            11176, 12328, 13480, 14632, 15784, 16936, 18088, 19240, 20392,
            21544, 22696],
           dtype='int64')
(Pdb) param_combo
(0.020000000000000004, -1.0, 'AIC', 1)
(Pdb) quit()


BdbQuit: 

In [45]:
def calc_bias_var(df, beta_file):
    
    indices = list(df.index)
    # Take the indices
    beta = beta_file['beta'][indices, :]
    # Ensure all the betas are the same
    assert(np.isclose(beta, beta[0]).all())
    
    beta_hats = beta_file['beta_hat'][indices, :]
    
    # Total bias
    total_bias = np.mean(np.linalg.norm(beta - beta_hats, axis=1))
    
    common_support_bias = 0
    
    # Common support bias
    for i in range(len(indices)):
        common_support = list(set(np.nonzero(beta[i, :])[0]).intersection(set(np.nonzero(beta_hats[i, :])[0])))
        common_support_bias += 1/len(indices) * np.linalg.norm(beta[i, common_support] - beta_hats[i, common_support])
    variance = np.mean(np.var(beta_hats, axis = 0))

    return total_bias, common_support_bias, variance