In [9]:
from scipy import stats
import numpy as np
from scipy import optimize

import pandas as pd
import os.path

In [2]:
def tilde_theta_norm_upper_bound(Y, alpha=0.05):
    """Solve for the noncentrality at which the centered sum of squares of Y
    sits at the ``alpha`` quantile of a noncentral chi-square distribution.

    Args:
        Y: 1-D numpy array of observations.
        alpha: target value of the noncentral chi-square CDF.

    Returns:
        The noncentrality ``lmbda`` with
        ``ncx2(len(Y)-1, lmbda).cdf(sum((Y-mean(Y))**2)) == alpha`` (found by
        ``optimize.newton`` without a derivative), or ``1e-5`` when the
        central chi-square CDF at that statistic is already below ``alpha``.
    """
    centered = Y - np.mean(Y)
    observed_ss = sum(centered**2)
    dof = Y.shape[0] - 1

    # Already below alpha under the central chi-square: skip the root search
    # and return a small positive placeholder.
    if stats.chi2.cdf(observed_ss, dof) < alpha:
        return 1e-5

    def cdf_gap(lmbda):
        return stats.ncx2.cdf(observed_ss, dof, lmbda) - alpha

    # Start from the moment-based guess (statistic minus degrees of freedom),
    # kept strictly positive.
    start = max(1e-4, observed_ss - dof)
    return optimize.newton(func=cdf_gap, x0=start, tol=1e-4)

def b_bayes(Y, alpha, tau):
    """Evaluate the bound b(Y, alpha) and the norm bound it is built from.

    The norm bound is ``tilde_theta_norm_upper_bound(Y, (1-alpha)/2)``; it is
    then plugged into ``W_lower_bound_with_norm`` at the same ``alpha``. The
    arithmetic used to be duplicated here line for line; delegating keeps the
    two functions from drifting apart and yields numerically identical values.

    Args:
        Y: 1-D numpy array of observations.
        alpha: level in (0, 1).
        tau: prior scale; enters only through ``1/(1+tau**2)`` inside
            ``W_lower_bound_with_norm``.

    Returns:
        Tuple ``(bya, upper)``: the bound value and the norm bound used.
    """
    upper = tilde_theta_norm_upper_bound(Y, (1-alpha)/2)
    bya = W_lower_bound_with_norm(Y, upper, tau, alpha=alpha)
    return bya, upper

def c_value(Y, tau):
    """Locate the level at which the bound from ``b_bayes`` crosses zero.

    Args:
        Y: 1-D numpy array of observations.
        tau: prior scale forwarded to ``b_bayes``.

    Returns:
        ``1e-10`` if the bound is already negative at level ``1e-10``;
        ``1.`` if the bound is still positive at level ``1 - 1e-10``;
        otherwise the root found by bisection on ``[1e-10, 1 - 1e-10]``
        (``xtol=1e-3``).
    """
    eps = 1e-10

    def bound_at(level):
        return b_bayes(Y, level, tau)[0]

    # Handle the two cases with no sign change before bisecting.
    if bound_at(eps) < 0:
        return eps
    if bound_at(1. - eps) > 0:
        return 1.

    return optimize.bisect(f=bound_at, a=eps, b=1. - eps, xtol=1e-3)

def where_bya_breaks(win, Y, tau):
    """Locate the level at which the bound from ``b_bayes`` equals ``win``.

    Args:
        win: observed value the bound is compared against.
        Y: 1-D numpy array of observations.
        tau: prior scale forwarded to ``b_bayes``.

    Returns:
        ``1e-10`` if ``win`` exceeds the bound at level ``1e-10``;
        ``1 - 1e-10`` if ``win`` is below the bound at level ``1 - 1e-10``;
        otherwise the bisection root of ``bound(level) - win`` on
        ``[1e-10, 1 - 1e-10]`` (``xtol=1e-3``).
    """
    eps = 1e-10
    lo, hi = eps, 1. - eps

    def bound_at(level):
        return b_bayes(Y, level, tau)[0]

    # No crossing inside the bracket: report the relevant endpoint.
    if win > bound_at(lo):
        return lo
    if win < bound_at(hi):
        return hi

    return optimize.bisect(
        f=lambda level: bound_at(level) - win,
        a=lo, b=hi, xtol=1e-3)
    
def W_lower_bound_with_norm(Y, tilde_theta_norm, tau, alpha=0.05):
    """Evaluate the bound for a given value of ``tilde_theta_norm``.

    With ``g = 1/(1+tau**2)`` and ``S = sum((Y-mean(Y))**2)`` the result is

        -((g/2)*tilde_theta_norm + g**2 * S) + 2*g*q

    where ``q`` is the ``(1-alpha)/2`` quantile of a noncentral chi-square
    with ``len(Y)-1`` degrees of freedom and noncentrality
    ``tilde_theta_norm/4``.

    Args:
        Y: 1-D numpy array of observations.
        tilde_theta_norm: value plugged in for the norm term.
        tau: prior scale.
        alpha: level in (0, 1).

    Returns:
        The bound as a float.
    """
    shrink = 1/(1+tau**2)
    resid_ss = sum((Y-np.mean(Y))**2)

    quantile = stats.ncx2(
        len(Y)-1,
        (1./4.)*tilde_theta_norm).ppf((1-alpha)/2)

    penalty = - ((shrink/2)*tilde_theta_norm + (shrink**2)*resid_ss  )
    return penalty + 2*shrink*quantile

# Look at Risk Profile 

# Results to report for frequentist analysis

## The filename should contain:
* $\tau, N, \|P_1^\perp \theta\|$, 'frequentist_analysis'

## For each value of $\|P_1^\perp \theta\|$, save one row per replicate with:
* rep, $\|P_1^\perp Y\|^2, \|\hat \theta - \theta\|^2, \|\theta^* - \theta\|^2, W, c(y)$, by_break


## Processed data (a function only of the earlier files), one file for each $\alpha$
* The filename should contain $\tau, N, \alpha$, 'frequentist_analysis_summary'
* Columns: $\|P_1^\perp \theta\|$, P_choose_bayes, P_choose_bayes_SE, Bayes_risk, Bayes_risk_SE, combined_risk, combined_risk_SE

In [4]:
def combined_bayes_estimator(N, theta_norm, tau, n_reps=50, rng=None):
    """Simulate replicates from the model with a fixed theta and record the
    per-replicate quantities used in the frequentist analysis.

    A single ``theta`` is drawn and rescaled so that the Euclidean norm of
    ``theta - mean(theta)`` equals ``theta_norm``. Each replicate then draws
    ``Y = theta + standard normal noise`` and evaluates:

    * the squared norm of the centered ``Y``;
    * the squared error of ``Y`` itself (``mle_err``);
    * the squared error of ``Y`` shrunk toward its mean by the factor
      ``1/(1+tau**2)`` (``bayes_err``);
    * ``win = mle_err - bayes_err``;
    * ``c_value(Y, tau)`` and ``where_bya_breaks(win, Y, tau)``.

    Args:
        N: dimension of theta; must be at least 2.
        theta_norm: target norm of the centered theta.
        tau: prior scale.
        n_reps: number of simulated replicates.
        rng: optional object exposing ``normal(size=...)``, e.g.
            ``np.random.default_rng(seed)`` or ``np.random.RandomState(seed)``.
            Defaults to NumPy's global generator, as before.

    Returns:
        Six lists of length ``n_reps``:
        ``Py_norms, mle_errs, bayes_errs, wins, c_values, by_breaks``.

    Raises:
        ValueError: if ``N < 2`` (the centered theta would have zero norm and
            the rescaling would divide by zero).
    """
    if N < 2:
        raise ValueError("N must be at least 2, got %r" % (N,))

    # Fall back to the global NumPy RNG so existing callers (and any global
    # np.random.seed they rely on) behave exactly as before.
    rng = np.random if rng is None else rng
    shrinkage = 1/(1+tau**2)  # loop-invariant

    Py_norms, c_values, wins, mle_errs, bayes_errs, by_breaks = [], [], [], [], [], []

    theta = rng.normal(size=N)
    theta *= theta_norm/np.linalg.norm(theta-np.mean(theta))
    for _ in range(n_reps):
        Y = theta + rng.normal(size=theta.shape)
        tilde_Y_norm = np.sum((Y-np.mean(Y))**2)

        bayes = Y + shrinkage*(np.mean(Y)-Y)
        mle_err, bayes_err = np.sum((Y-theta)**2), np.sum((theta-bayes)**2)
        win, c_val = mle_err - bayes_err, c_value(Y, tau)

        # compute alpha below which bound fails to hold
        by_break = where_bya_breaks(win, Y, tau)

        Py_norms.append(tilde_Y_norm)
        mle_errs.append(mle_err)
        bayes_errs.append(bayes_err)
        wins.append(win)
        c_values.append(c_val)
        by_breaks.append(by_break)
    return Py_norms, mle_errs, bayes_errs, wins, c_values, by_breaks

In [20]:
# Run simulations for different values of \| P_1^\perp \theta\|
from multiprocessing import Pool

base_fn = "../results/section3_simulation_study/"
def run_simulation(tau, N, theta_norm, n_reps):
    """Run one simulation setting and write its per-replicate results to a TSV.

    The output path is ``base_fn`` plus a name encoding ``tau``, ``N`` and
    ``theta_norm``. One header row is written, followed by one row per
    replicate with columns
    ``Rep, P_Y_norm, MLE_Err, Bayes_Err, Win, c_value, by_break``.

    Args:
        tau: prior scale passed to ``combined_bayes_estimator``.
        N: dimension passed to ``combined_bayes_estimator``.
        theta_norm: centered norm of theta for this setting.
        n_reps: number of replicates to simulate.

    Raises:
        AssertionError: if the output file already exists.
    """
    fn_out = base_fn + "tau=%0.02f_N=%03d_P_theta_norm=%0.02f_frequentist_analysis.tsv"%(tau, N, theta_norm)

    # Check the destination BEFORE simulating: a clash with an existing file or
    # a missing results directory should not cost a full simulation run.
    assert not os.path.exists(fn_out)
    os.makedirs(os.path.dirname(fn_out), exist_ok=True)

    Py_norms, mle_errs, bayes_errs, wins, c_values, by_breaks = combined_bayes_estimator(
        N, theta_norm, tau, n_reps=n_reps)

    # Write out results of simulation
    with open(fn_out, 'w') as f:
        f.write("\t".join(["Rep", "P_Y_norm", "MLE_Err", "Bayes_Err", "Win", "c_value", "by_break"]) + "\n")
        for rep, rep_vals in enumerate(zip(Py_norms, mle_errs, bayes_errs, wins, c_values, by_breaks)):
            line = "%04d\t"%rep + "\t".join(["%0.05f"%val for val in rep_vals])+"\n"
            f.write(line)
    print("complete : ", theta_norm)

N, tau, n_reps = 2, 1., 5000
theta_norm_min, theta_norm_max = 0, 1.5*np.sqrt(2*tau+2)*np.sqrt(N-1)
theta_norms = list(np.arange(theta_norm_min,theta_norm_max,(theta_norm_max-theta_norm_min)/20))

# NOTE: this overrides the 20-point grid above with a single value of the
# norm; comment this line out to sweep the full grid.
theta_norms = [np.sqrt((2*tau**2 + 1)*(N-1))]
print(theta_norms)

n_workers = 4
def f(theta_norm):
    """Pool worker: run one simulation setting using the cell-level tau, N, n_reps."""
    run_simulation(tau, N, theta_norm, n_reps)

# Forked workers start with a copy of the parent's global NumPy RNG state, so
# without reseeding every worker would draw the same noise for different
# theta_norms. Calling np.random.seed() with no argument in each worker pulls
# fresh entropy from the OS.
with Pool(n_workers, initializer=np.random.seed) as p:
    p.map(f, theta_norms)

[1.7320508075688772]
complete :  1.7320508075688772


# Summarize results for combined estimator at different alphas

In [15]:
def frequentist_summary(mle_errs, bayes_errs, c_values, alpha):
    """Summarize the combined estimator over a set of replicates at one alpha.

    The combined estimator incurs the Bayes error on replicates where
    ``c_value >= alpha`` and the MLE error otherwise.

    Args:
        mle_errs: per-replicate errors of the MLE (any 1-D array-like:
            list, numpy array, or pandas Series).
        bayes_errs: per-replicate errors of the Bayes estimate, same length.
        c_values: per-replicate c-values, same length.
        alpha: threshold on the c-value.

    Returns:
        Tuple ``(P_choose_bayes, P_choose_bayes_SE, Bayes_risk, Bayes_risk_SE,
        combined_risk, combined_risk_SE)``. Each ``*_SE`` is a standard
        deviation (``np.std`` default, or the binomial formula for the
        proportion) divided by ``sqrt(n_reps)``.

    Raises:
        ValueError: if the three inputs do not have the same shape.
    """
    # Coerce to float arrays so plain Python lists work too; a bare list
    # would fail on the elementwise comparison and arithmetic below.
    mle_errs = np.asarray(mle_errs, dtype=float)
    bayes_errs = np.asarray(bayes_errs, dtype=float)
    c_values = np.asarray(c_values, dtype=float)
    if not (mle_errs.shape == bayes_errs.shape == c_values.shape):
        raise ValueError(
            "mle_errs, bayes_errs and c_values must have the same shape, got %s, %s, %s"
            % (mle_errs.shape, bayes_errs.shape, c_values.shape))

    n_reps = len(mle_errs)
    root_n = np.sqrt(n_reps)
    chose_bayes = c_values >= alpha

    # estimate frequency at which combined estimator evaluates to the Bayes estimate.
    P_choose_bayes = np.mean(chose_bayes)
    P_choose_bayes_SE = np.sqrt(P_choose_bayes*(1-P_choose_bayes))/root_n

    # estimate risk of bayes and combined estimators
    Bayes_risk = np.mean(bayes_errs)
    Bayes_risk_SE = np.std(bayes_errs)/root_n

    # Select per replicate instead of multiplying by 0/1 masks, so a
    # non-finite error in the estimator that was NOT chosen cannot leak in.
    combined_errs = np.where(chose_bayes, bayes_errs, mle_errs)
    combined_risk = np.mean(combined_errs)
    combined_risk_SE = np.std(combined_errs)/root_n

    return P_choose_bayes, P_choose_bayes_SE, Bayes_risk, Bayes_risk_SE, combined_risk, combined_risk_SE

In [18]:
# Write out results of simulation
alphas = np.array([0.1,0.5, 0.75,0.8, 0.85, 0.9, 0.95, 0.975, 0.99, 0.995])
summary_header = ["P_theta_norm", "P_choose_bayes", "P_choose_bayes_SE", "Bayes_risk", "Bayes_risk_SE",
                  "combined_risk", "combined_risk_SE"]

# Load every per-theta_norm replicate file once, up front. The inputs do not
# depend on alpha, and reading them before any output file is created means a
# missing input cannot leave a half-written summary behind (which would then
# trip the existence assert on the next run).
replicates_by_theta_norm = []
for theta_norm in theta_norms:
    fn_in = base_fn+"tau=%0.02f_N=%03d_P_theta_norm=%0.02f_frequentist_analysis.tsv"%(tau, N, theta_norm)
    df = pd.read_csv(fn_in, sep="\t")
    replicates_by_theta_norm.append((theta_norm, df.MLE_Err, df.Bayes_Err, df.c_value))

for alpha in alphas:
    fn_out = base_fn + "tau=%0.02f_N=%03d_alpha=%0.03f_frequentist_analysis_summary.tsv"%(tau, N, alpha)
    assert not os.path.exists(fn_out)

    # Build all rows before opening the output file.
    rows = []
    for theta_norm, mle_errs, bayes_errs, c_values in replicates_by_theta_norm:
        summary_stats = list(frequentist_summary(mle_errs, bayes_errs, c_values, alpha))
        # theta_norm keeps the two-decimal form used in the filenames; the
        # statistics use five decimals (as in the per-replicate files) because
        # two decimals round the standard errors to 0.00/0.01.
        rows.append("\t".join(["%0.02f"%theta_norm] + ["%0.05f"%v for v in summary_stats]) + "\n")

    # The handle is named f_out so it does not clobber the Pool worker `f`.
    with open(fn_out, 'w') as f_out:
        f_out.write("\t".join(summary_header)+"\n")
        f_out.writelines(rows)