In [184]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma, bernoulli
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from tqdm.notebook import tqdm
from codebase.plot import * 
from codebase.data import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [185]:
# Directory that holds the saved objects of the run analysed in this notebook.
log_dir = "./log/binary_data_simulation/kfold1/20200224_144552_kfoldfexp_again_sim1_m2//"
# Guarantee a trailing separator, because file names are appended to log_dir directly.
if not log_dir.endswith("/"):
    print("\n\nAppending `/`-character at the end of directory")
    log_dir += "/"
data = load_obj("data", log_dir)

In [186]:
print("\n\nChecking data integrity...\n\n")
complete_data = load_obj("complete_data", log_dir)

# Posterior samples stored as 'ps_0', 'ps_1', 'ps_2', keyed here by 0, 1, 2.
model_posterior_samples = {
    index: load_obj(name, log_dir)
    for index, name in enumerate(('ps_0', 'ps_1', 'ps_2'))
}




Checking data integrity...




In [187]:
# Display the dictionary that was loaded from log_dir under the name "data".
data

{'random_seed': 0,
 'N': 1000,
 'K': 2,
 'J': 6,
 'alpha': array([0., 0., 0., 0., 0., 0.]),
 'beta': array([[1. , 0. ],
        [0.8, 0. ],
        [0.8, 0. ],
        [0. , 1. ],
        [0. , 0.8],
        [0. , 0.8]]),
 'sigma_z': array([1., 1.]),
 'Phi_corr': array([[1. , 0.2],
        [0.2, 1. ]]),
 'Phi_cov': array([[1. , 0.2],
        [0.2, 1. ]]),
 'z': array([[-1.61951071, -1.11334743],
        [-2.17539248,  0.65913812],
        [-0.8285194 , -2.064689  ],
        ...,
        [-0.21506008, -0.091414  ],
        [-1.18581773, -0.98541301],
        [ 1.7136435 ,  0.05538257]]),
 'y': array([[-4.71120685, -0.30514589,  0.10180765,  2.30242585, -0.03971164,
          0.03342802],
        [-2.95078751, -0.91775037, -2.69966356,  1.06164041,  2.70867847,
          1.67174146],
        [ 0.29454048, -0.14724192, -1.68185281, -2.17771467, -3.31034468,
         -2.40248024],
        ...,
        [ 0.10786283, -0.88279962,  3.37834395,  2.26769607, -0.33595897,
          1.72253444],


In [188]:
# List the quantities stored in the posterior samples held under key 0.
model_posterior_samples[0].keys()

dict_keys(['alpha', 'yy', 'beta', 'Marg_cov', 'Omega_cov', 'Phi_cov'])

In [189]:
# The number of draws is the leading dimension of the stored 'alpha' samples.
# NOTE(review): warm-up is set to the same count as the retained draws —
# confirm this matches how the sampler was actually run.
num_chains = 1
num_samples = num_warmup = model_posterior_samples[0]['alpha'].shape[0]
num_iter = num_samples + num_warmup

In [190]:
# Flip signs, draw by draw, so that beta[0, 0] and beta[3, 1] are non-negative:
# rows 0-2 of column 0 are multiplied by the sign of beta[0, 0], rows 3+ of
# column 1 by the sign of beta[3, 1], and the off-diagonal entries of Phi_cov
# by the product of the two signs. The arrays are modified in place.
# NOTE(review): the remaining entries of each beta column are not flipped —
# confirm that this is intended.
for j in range(3):
    ps = model_posterior_samples[j]
    beta_draws, phi_draws = ps['beta'], ps['Phi_cov']

    # Signs are taken before any entry is modified (np.sign returns new arrays).
    sign1 = np.sign(beta_draws[:num_samples, 0, 0])
    sign2 = np.sign(beta_draws[:num_samples, 3, 1])

    beta_draws[:num_samples, :3, 0] *= sign1[:, None]
    beta_draws[:num_samples, 3:, 1] *= sign2[:, None]

    phi_draws[:num_samples, 0, 1] *= sign1 * sign2
    # Mirror the upper off-diagonal entry into the lower one.
    phi_draws[:num_samples, 1, 0] = phi_draws[:num_samples, 0, 1]
    model_posterior_samples[j] = ps

In [191]:
# Mean of 'beta' over the draws stored under key 0, rounded to two decimals.
np.round(model_posterior_samples[0]['beta'].mean(axis=0), 2)

array([[ 0.79, -0.  ],
       [ 0.82,  0.01],
       [ 1.34, -0.  ],
       [ 0.  ,  1.29],
       [ 0.02,  1.28],
       [-0.01,  2.02]])

In [192]:
# Print the model source saved alongside the run; the context manager makes
# sure the file handle is closed once its contents have been read.
with open(log_dir + 'model.txt') as model_file:
    print(model_file.read())

data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  int<lower=0, upper=1> DD[N, J];
}

transformed data{
  cov_matrix[J] I_J = diag_matrix(rep_vector(1, J));
}

parameters {
  vector[J] alpha;
  matrix[3,K] beta_free; // 3 free eleements per factor
  matrix[J-3,K] beta_zeros; // 3 zero elements per factor
  matrix[N,J] yy;
  cov_matrix[J] Omega_cov;
  cholesky_factor_corr[K] L_Phi;

}

transformed parameters{
  matrix[J,K] beta;
  cov_matrix[J] Marg_cov;
  corr_matrix[K] Phi_cov;
  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }
  // set the free elements
  for (k in 1:K) beta[1+3*(k-1) : 3+3*(k-1), k] = beta_free[1:3,k];
  // set the zero elements
  beta[4:J, 1] = beta_zeros[1:(J-3), 1];
  beta[1:(J-3), K] = beta_zeros[1:(J-3), K];
  
  Phi_cov = multiply_lower_tri_self_transpose(L_Phi);
  Marg_cov = beta * Phi_cov * beta'+ Omega_cov;
}

model {
  to_vector(beta_free) ~ normal(0, 1);
  to_vector(beta_zeros) ~ normal(0, 0.1);
  to_vector(alpha) ~ normal(0, 10);
  L_Phi

In [193]:
# %%opts Layout [fig_size=200]
# fold_index = 0
# test = 'test'
# plots = []
# for j in range(data['J']):
#     for k in range(data['K']):
#         plots.append(plot_trace(model_posterior_samples[fold_index]['beta'][:,j,k],
#                                 true_value=data['beta'][j,k],
#                                 title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
#                      options(fig_inches=8, aspect=3))
# layout = hv.Layout(plots).options(show_title = True,
#                                   vspace = .3,
#                                   absolute_scaling=False,
#                                   normalize=False) # use same y-range for all plots?

# layout.cols(2)


In [194]:
# def get_prob_pred(data, ps, m):
#     N = data['N']
#     pistr = np.empty((N, data['J']))
    
#     if 'zz' in ps.keys():
#         z_mc = multivariate_normal.rvs(np.zeros(data['K']),
#             ps['Phi_cov'][m], size = N)
#         ystr = ps['alpha'][m] + z_mc @ ps['beta'][m].T
#     elif 'Marg_cov' in ps.keys():
#         ystr = multivariate_normal.rvs(ps['alpha'][m],
#             ps['Marg_cov'][m], size = N)

#     # logit
#     pistr = expit(ystr)

#     # probit
#     # pistr = norm.cdf(ystr)
#     return pistr


def get_prob_pred(data, ps, m, L=100):
    """Simulate latent responses from posterior draw ``m`` and map them through ``expit``.

    Parameters
    ----------
    data : dict
        Only ``data['K']`` is read, and only when ``ps`` contains ``'zz'``.
    ps : dict
        Posterior samples. Must contain ``'alpha'`` and either ``'zz'``
        (in which case ``'Phi_cov'`` and ``'beta'`` are used) or ``'Marg_cov'``.
    m : int
        Index of the posterior draw to use.
    L : int, optional
        Number of Monte Carlo draws to simulate. Defaults to 100, the value
        that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        ``expit`` of the simulated values, one row per Monte Carlo draw.

    Raises
    ------
    KeyError
        If ``ps`` contains neither ``'zz'`` nor ``'Marg_cov'``.
    """
    if 'zz' in ps:
        # NOTE(review): check_posdef lives outside this notebook; a return
        # value of 0 is treated as "covariance is usable" — confirm.
        assert check_posdef(ps['Phi_cov'][m])==0
        z_mc = multivariate_normal.rvs(np.zeros(data['K']),
            ps['Phi_cov'][m], size = L)
        # rvs squeezes singleton dimensions; restore one row per draw so the
        # matrix product below also works for L == 1 or a single factor.
        z_mc = np.reshape(z_mc, (L, -1))
        ystr = ps['alpha'][m] + z_mc @ ps['beta'][m].T
    elif 'Marg_cov' in ps:
        assert check_posdef(ps['Marg_cov'][m])==0
        ystr = multivariate_normal.rvs(ps['alpha'][m],
            ps['Marg_cov'][m], size = L)
        # Keep the result two-dimensional even when rvs squeezes it.
        ystr = np.reshape(ystr, (L, -1))
    else:
        # Previously this fell through and failed on an unbound local below.
        raise KeyError(
            "posterior samples must contain either 'zz' or 'Marg_cov'")

    # logit
    pistr = expit(ystr)

    # probit
    # pistr = norm.cdf(ystr)
    return pistr




# def get_prob_pred(data, ps, m):
#     N = data['N']
#     if 'zz' in ps.keys():
#         ystr = ps['alpha'][m] + ps['zz'][m] @ ps['beta'][m].T
#     elif 'Marg_cov' in ps.keys():
#         ystr = ps['yy'][m]

#     # logit
#     pistr = expit(ystr)

#     # probit
#     # pistr = norm.cdf(ystr)
#     return pistr
        
  
def get_avg_probs(data, ps, m):
    """Average the output of ``get_prob_pred(data, ps, m)`` along its first axis."""
    return np.mean(get_prob_pred(data, ps, m), axis=0)
        
#     lgscr = -data['DD']*np.log(piavg) - (1-data['DD'])*np.log(1-piavg)

#     return np.sum(lgscr)

        
# def get_avg_probs(data, ps, m, L = 10):
#     avg_prob = np.empty((L, data['N'], data['J']))
#     for l in range(L):
#         avg_prob[l] = get_prob_pred(data, ps, m)
#     return np.mean(avg_prob, 0)
        
# #     lgscr = -data['DD']*np.log(piavg) - (1-data['DD'])*np.log(1-piavg)

# #     return np.sum(lgscr)



def get_lgscr(data, ps):
    """Average, over posterior draws, the summed Bernoulli negative log-likelihood.

    For every posterior draw the averaged probabilities from
    ``get_avg_probs`` are scored against ``data['DD']`` with
    ``-DD*log(p) - (1-DD)*log(1-p)`` and summed over all entries.

    Parameters
    ----------
    data : dict
        Must contain ``'DD'``; it is also passed on to ``get_avg_probs``.
    ps : dict
        Posterior samples; the number of draws is ``ps['alpha'].shape[0]``.

    Returns
    -------
    float
        Mean of the per-draw summed scores.
    """
    nsim_N = ps['alpha'].shape[0]
    # A probability of exactly 0 or 1 would give 0 * log(0) = NaN and poison
    # the whole score, so keep probabilities strictly inside (0, 1).
    eps = np.finfo(float).eps
    lgscrs = np.empty(nsim_N)
    for m_ind in tqdm(range(nsim_N)):
        piavg = np.clip(get_avg_probs(data, ps, m=m_ind), eps, 1 - eps)
        lgscr =-data['DD']*np.log(piavg) - (1-data['DD'])*np.log(1-piavg)
        lgscrs[m_ind] = np.sum(lgscr)

    return np.sum(lgscrs)/nsim_N


In [195]:

print("\n\nLoading files...\n\n")


print("\n\nChecking data integrity...\n\n")

mcmc_length = model_posterior_samples[0]['alpha'].shape[0]

# get_lgscr simulates through scipy's multivariate_normal.rvs without a
# random_state, i.e. from NumPy's global random state. Seed it with the seed
# stored in `data` so that re-running this cell reproduces the same scores.
np.random.seed(data['random_seed'])

# One score per fold: the test split of fold k is scored with the posterior
# samples stored under key k.
Ds = np.empty(3)
for fold_index in range(3):
    Ds[fold_index] = get_lgscr(complete_data[fold_index]['test'], model_posterior_samples[fold_index])

Ds



Loading files...




Checking data integrity...




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




array([1393.42471213, 1391.14359152, 1389.95566611])

In [196]:
# Average of the per-fold scores in Ds, rounded to two decimals.
np.round(Ds.mean(), 2)

1391.51

In [197]:
# m1 — tag kept from the original cell (presumably a model label; confirm).
# Average of the per-fold scores in Ds, rounded to two decimals.
np.round(Ds.mean(), 2)

1391.51