In [88]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma, bernoulli
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from tqdm.notebook import tqdm
from codebase.plot import * 
from codebase.data import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
# Directory holding the saved objects of one k-fold experiment run.
log_dir = "./log/binary_data_simulation/kfold2/20200223_030706_kfoldfexp_sim2_m2///"
# Guarantee a trailing separator: the path is later joined by plain string
# concatenation (e.g. log_dir + 'model.txt').
if log_dir[-1] != "/":
    print("\n\nAppending `/`-character at the end of directory")
    log_dir = log_dir+ "/"
# NOTE(review): in the visible cells `data` is only displayed; the evaluation
# further down reads `complete_data` instead.
data = load_obj("data", log_dir)

In [90]:
print("\n\nChecking data integrity...\n\n")
# NOTE(review): despite the message above, this cell performs no integrity
# check -- it only loads saved objects.
complete_data = load_obj("complete_data", log_dir)

# Posterior samples keyed by fold index (0, 1, 2), one saved object per fold.
model_posterior_samples = dict()
model_posterior_samples[0] = load_obj('ps_0', log_dir)
model_posterior_samples[1] = load_obj('ps_1', log_dir)
model_posterior_samples[2] = load_obj('ps_2', log_dir)




Checking data integrity...




In [91]:
# Display the loaded simulation dictionary (true parameters and generated arrays).
data

{'random_seed': 0,
 'N': 1000,
 'K': 2,
 'J': 6,
 'alpha': array([0., 0., 0., 0., 0., 0.]),
 'beta': array([[1. , 0. ],
        [0.8, 0. ],
        [0.8, 0.8],
        [0. , 1. ],
        [0.8, 0.8],
        [0. , 0.8]]),
 'sigma_z': array([1., 1.]),
 'Phi_corr': array([[1. , 0.2],
        [0.2, 1. ]]),
 'Phi_cov': array([[1. , 0.2],
        [0.2, 1. ]]),
 'z': array([[-1.61951071, -1.11334743],
        [-2.17539248,  0.65913812],
        [-0.8285194 , -2.064689  ],
        ...,
        [-0.21506008, -0.091414  ],
        [-1.18581773, -0.98541301],
        [ 1.7136435 ,  0.05538257]]),
 'y': array([[-4.32504373, -4.38730471, -2.11265848, -2.70652732, -2.31528154,
         -2.03919375],
        [-3.43862813, -2.51570901, -0.06495358,  2.20002122, -0.61698229,
         -1.12253058],
        [ 0.21793111,  0.46024436, -1.87478779, -3.04390179, -1.15752816,
         -0.97153843],
        ...,
        [-2.33686579,  0.15087484, -0.73448664, -2.12297416, -3.04573916,
          1.32768299],


In [92]:
# List the quantities stored in fold 0's posterior samples.
model_posterior_samples[0].keys()

dict_keys(['alpha', 'yy', 'beta', 'Marg_cov', 'Omega_cov', 'Phi_cov'])

In [93]:
num_chains = 1
# Number of stored draws: leading axis of fold 0's `alpha` samples.
num_samples = model_posterior_samples[0]['alpha'].shape[0]
# NOTE(review): warmup is set to the same value as num_samples -- presumably
# the sampler ran equal warmup and sampling lengths; confirm against the run
# configuration.
num_warmup = model_posterior_samples[0]['alpha'].shape[0]
num_iter = num_samples + num_warmup

In [94]:
# Resolve the sign indeterminacy of the two factors.
# (beta, Phi) and (beta @ D, D @ Phi @ D) with D = diag(+/-1) imply the same
# marginal covariance beta * Phi * beta', so every draw is reflected such that
# the anchor loadings beta[0, 0] and beta[3, 1] are non-negative.
for j in range(3):
    ps = model_posterior_samples[j]

    # One +1/-1 per draw. An anchor that is exactly 0 is treated as positive
    # so that it cannot zero out the loadings (np.sign would return 0).
    sign1 = np.where(ps['beta'][:, 0, 0] < 0, -1.0, 1.0)
    sign2 = np.where(ps['beta'][:, 3, 1] < 0, -1.0, 1.0)

    # Reflect the WHOLE column of each factor, cross-loadings included;
    # flipping only part of a column would not yield an equivalent
    # (beta, Phi) pair. All draws of this fold are processed, whatever
    # their number.
    ps['beta'][:, :, 0] = ps['beta'][:, :, 0] * sign1[:, np.newaxis]
    ps['beta'][:, :, 1] = ps['beta'][:, :, 1] * sign2[:, np.newaxis]

    # The factor correlation changes sign when exactly one factor is reflected.
    ps['Phi_cov'][:, 0, 1] = sign1 * sign2 * ps['Phi_cov'][:, 0, 1]
    ps['Phi_cov'][:, 1, 0] = ps['Phi_cov'][:, 0, 1]
    model_posterior_samples[j] = ps

In [95]:
# Posterior mean of the loading matrix for fold 0, averaged over draws and
# rounded to two decimals.
model_posterior_samples[0]['beta'].mean(axis=0).round(2)

array([[ 0.66, -0.03],
       [ 0.41,  0.03],
       [ 1.45, -0.04],
       [ 0.  ,  0.95],
       [-0.04,  1.36],
       [ 0.  ,  0.77]])

In [96]:
# Print the Stan program saved alongside this run; the context manager makes
# sure the file handle is closed again.
with open(log_dir+'model.txt') as model_file:
    print(model_file.read())

data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  int<lower=0, upper=1> DD[N, J];
}

transformed data{
  cov_matrix[J] I_J = diag_matrix(rep_vector(1, J));
}

parameters {
  vector[J] alpha;
  matrix[3,K] beta_free; // 3 free eleements per factor
  matrix[J-3,K] beta_zeros; // 3 zero elements per factor
  matrix[N,J] yy;
  cov_matrix[J] Omega_cov;
  cholesky_factor_corr[K] L_Phi;

}

transformed parameters{
  matrix[J,K] beta;
  cov_matrix[J] Marg_cov;
  corr_matrix[K] Phi_cov;
  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }
  // set the free elements
  for (k in 1:K) beta[1+3*(k-1) : 3+3*(k-1), k] = beta_free[1:3,k];
  // set the zero elements
  beta[4:J, 1] = beta_zeros[1:(J-3), 1];
  beta[1:(J-3), K] = beta_zeros[1:(J-3), K];
  
  Phi_cov = multiply_lower_tri_self_transpose(L_Phi);
  Marg_cov = beta * Phi_cov * beta'+ Omega_cov;
}

model {
  to_vector(beta_free) ~ normal(0, 1);
  to_vector(beta_zeros) ~ normal(0, 0.1);
  to_vector(alpha) ~ normal(0, 10);
  L_Phi

In [97]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal, bernoulli, norm
import datetime
import sys
import os
import itertools

from codebase.file_utils import save_obj, load_obj
from scipy.special import expit
from scipy.special import ndtri

def to_str_pattern(y0):
    """Encode response vectors as pattern strings.

    Parameters
    ----------
    y0 : array-like
        1-D: a single response vector. 2-D: one response vector per row.
        Plain lists are accepted in both cases.

    Returns
    -------
    str or pandas.Series
        For 1-D input, the string forms of the entries concatenated
        (e.g. ``[0, 1, 1] -> '011'``). For 2-D input, a Series holding one
        such string per row.

    Raises
    ------
    ValueError
        If ``y0`` is neither 1- nor 2-dimensional.
    """
    ndim = np.ndim(y0)
    if ndim == 1:
        # np.asarray lets plain lists/tuples through, matching the 2-D branch.
        return ''.join(np.asarray(y0).astype(str))
    if ndim == 2:
        rows = pd.DataFrame(y0)
        return rows.apply(lambda row: ''.join(row.astype(str)), axis=1)
    # Previously fell through and returned None, which only failed later.
    raise ValueError(
        "to_str_pattern expects 1-D or 2-D input, got ndim=%d" % ndim)


def to_nparray_data(yresp):
    """Decode pattern strings back into numeric arrays.

    Parameters
    ----------
    yresp : str or iterable of str
        A single pattern such as ``'0110'`` or a collection of equally long
        patterns (list, numpy array, pandas Series, ...).

    Returns
    -------
    numpy.ndarray
        For a single string, a 1-D integer array with one entry per
        character. For a collection, a 2-D float array with one row per
        pattern; an empty collection gives an array of shape ``(0, 0)``.
    """
    # isinstance also accepts str subclasses such as numpy.str_, which is
    # what iterating a numpy string array yields.
    if isinstance(yresp, str):
        return np.array(list(yresp)).astype(int)

    # Iterate over the values instead of indexing by position, so a Series
    # whose index is not 0..N-1 is decoded correctly.
    patterns = list(yresp)
    if not patterns:
        return np.empty((0, 0))

    res = np.empty((len(patterns), len(patterns[0])))
    for i, ptrn in enumerate(patterns):
        res[i] = np.array(list(ptrn)).astype(int)
    return res


def get_all_possible_patterns(n):
    """Enumerate all 0/1 vectors of length ``n`` and encode them as patterns.

    The vectors are generated with ``itertools.product([0, 1], repeat=n)`` and
    passed as a list of lists to ``to_str_pattern``, whose result is returned.
    """
    combos = [list(bits) for bits in itertools.product([0, 1], repeat=n)]
    return to_str_pattern(combos)


def get_exp_probs(data, ps, m , L=100, random_state=None):
    """Draw Monte Carlo response probabilities for the m-th posterior sample.

    Parameters
    ----------
    data : dict
        Only ``data['K']`` is read, and only for models that store ``'zz'``.
    ps : dict
        Posterior samples. If it contains ``'zz'``, latent factors are drawn
        from ``N(0, ps['Phi_cov'][m])`` and mapped through
        ``ps['alpha'][m] + z @ ps['beta'][m].T``. Otherwise, if it contains
        ``'Marg_cov'``, the linear predictor is drawn directly from
        ``N(ps['alpha'][m], ps['Marg_cov'][m])``.
    m : int
        Index of the posterior draw to use.
    L : int, default 100
        Number of Monte Carlo draws.
    random_state : optional
        Passed through to ``multivariate_normal.rvs``; set it for
        reproducible draws. The default ``None`` keeps the previous
        behaviour (global numpy random state).

    Returns
    -------
    numpy.ndarray
        Array with ``L`` rows holding ``expit`` of the drawn linear predictor.

    Raises
    ------
    ValueError
        If ``ps`` contains neither ``'zz'`` nor ``'Marg_cov'``.
    """
    ## compute the pi's for the the m-th posterior sample
    if 'zz' in ps:
        z_mc = multivariate_normal.rvs(np.zeros(data['K']),
            ps['Phi_cov'][m], size = L, random_state = random_state)
        # rvs squeezes singleton axes (L == 1 or K == 1); restore (L, K).
        z_mc = np.reshape(z_mc, (L, -1))
        ystr = ps['alpha'][m] + z_mc @ ps['beta'][m].T
    elif 'Marg_cov' in ps:
        ystr = multivariate_normal.rvs(ps['alpha'][m],
            ps['Marg_cov'][m], size = L, random_state = random_state)
        # Same squeeze issue as above; restore one row per draw.
        ystr = np.reshape(ystr, (L, -1))
    else:
        # Previously only printed the message and then failed with an
        # UnboundLocalError on `ystr`.
        raise ValueError("No matching model")

    # logit
    pistr = expit(ystr)

    # probit
    # pistr = norm.cdf(ystr)

    return pistr


def get_Ey(data_ptrn, prob, N):
    """Expected count of every distinct pattern found in ``data_ptrn``.

    For each distinct pattern, the Bernoulli log-pmf of its decoded responses
    under ``prob`` is summed along axis 1, exponentiated, averaged along
    axis 0 and multiplied by ``N``.

    ``prob`` must therefore broadcast to at least two dimensions
    (presumably the array returned by ``get_exp_probs`` -- confirm at caller).

    Returns a dict mapping pattern -> expected count.
    """
    expected = dict()
    for pattern in np.unique(data_ptrn):
        responses = to_nparray_data(pattern)
        log_lik = bernoulli.logpmf(k=responses, p = prob)
        # probability of the whole pattern, one value per row of `prob`
        pattern_prob = np.exp(np.sum(log_lik, 1))
        expected[pattern] = N * np.mean(pattern_prob, 0)
    return expected


def get_Oy(data_ptrn):
    """Count how often each distinct pattern occurs in ``data_ptrn``.

    Returns a dict mapping pattern -> number of occurrences (Python int).
    """
    # single pass: distinct values together with their multiplicities
    patterns, counts = np.unique(data_ptrn, return_counts=True)
    return {ptrn: int(cnt) for ptrn, cnt in zip(patterns, counts)}


def get_Dy(Oy, Ey, data_ptrn):
    """Per-pattern discrepancy ``O * log(O / E)``.

    Parameters
    ----------
    Oy, Ey : dict
        Observed and expected counts, keyed by pattern.
    data_ptrn : array-like
        Patterns; its distinct values determine which keys are evaluated.

    Returns
    -------
    dict
        pattern -> ``Oy[pattern] * log(Oy[pattern] / Ey[pattern])``.
    """
    return {
        ptrn: Oy[ptrn] * np.log(Oy[ptrn]/Ey[ptrn])
        for ptrn in np.unique(data_ptrn)
    }


def get_PPP(data, ps, nsim = 100, L=100):
    """Evaluate the pattern discrepancy on a thinned set of posterior draws.

    Parameters
    ----------
    data : dict
        Reads ``data['DD']`` (responses passed to ``to_str_pattern``) and
        ``data['N']``; ``data`` is also forwarded to ``get_exp_probs``.
    ps : dict
        Posterior samples; ``ps['alpha'].shape[0]`` is taken as the number
        of available draws.
    nsim : int, default 100
        Number of draws to evaluate, taken at equally spaced indices
        ``0, step, 2*step, ...`` with ``step = n_draws // nsim``.
        Note: if ``nsim`` exceeds the number of draws, ``step`` is 0 and
        draw 0 is evaluated every time.
    L : int, default 100
        Monte Carlo sample size forwarded to ``get_exp_probs``.

    Returns
    -------
    PPP_vals : numpy.ndarray
        One summed discrepancy ``sum_y O_y * log(O_y / E_y)`` per evaluated
        draw, the sum running over the patterns observed in the data.
    Dy : dict
        Per-pattern discrepancies of the LAST evaluated draw only.
    """
    nsim_N = ps['alpha'].shape[0]
    skip_step = nsim_N // nsim

    # The observed patterns and their counts do not depend on the posterior
    # draw, so they are computed once instead of once per iteration.
    data_ptrn = to_str_pattern(data['DD'])
    Oy = get_Oy(data_ptrn)

    PPP_vals = np.empty(nsim)
    for m_ind in tqdm(range(nsim)):
        m = skip_step*m_ind
        # expected counts under draw m, then the discrepancy
        pi = get_exp_probs(data, ps, m, L)
        Ey = get_Ey(data_ptrn, pi, data['N'])
        Dy = get_Dy(Oy, Ey, data_ptrn)

        # Patterns that never occur in the data are left out of Dy; they
        # would add 0 to the sum.
        PPP_vals[m_ind] = sum(Dy.values())

    return PPP_vals, Dy



In [98]:
# Number of posterior draws evaluated per fold.
nsim = 100
# One mean discrepancy per fold.
Ds = np.empty(3)
for fold_index in range(3):
    # Score each fold's 'test' split with the posterior samples stored under
    # the same fold index (presumably fitted on that fold's training split).
    PPP_vals, Dy = get_PPP(complete_data[fold_index]['test'], model_posterior_samples[fold_index], nsim)
    Ds[fold_index] = np.mean(PPP_vals)

# Total over the three folds. No random seed is set in the visible cells and
# get_PPP draws random numbers, so this value varies between runs.
np.sum(Ds)


HBox(children=(IntProgress(value=0), HTML(value='')))




HBox(children=(IntProgress(value=0), HTML(value='')))




HBox(children=(IntProgress(value=0), HTML(value='')))




121.03917654456305

In [99]:
# sim 0
#m1 = 107.85480268402688
#m2 =  111.

In [100]:
# sim1
# #m1 = 156.22994039411827
# #m2 = 148.84531366870453

In [101]:
# sim2 -- recorded values of np.sum(Ds) per model (m2 matches the output above)
# m1 = 121.79978659604676
# m2 = 121.03917654456305