In [30]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma, bernoulli
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from tqdm.notebook import tqdm
from codebase.plot import * 
from codebase.data import * 
from codebase.file_utils import save_obj, load_obj

from modelresultsbinary import *
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# Directory holding the saved objects of one simulation run.
log_dir = "./log/binary_data_simulation/sim0/m1/20200306_143736_4ch_s0m1/"
# log_dir = "./log/20200305_143550_test_s1m2_2/"

# Load the simulated dataset first, then the posterior samples, from the same directory.
data, ps = (load_obj(obj_name, log_dir) for obj_name in ('data', 'ps'))

# Show which quantities were saved in the posterior samples.
ps.keys()


odict_keys(['beta', 'alpha', 'zz', 'Phi_cov', 'yy'])

In [32]:
# Display the loaded simulation dictionary (settings, true parameter values and simulated arrays).
data

{'random_seed': 0,
 'N': 2000,
 'K': 2,
 'J': 6,
 'alpha': array([0., 0., 0., 0., 0., 0.]),
 'beta': array([[1. , 0. ],
        [0.8, 0. ],
        [0.8, 0. ],
        [0. , 1. ],
        [0. , 0.8],
        [0. , 0.8]]),
 'sigma_z': array([1., 1.]),
 'Phi_corr': array([[1. , 0.2],
        [0.2, 1. ]]),
 'Phi_cov': array([[1. , 0.2],
        [0.2, 1. ]]),
 'z': array([[-1.61951071, -1.11334743],
        [-2.17539248,  0.65913812],
        [-0.8285194 , -2.064689  ],
        ...,
        [ 0.0127624 , -0.6246135 ],
        [-1.35910483, -1.39777625],
        [-0.70754609, -1.73566846]]),
 'y': array([[-0.62398864,  2.09426518, -0.86156227,  0.8064457 , -4.04106126,
          0.26342   ],
        [-2.12690119, -0.48704737, -0.30832403, -3.74932225, -0.18691918,
         -0.5637349 ],
        [-0.58544445, -4.27993112, -2.49696605, -0.39293861, -1.36906947,
         -3.4812806 ],
        ...,
        [ 0.84067238,  0.09329643, -3.0005377 , -1.36119658, -3.01358029,
          0.02617155],


In [33]:
# Print the Stan program that was saved alongside this run.
# A context manager closes the file handle deterministically instead of leaving it to the garbage collector.
with open(os.path.join(log_dir, 'model.txt')) as model_file:
    print(model_file.read())

data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  int<lower=0, upper=1> DD[N, J];
}

transformed data{
  vector[K] zeros_K = rep_vector(0, K);
}

parameters {
  vector[J] alpha;
  matrix[3,K] beta_free; // 3 free eleements per factor
  cholesky_factor_corr[K] L_Phi;
  matrix[N,K] zz;
}

transformed parameters{
  matrix[J,K] beta;
  matrix[N,J] yy;

  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }
  // set the free elements
  for (k in 1:K) beta[1+3*(k-1) : 3+3*(k-1), k] = beta_free[1:3,k];

  for (n in 1:N) yy[n,] = to_row_vector(alpha) + zz[n,] * beta';
}
  
model {
  to_vector(beta_free) ~ normal(0, 1);
  to_vector(alpha) ~ normal(0, 10);
  L_Phi ~ lkj_corr_cholesky(2);
  for (n in 1:N) to_vector(zz[n,])  ~ multi_normal_cholesky(zeros_K, L_Phi);
  for (j in 1:J) DD[, j] ~ bernoulli_logit(yy[, j]);
  
}

generated quantities{
  corr_matrix[K] Phi_cov = multiply_lower_tri_self_transpose(L_Phi);
}



In [34]:
# Dimensions of the stored alpha samples.
np.shape(ps['alpha'])

(2000, 4, 6)

In [35]:
# Take the first entry along the leading axis of yy, average over its first remaining axis,
# and report the shape of the result.
ps['yy'][0].mean(axis=0).shape

(2000, 6)

In [36]:
# Read the sampler dimensions from the stored array instead of hard-coding them:
# the first two axes of ps['alpha'] are indexed as (sample, chain) throughout this notebook.
num_samples, num_chains = ps['alpha'].shape[:2]

# NOTE(review): warmup is taken to be as long as the number of kept samples;
# confirm against the settings the sampler was actually run with.
num_warmup = num_samples
num_iter = num_samples + num_warmup

In [37]:
# Align the sign of each factor across all samples and chains (in place).
# Factor 0 is flipped so that beta[0, 0] is non-negative, factor 1 so that beta[3, 1] is
# non-negative; the off-diagonal of Phi_cov is flipped to match.
# The operation is vectorised over every sample and chain present in the arrays, so it does not
# rely on a separately maintained chain count.
# NOTE(review): ps['zz'] is not flipped here, so zz and beta no longer share a sign
# convention after this cell - confirm that nothing downstream combines them.
beta_samples = ps['beta']       # indexed [sample, chain, item, factor]
phi_samples = ps['Phi_cov']     # indexed [sample, chain, factor, factor]

# np.sign returns new arrays, so the signs are not affected by the in-place updates below.
sign1 = np.sign(beta_samples[:, :, 0, 0])
sign2 = np.sign(beta_samples[:, :, 3, 1])

# Items 0-2 load on factor 0, items 3 onwards load on factor 1.
beta_samples[:, :, :3, 0] *= sign1[:, :, np.newaxis]
beta_samples[:, :, 3:, 1] *= sign2[:, :, np.newaxis]

# Flip the factor correlation consistently and keep the matrix symmetric.
phi_samples[:, :, 0, 1] *= sign1 * sign2
phi_samples[:, :, 1, 0] = phi_samples[:, :, 0, 1]


In [38]:
# nsim_N = 100
# data['K'] = 2
# PPP_vals, Dy, Dystr = get_PPP(data, ps, nsim_N)


In [39]:
%%opts Layout [fig_size=200]

chain_number = 1
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['beta'][:,chain_number,j,k],
             true_value=data['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [40]:
%%opts Layout [fig_size=200]

chain_number = 0

plots = []
for j in range(data['J']):
        plots.append(plot_trace(ps['alpha'][:,chain_number,j],
             true_value=data['alpha'][j],
             title = 'Posterior distribution for alpha(%s)'%(j)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [41]:
%%opts Layout [fig_size=200]

chain_number = 1

plots = []
for j in range(data['K']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['Phi_cov'][:,chain_number,j,k],
             true_value=data['Phi_cov'][j,k],
             title = 'Posterior distribution for Phi_cov(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=True) # use same y-range for all plots?

layout.cols(2)


In [43]:
# Number of simulations requested from get_PPP.
nsim = 100
PPP_vals, Dy, Dystr = get_PPP(data, ps, nsim)

# Percentage of simulations in which the second column of PPP_vals exceeds the first.
# NOTE(review): dividing by nsim presumes PPP_vals holds one row per simulation - confirm in get_PPP.
num_exceeding = np.sum(PPP_vals[:, 1] > PPP_vals[:, 0])
100 * num_exceeding / nsim

HBox(children=(IntProgress(value=0), HTML(value='')))




57.0