In [77]:
import datetime
import os
import sys

import numpy as np
import pandas as pd
import pystan
from scipy.special import expit
from scipy.stats import norm, multivariate_normal, invwishart, invgamma, bernoulli
from statsmodels.tsa.stattools import acf
from tqdm.notebook import tqdm

from codebase.plot import * 
from codebase.data import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
def get_avg_probs(data, ps, m, L=1, c=0.2):
    """Negative log score of binary data under the m-th posterior draw.

    For each of ``L`` Monte Carlo replicates, latent factors are drawn from
    a zero-mean normal with covariance ``ps['Phi_cov'][m]``, mapped to the
    linear predictor ``alpha + z @ beta.T`` and, when the posterior contains
    a ``'uu'`` entry, perturbed by independent normal noise with standard
    deviation ``c``. The logistic probabilities are averaged over the ``L``
    replicates and scored against ``data['DD']``.

    Parameters
    ----------
    data : dict
        Must provide ``'N'``, ``'J'``, ``'K'`` and the binary array ``'DD'``
        (broadcastable against an ``(N, J)`` array).
    ps : dict
        Posterior samples with keys ``'alpha'``, ``'beta'``, ``'Phi_cov'``,
        each indexed by draw along the first axis; an optional ``'uu'`` key
        switches on the residual noise term.
    m : int
        Index of the posterior draw to use.
    L : int, optional
        Number of Monte Carlo replicates averaged per draw.
    c : float, optional
        Standard deviation of the residual noise (used only with ``'uu'``).

    Returns
    -------
    float
        Sum over all entries of
        ``-DD*log(pi) - (1-DD)*log(1-pi)`` with ``pi`` the averaged
        probabilities.
    """
    N, J, K = data['N'], data['J'], data['K']
    has_residual = 'uu' in ps.keys()
    dd = np.asarray(data['DD'])

    ystr = np.empty((L, N, J))
    for l in range(L):
        # `rvs` squeezes singleton axes (N == 1 or K == 1), so restore the
        # (N, K) layout before the matrix product.
        z_mc = np.reshape(
            multivariate_normal.rvs(np.zeros(K), ps['Phi_cov'][m], size=N),
            (N, K),
        )
        ystr[l] = ps['alpha'][m] + z_mc @ ps['beta'][m].T
        if has_residual:
            u_mc = np.reshape(
                multivariate_normal.rvs(np.zeros(J), np.eye(J) * c**2, size=N),
                (N, J),
            )
            ystr[l] = ystr[l] + u_mc

    # logit link (a probit link would use norm.cdf(ystr) instead)
    pistr = expit(ystr)

    # Average over the L replicates, then keep the probabilities strictly
    # inside (0, 1): a saturated expit would otherwise give 0 * log(0) = nan.
    eps = np.finfo(float).eps
    piavg = np.clip(np.mean(pistr, 0), eps, 1 - eps)

    lgscr = -dd * np.log(piavg) - (1 - dd) * np.log(1 - piavg)

    return np.sum(lgscr)

def get_lgscr(fold_index, ps, L):
    """Average the per-draw score of one fold over all its posterior draws.

    Each draw ``m`` of ``ps[fold_index]`` is scored with ``get_avg_probs``
    against ``complete_data[fold_index]['test']`` (a notebook-level global);
    the number of draws is taken from the first axis of ``'alpha'``.
    Returns the arithmetic mean of those scores.
    """
    fold_ps = ps[fold_index]
    n_draws = fold_ps['alpha'].shape[0]
    per_draw = np.fromiter(
        (
            get_avg_probs(complete_data[fold_index]['test'], fold_ps, m=draw, L=L)
            for draw in tqdm(range(n_draws))
        ),
        dtype=float,
        count=n_draws,
    )
    return np.sum(per_draw) / n_draws


In [79]:
# Sampler configuration constants.
num_chains, num_samples, num_warmup = 1, 1000, 1000
# num_iter is the warmup count plus the sample count.
num_iter = num_warmup + num_samples

In [80]:
# Run directory; other cells build file paths by appending a file name to
# this string, so it has to end with a slash.
log_dir = "./log/bin_sim_2factor_logit_2/sim3/20200109_202544_bin2f_kfold_sim3_d3_m2//"
if not log_dir.endswith("/"):
    print("\n\nAppending `/`-character at the end of directory")
    log_dir += "/"


In [81]:
print("\n\nChecking data integrity...\n\n")
complete_data = load_obj("complete_data", log_dir)

# Posterior samples stored as ps_0, ps_1 and ps_2, keyed by their index.
model_posterior_samples = dict(
    (fold, load_obj('ps_{}'.format(fold), log_dir)) for fold in range(3)
)




Checking data integrity...




In [82]:
# Mean of `beta` over the first (draw) axis for index 2, to two decimals.
np.round(np.mean(model_posterior_samples[2]['beta'], axis=0), decimals=2)

array([[ 1.  , -0.01],
       [ 1.65,  0.07],
       [ 1.06, -0.03],
       [ 0.05,  1.  ],
       [ 0.05,  1.33],
       [-0.05,  1.77]])

In [83]:
# Print the model definition saved with the run; the context manager closes
# the file handle instead of leaving it open.
with open(log_dir+'model.txt') as model_file:
    print(model_file.read())

data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  int<lower=0, upper=1> DD[N, J];
}

transformed data{
  vector[K] zeros_K = rep_vector(0, K);
  vector[J] zeros_J = rep_vector(0, J);
  cov_matrix[J] I_J = diag_matrix(rep_vector(1, J));
  cov_matrix[K] I_K = diag_matrix(rep_vector(1, K));
  real<lower=0> c = 0.2;
}

parameters {
  vector[J] alpha;
  matrix[2,K] beta_free; // 2 free eleements per factor
  matrix[J-3,K] beta_zeros; // 3 zero elements per factor
  cov_matrix [K] Phi_cov;
  matrix[N,K] zz;
  matrix[N,J] uu;
}

transformed parameters{
  matrix[J,K] beta;
  matrix[N,J] yy;

  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }
  // set ones
  for (k in 1:K) beta[1+3*(k-1), k] = 1;
  // set the free elements
  for (k in 1:K) beta[2+3*(k-1) : 3+3*(k-1), k] = beta_free[1:2,k];
  // set the zero elements
  beta[4:J, 1] = beta_zeros[1:(J-3), 1];
  beta[1:(J-3), K] = beta_zeros[1:(J-3), K];

  for (n in 1:N) yy[n,] = to_row_vector(alpha) + zz[n,] * beta' + uu[n,];
}


In [84]:

# Score every fold on its test split; nothing is loaded or validated here,
# so the progress message describes the actual work.
print("\n\nComputing log scores per fold...\n\n")

mcmc_length = model_posterior_samples[0]['alpha'].shape[0]

# One score per fold; the count follows the loaded posteriors rather than a
# hard-coded 3.
n_folds = len(model_posterior_samples)
Ds = np.empty(n_folds)
for fold_index in range(n_folds):
    Ds[fold_index] = get_lgscr(fold_index, model_posterior_samples, L=1)

Ds



Loading files...




Checking data integrity...




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




array([1599.85866914, 1586.54713427, 1565.06535306])

In [85]:
# Average of the per-fold scores, rounded to the nearest integer.
np.round(Ds.mean())

1584.0

In [86]:
# Load the stored "data" object from the run directory; as the last
# expression of the cell, its full contents are displayed.
load_obj("data", log_dir)


{'random_seed': 0,
 'N': 1000,
 'K': 2,
 'J': 6,
 'alpha': array([0., 0., 0., 0., 0., 0.]),
 'beta': array([[1. , 0. ],
        [0.8, 0. ],
        [0.8, 0. ],
        [0. , 1. ],
        [0. , 0.8],
        [0. , 0.8]]),
 'Theta': array([[1. , 0. , 0. , 0. , 0. , 0. ],
        [0. , 1. , 0. , 0.2, 0.2, 0. ],
        [0. , 0. , 1. , 0.2, 0.2, 0. ],
        [0. , 0.2, 0.2, 1. , 0. , 0.2],
        [0. , 0.2, 0.2, 0. , 1. , 0.2],
        [0. , 0. , 0. , 0.2, 0.2, 1. ]]),
 'e': array([[ 0.07362804, -0.35804922,  3.8493088 ,  3.11745151,  1.05760669,
         -0.21831235],
        [ 1.14804991,  1.40820474, -0.1814078 ,  2.07484067,  0.6911965 ,
         -0.73138306],
        [ 0.43977892, -1.1254447 , -1.00673112, -0.54772119, -1.22090514,
          1.17261783],
        ...,
        [-0.48930738, -0.17425751,  3.22360071, -0.95691075,  2.63698591,
          1.55915334],
        [-0.05675429, -1.03914382, -1.14293677,  1.83042718, -2.23481193,
          1.00737145],
        [-1.36420608, -0