In [178]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma, bernoulli
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from tqdm.notebook import tqdm
from codebase.plot import * 
from codebase.data import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [179]:
# NOTE(review): this cell uses get_avg_probs, complete_data, model_posterior_samples
# and fold_index, all of which are only defined in cells further down the notebook.
# It therefore works only after those cells have been executed, not on a fresh
# top-to-bottom run.
# Averaged predicted probabilities for posterior draw m=0 with c=0.4 and L=1 simulation.
get_avg_probs(complete_data[fold_index]['test'], model_posterior_samples[fold_index], 0, 0.4, 1)

array([[0.51181525, 0.60609747, 0.3464872 , 0.68040391, 0.54708026,
        0.46778698],
       [0.37411383, 0.65495432, 0.38959692, 0.60033664, 0.83315254,
        0.74677492],
       [0.54661188, 0.61347008, 0.63342206, 0.41746335, 0.33318507,
        0.4092692 ],
       ...,
       [0.48669687, 0.43166055, 0.37717889, 0.60266249, 0.61694272,
        0.81767478],
       [0.53967258, 0.61744314, 0.5356735 , 0.35243497, 0.35253977,
        0.56826925],
       [0.56491028, 0.51093277, 0.6563597 , 0.40227643, 0.27230401,
        0.2739961 ]])

In [180]:
def get_prob_pred(data, ps, m, c=.4):
    """Simulate one matrix of predictive probabilities from posterior draw ``m``.

    For each of the ``data['N']`` rows a latent factor vector is drawn from a
    zero-mean multivariate normal with covariance ``ps['Phi_cov'][m]`` and
    mapped to the linear predictor ``alpha + z @ beta.T``. If ``ps`` contains
    a ``'uu'`` key, zero-mean normal noise is added to the predictor, with
    covariance ``ps['Omega_cov'][m]`` when that key exists and ``c**2 * I``
    otherwise. The predictor is then passed through the logistic function.

    Parameters
    ----------
    data : dict
        Reads the integer sizes ``'N'``, ``'K'`` and ``'J'``.
    ps : dict
        Posterior samples indexed by draw on the first axis. Reads
        ``'alpha'``, ``'beta'``, ``'Phi_cov'`` and, optionally,
        ``'Omega_cov'``; ``'uu'`` is only checked for presence.
    m : int
        Index of the posterior draw to use.
    c : float, optional
        Standard deviation of the added noise when ``'uu'`` is present but
        ``'Omega_cov'`` is not.

    Returns
    -------
    numpy.ndarray
        Probabilities of shape ``(N, J)``.
    """
    # Imported locally so the function does not rely on `expit` leaking in
    # through a star import elsewhere in the notebook.
    from scipy.special import expit

    N, K, J = data['N'], data['K'], data['J']

    # scipy squeezes length-one axes out of the rvs output (K == 1 yields
    # shape (N,), N == 1 yields (K,)), which breaks the matrix product below.
    # Restore the (N, K) layout explicitly.
    z_mc = np.reshape(
        multivariate_normal.rvs(np.zeros(K), ps['Phi_cov'][m], size=N),
        (N, K))
    ystr = ps['alpha'][m] + z_mc @ ps['beta'][m].T

    if 'uu' in ps:
        if 'Omega_cov' in ps:
            u_cov = ps['Omega_cov'][m]
        else:
            u_cov = np.eye(J) * c**2
        # Same squeeze issue as above when J == 1 or N == 1.
        u_mc = np.reshape(
            multivariate_normal.rvs(np.zeros(J), u_cov, size=N),
            (N, J))
        ystr = ystr + u_mc

    # logit link; the probit alternative would be norm.cdf(ystr)
    return expit(ystr)

def get_avg_probs(data, ps, m, c, L = 10):
    """Average ``L`` repeated ``get_prob_pred`` simulations for posterior draw ``m``.

    Each repetition calls ``get_prob_pred(data, ps, m, c)``; the results are
    stored in an ``(L, data['N'], data['J'])`` buffer and averaged over the
    repetition axis, giving an array of shape ``(data['N'], data['J'])``.
    """
    draws = np.empty((L, data['N'], data['J']))
    for rep in range(L):
        draws[rep] = get_prob_pred(data, ps, m, c)
    return draws.mean(axis=0)
        
#     lgscr = -data['DD']*np.log(piavg) - (1-data['DD'])*np.log(1-piavg)

#     return np.sum(lgscr)





def get_lgscr(data, ps, c, L):
    """Average log score of the binary data over all posterior draws.

    For every posterior draw the averaged predicted probabilities from
    ``get_avg_probs`` are scored against ``data['DD']`` with
    ``-DD*log(p) - (1-DD)*log(1-p)`` summed over all entries; the per-draw
    sums are then averaged over the draws.

    Parameters
    ----------
    data : dict
        Reads ``'DD'`` (0/1 outcomes, broadcastable against the
        ``(N, J)`` probability matrix) and is passed on to ``get_avg_probs``.
    ps : dict
        Posterior samples; the number of draws is ``ps['alpha'].shape[0]``.
    c : float
        Forwarded to ``get_avg_probs``.
    L : int
        Number of simulations averaged per draw, forwarded to
        ``get_avg_probs``.

    Returns
    -------
    float
        Mean over posterior draws of the summed log score.
    """
    nsim_N = ps['alpha'].shape[0]
    # Accept list-valued outcomes as well as arrays.
    DD = np.asarray(data['DD'])
    # Keep probabilities strictly inside (0, 1): a saturated value of exactly
    # 0 or 1 would otherwise produce 0 * log(0) = nan (or inf) and poison the
    # whole score.
    eps = np.finfo(float).eps
    lgscrs = np.empty(nsim_N)
    for m_ind in tqdm(range(nsim_N)):
        piavg = get_avg_probs(data, ps, m=m_ind, c=c, L=L)
        piavg = np.clip(piavg, eps, 1 - eps)
        lgscr = -DD*np.log(piavg) - (1-DD)*np.log(1-piavg)
        lgscrs[m_ind] = np.sum(lgscr)

    return np.sum(lgscrs)/nsim_N


In [181]:
# Sampler settings (presumably for the pystan run -- TODO confirm; they are not
# referenced by any other cell shown here).
num_chains = 1
num_warmup = 1000
num_samples = 1000
# total iterations = warmup + retained samples
num_iter = num_warmup + num_samples

In [182]:
# Directory of the saved run. Later cells build file paths by plain string
# concatenation (log_dir + 'model.txt'), so it must end with "/".
log_dir = "./log/20200109_202506_bin2f_kfold_sim0_d3_m1//"
if not log_dir.endswith("/"):
    print("\n\nAppending `/`-character at the end of directory")
    log_dir += "/"


In [183]:
print("\n\nChecking data integrity...\n\n")
complete_data = load_obj("complete_data", log_dir)

# One saved set of posterior samples per fold ('ps_0', 'ps_1', 'ps_2'),
# keyed by the fold index.
model_posterior_samples = {
    fold: load_obj('ps_' + str(fold), log_dir) for fold in (0, 1, 2)
}




Checking data integrity...




In [184]:
# Mean of the fold-0 `beta` samples over the draw axis, rounded to 2 decimals.
np.round(np.mean(model_posterior_samples[0]['beta'], axis=0), decimals=2)

array([[1.  , 0.  ],
       [0.7 , 0.  ],
       [1.42, 0.  ],
       [0.  , 1.  ],
       [0.  , 1.66],
       [0.  , 1.18]])

In [185]:
# Show the model definition saved alongside the run. The context manager
# closes the file handle, which the bare open(...).read() never did explicitly.
with open(log_dir+'model.txt') as model_file:
    print(model_file.read())

data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  int<lower=0, upper=1> DD[N, J];
}

transformed data{
  vector[K] zeros_K = rep_vector(0, K);
  cov_matrix[K] I_K = diag_matrix(rep_vector(1, K));
}

parameters {
  vector[J] alpha;
  matrix[2,K] beta_free; // 2 free eleements per factor
  cov_matrix [K] Phi_cov;
  matrix[N,K] zz;
}

transformed parameters{
  matrix[J,K] beta;
  matrix[N,J] yy;

  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }
  
  // set ones
  for (k in 1:K) beta[1+3*(k-1), k] = 1;
  // set the free elements
  for (k in 1:K) beta[2+3*(k-1) : 3+3*(k-1), k] = beta_free[1:2,k];

  for (n in 1:N) yy[n,] = to_row_vector(alpha) + zz[n,] * beta';
}
  
model {
  to_vector(beta_free) ~ normal(0, 1);
  to_vector(alpha) ~ normal(0, 10);
  Phi_cov ~ inv_wishart(J+4, I_K);
  for (n in 1:N) to_vector(zz[n,]) ~ multi_normal(zeros_K, Phi_cov);
  for (j in 1:J) DD[, j] ~ bernoulli_logit(yy[, j]);
  
}



In [186]:

# NOTE(review): the two messages below do not describe this cell -- nothing is
# loaded or checked here; the cell only computes the per-fold log scores.
print("\n\nLoading files...\n\n")


print("\n\nChecking data integrity...\n\n")

# Number of posterior draws for fold 0 (not used by any cell shown here).
mcmc_length = model_posterior_samples[0]['alpha'].shape[0]

# Log score of each fold's test split under that fold's posterior samples.
# `c` only has an effect when the samples contain a 'uu' key (see get_prob_pred).
# The loop variable `fold_index` stays defined afterwards (value 2).
Ds = np.empty(3)
for fold_index in range(3):
    Ds[fold_index] = get_lgscr(complete_data[fold_index]['test'], model_posterior_samples[fold_index], c=0.2, L=10)

Ds



Loading files...




Checking data integrity...




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




array([1400.35504378, 1393.58106045, 1395.06555097])

In [187]:
# Mean log score across the three folds, rounded to 2 decimals.
np.round(Ds.mean(), decimals=2)

1396.33