In [47]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma, bernoulli
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from tqdm.notebook import tqdm
from codebase.plot import * 
from codebase.data import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
log_dir = "./log/FND/CV/20200413_042953_FNDcv_s1m6//"
# Later cells build file paths as `log_dir + filename`, so make sure the
# directory string ends with a path separator.
if not log_dir.endswith("/"):
    print("\n\nAppending `/`-character at the end of directory")
    log_dir += "/"
data = load_obj("data", log_dir)

In [49]:
print("\n\nChecking data integrity...\n\n")
complete_data = load_obj("complete_data", log_dir)

# Load the three saved posterior-sample objects and key them by 0, 1, 2.
model_posterior_samples = {
    index: load_obj(name, log_dir)
    for index, name in enumerate(('ps_0', 'ps_1', 'ps_2'))
}




Checking data integrity...




In [50]:
# Display the object loaded from the "data" entry of `log_dir`.
data

{'D': array([[1, 0, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [1, 0, 0, 1, 1, 1],
        ...,
        [1, 1, 1, 1, 0, 0],
        [1, 0, 0, 1, 0, 1],
        [1, 0, 1, 1, 0, 0]]), 'N': 565, 'J': 6, 'K': 2}

In [51]:
# List which quantities are stored in the first posterior-sample object.
model_posterior_samples[0].keys()

odict_keys(['alpha', 'yy', 'beta', 'Marg_cov', 'Omega_cov'])

In [52]:
num_chains = 4
# The number of retained draws is the leading-axis length of the first
# object's `alpha` samples; the warmup length is set to the same value.
num_samples = num_warmup = model_posterior_samples[0]['alpha'].shape[0]
num_iter = num_samples + num_warmup

In [53]:
# Sign alignment of the loadings, applied draw by draw:
#   * rows 0-2 of column 0 are multiplied by sign(beta[0, 0]),
#   * rows 3+  of column 1 are multiplied by sign(beta[3, 1]),
# so both reference loadings end up non-negative. When a `Phi_cov` entry is
# present, its off-diagonal element is multiplied by the product of the two
# signs and mirrored to keep the matrix symmetric.
for j in range(3):
    ps = model_posterior_samples[j]
    for chain_number in range(num_chains):
        for i in range(num_samples):
            # assumes ps['beta'] is a NumPy array, so this slice is a view and
            # the in-place updates below write through to ps['beta']
            beta_draw = ps['beta'][i, chain_number]
            sign1 = np.sign(beta_draw[0, 0])
            sign2 = np.sign(beta_draw[3, 1])
            beta_draw[:3, 0] *= sign1
            beta_draw[3:, 1] *= sign2
            if 'Phi_cov' in ps:
                phi_draw = ps['Phi_cov'][i, chain_number]
                phi_draw[0, 1] *= sign1 * sign2
                phi_draw[1, 0] = phi_draw[0, 1]
    model_posterior_samples[j] = ps

In [54]:
# Show the Stan program stored next to the samples. The context manager
# closes the file handle once the text has been read.
with open(log_dir+'model.txt') as model_file:
    print(model_file.read())

data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  int<lower=0, upper=1> DD[N, J];
}

transformed data{
  cov_matrix[J] I_J = diag_matrix(rep_vector(1, J));
}

parameters {
  vector[J] alpha;
  matrix[J,K] beta;
  matrix[N,J] yy;
  cov_matrix[J] Omega_cov;
}

transformed parameters{
  cov_matrix[J] Marg_cov;
  Marg_cov = beta * beta'+ Omega_cov;
}

model {
  to_vector(beta) ~ normal(0, 1);
  to_vector(alpha) ~ normal(0, 10);
  Omega_cov ~ inv_wishart(J+6, I_J);
  for (n in 1:N) yy[n,] ~ multi_normal(alpha, Marg_cov);
  for (j in 1:J) DD[, j] ~ bernoulli_logit(yy[, j]);
}

generated quantities{
  matrix[J,J] betabeta =  beta * beta';
}



In [55]:
%%opts Layout [fig_size=200]

# Trace plots of every beta(j, k) entry for one chain of the second
# posterior-sample object, arranged in a two-column layout.
chain_number = 2
beta_draws = model_posterior_samples[1]['beta']

plots = []
for j in range(data['J']):
    for k in range(data['K']):
        trace = plot_trace(
            beta_draws[:, chain_number, j, k],
            title='Posterior distribution for beta(%s,%s)' % (j, k),
        )
        plots.append(trace.options(fig_inches=8, aspect=3))

layout = hv.Layout(plots).options(
    show_title=True,
    vspace=.3,
    absolute_scaling=False,
    normalize=False,  # use same y-range for all plots?
)

layout.cols(2)


In [56]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal, bernoulli, norm
import datetime
import sys
import os
import itertools

from codebase.file_utils import save_obj, load_obj
from scipy.special import expit
from scipy.special import ndtri

def to_str_pattern(y0):
    """Encode responses as concatenated-digit strings.

    Parameters
    ----------
    y0 : array-like
        Either a 1-D sequence (one response vector) or a 2-D table with one
        response vector per row. Plain lists are accepted for both shapes.

    Returns
    -------
    str
        For 1-D input: the elements converted to strings and joined.
    pandas.Series
        For 2-D input: one joined string per row.
    None
        For any other number of dimensions.
    """
    ndim = np.ndim(y0)
    if ndim == 1:
        # Objects that already provide `.astype` (ndarray, Series) take the
        # same path as before; plain lists/tuples are converted first so they
        # no longer raise AttributeError.
        values = y0 if hasattr(y0, 'astype') else np.asarray(y0)
        return ''.join(values.astype(str))
    if ndim == 2:
        y = pd.DataFrame(y0)
        yresp = y.apply(lambda x: ''.join(x.astype(str)), axis=1)
        return yresp
    # Scalars and arrays with more than two dimensions are not supported.
    return None


def to_nparray_data(yresp):
    """Decode digit-string patterns back into numeric arrays.

    Parameters
    ----------
    yresp : str or iterable of str
        A single pattern such as ``'0110'``, or a collection of equal-length
        patterns (pandas Series, NumPy array, list, ...).

    Returns
    -------
    numpy.ndarray
        For a single string: a 1-D integer array with one entry per character.
        For a collection: a 2-D float array with one row per pattern (the
        float dtype comes from ``np.empty`` and is kept for compatibility).
    """
    # isinstance also accepts str subclasses such as numpy.str_, which
    # appear when iterating over a NumPy string array.
    if isinstance(yresp, str):
        return np.array(list(yresp)).astype(int)

    # Materialize positionally so a pandas Series with a non-default index is
    # handled the same way as one with a 0..N-1 index.
    patterns = list(yresp)
    N = len(patterns)
    J = len(patterns[0]) if N else 0
    res = np.empty((N, J))
    for i, ptrn in enumerate(patterns):
        res[i] = np.array(list(ptrn)).astype(int)
    return res


def get_all_possible_patterns(n):
    """Return every 0/1 combination of length ``n`` via ``to_str_pattern``.

    The combinations are generated in ``itertools.product`` order and handed
    to ``to_str_pattern`` as a list of lists.
    """
    combos = [list(bits) for bits in itertools.product([0, 1], repeat=n)]
    return to_str_pattern(combos)


def get_probs2(data, ps, m, cn):
    """Logistic transform of the ``yy`` samples at index ``[m, cn]``.

    Parameters
    ----------
    data : unused
        Not read by this function.
    ps : mapping
        Must contain ``'yy'``, indexable as ``ps['yy'][m, cn]``.
    m, cn : int
        Indices into the first and second axes of ``ps['yy']``.

    Returns
    -------
    ``scipy.special.expit`` applied elementwise to the selected slice.
    """
    return expit(ps['yy'][m, cn])


def get_Ey(data_ptrn, prob, N):
    """Expected count of each distinct pattern in ``data_ptrn``.

    For every distinct pattern, the Bernoulli log-pmf of the decoded pattern
    under ``prob`` is summed along axis 1, exponentiated, averaged along
    axis 0 and multiplied by ``N``.

    Parameters
    ----------
    data_ptrn : array-like of str
        Response patterns; only the distinct values are used.
    prob : array-like
        Success probabilities passed to ``bernoulli.logpmf``.
        Assumes a 2-D (rows x items) layout -- TODO confirm against caller.
    N : number
        Scale factor applied to the averaged pattern probability.

    Returns
    -------
    dict
        Maps each distinct pattern to its expected count.
    """
    Ey = {}
    for ptrn in np.unique(data_ptrn):
        item_logp = bernoulli.logpmf(k=to_nparray_data(ptrn), p=prob)
        pattern_prob = np.exp(np.sum(item_logp, axis=1))
        Ey[ptrn] = N * np.mean(pattern_prob, axis=0)
    return Ey


def get_Oy(data_ptrn):
    """Count how often each distinct pattern occurs in ``data_ptrn``.

    Returns a dict mapping every distinct pattern to the number of entries
    of ``data_ptrn`` equal to it.
    """
    return {
        ptrn: np.count_nonzero(data_ptrn == ptrn)
        for ptrn in np.unique(data_ptrn)
    }


def get_Dy(Oy, Ey, data_ptrn):
    """Per-pattern discrepancy ``O * log(O / E)``.

    Parameters
    ----------
    Oy, Ey : mapping
        Observed and expected counts, keyed by pattern.
    data_ptrn : array-like of str
        Response patterns; its distinct values select the keys evaluated.

    Returns
    -------
    dict
        Maps each distinct pattern to ``Oy[p] * log(Oy[p] / Ey[p])``.
    """
    return {
        ptrn: Oy[ptrn] * np.log(Oy[ptrn] / Ey[ptrn])
        for ptrn in np.unique(data_ptrn)
    }


def get_lgscr(data, ps, nsim = 100):
    """Score a data set against thinned posterior draws, chain by chain.

    For ``nsim`` evenly spaced draws and every chain, the expected pattern
    counts are computed from ``ps['yy']`` and compared with the observed
    counts; the per-pattern discrepancies are summed into one value.

    Parameters
    ----------
    data : mapping
        Must provide ``'DD'`` (the binary responses) and ``'N'``.
    ps : mapping
        Posterior samples. ``ps['alpha'].shape[0]`` gives the number of
        draws; ``ps['yy']`` is indexed as ``[draw, chain]``.
    nsim : int
        Number of draws to evaluate.

    Returns
    -------
    lgscr_vals : numpy.ndarray, shape (nsim, n_chains)
        Summed discrepancy for every evaluated draw and chain.
    Dy : dict
        Per-pattern discrepancies of the last draw/chain evaluated only.
    """
    nsim_N = ps['alpha'].shape[0]
    # NOTE(review): when nsim exceeds the number of draws the step is 0 and
    # every iteration re-evaluates draw 0.
    skip_step = nsim_N // nsim
    # Take the chain count from the samples rather than assuming 4 chains.
    n_chains = ps['yy'].shape[1]

    data_ptrn = to_str_pattern(data['DD'])
    Oy = get_Oy(data_ptrn)

    lgscr_vals = np.empty((nsim, n_chains))
    for m_ind in tqdm(range(nsim)):
        m = skip_step*m_ind
        # compute Dy
        for cn in range(n_chains):
            pi = get_probs2(data, ps, m, cn)
            Ey = get_Ey(data_ptrn, pi, data['N'])
            Dy = get_Dy(Oy, Ey, data_ptrn)
            lgscr_vals[m_ind, cn] = sum(Dy.values())

    return lgscr_vals, Dy



In [57]:
nsim = 100
Ds = np.empty((3, 4))
for fold_index in range(3):
    fold_test_data = complete_data[fold_index]['test']
    fold_samples = model_posterior_samples[fold_index]
    lgscr_vals, Dy = get_lgscr(fold_test_data, fold_samples, nsim)
    # for each chain take the mean log_score across the MCMC iters
    Ds[fold_index] = lgscr_vals.mean(axis=0)

# for each chain, sum the log_scores across 3 folds
Ds.sum(axis=0)


HBox(children=(IntProgress(value=0), HTML(value='')))




HBox(children=(IntProgress(value=0), HTML(value='')))




HBox(children=(IntProgress(value=0), HTML(value='')))




array([120.79788392, 121.50146462, 119.9212418 , 121.45980445])

In [58]:
# Per-chain mean score for the first fold (row 0 of Ds).
Ds[0]

array([37.10621887, 37.41358304, 36.80826584, 37.95867891])

In [59]:
# Sum Ds over its first axis (one total per column), average those totals,
# and round the result to 4 decimals.
np.round(np.mean(np.sum(Ds, axis=0)),4) # take the mean sum log-score

120.9201

In [60]:
#m1 = 126
#m2 = 123
#m5 = 121
#m6 = 121