In [1]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from codebase.plot import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
# Run directory that is handed to `load_obj` for every object read below.
log_dir = "./log/20190913_235659_model2_std/"

# Read the 'stan_data' and 'ps' objects of that run (in this order).
# The 'fit' object of the run is not loaded.
data, ps = (load_obj(obj_name, log_dir) for obj_name in ('stan_data', 'ps'))



In [3]:
# Show the full repr of the loaded `data` dictionary (scalars and arrays).
data

{'N': 677,
 'K': 5,
 'J': 15,
 'yy': array([[-1.64072605, -1.52790986, -2.05848956, ..., -0.68036567,
         -0.30265017,  0.83004284],
        [-0.85014133, -2.20486992, -0.52115097, ..., -0.68036567,
         -0.30265017,  0.142235  ],
        [-1.64072605, -0.8509498 , -1.28982027, ..., -0.00696281,
         -0.30265017,  0.142235  ],
        ...,
        [ 0.73102812, -0.17398973,  1.01618762, ..., -0.00696281,
         -0.90000926, -0.54557283],
        [ 0.73102812,  1.17993039,  0.24751833, ..., -0.00696281,
          1.4894271 , -0.54557283],
        [-0.85014133, -1.52790986,  0.24751833, ..., -0.68036567,
          0.29470892,  0.142235  ]]),
 'sigma_prior': array([1.21365356, 1.31119056, 1.88979237, 1.49014482, 1.247293  ,
        2.10853325, 1.51831912, 1.48596286, 1.24257448, 1.50253778,
        1.46432958, 1.44212651, 1.60626715, 1.42177222, 1.52189356])}

In [4]:
# Sampler settings, kept together so they can be tuned in one place.
num_chains, num_samples, num_warmup = 1, 1200, 1000

# Total iteration count is the sum of the warmup and sample counts.
num_iter = num_warmup + num_samples

In [5]:
# Reference matrix of shape (J, K): all zeros except for one group of
# consecutive rows per column, filled with the hard-coded values below.
mb = np.zeros((data['J'], data['K']))
mb[:3, 0] = [0.772, 0.575, 0.503]
mb[3:6, 1] = [0.704, 0.657, 0.548]
mb[6:9, 2] = [0.685, 0.702, 0.622]
mb[9:12, 3] = [0.791, 0.736, 0.695]
# Open-ended slice: the three values must fill every row from 12 onwards.
mb[12:, 4] = [0.780, 0.738, 0.660]


In [6]:
# Collect the reference values in a dict keyed by parameter name.
muthen_results = {'beta': mb}


In [7]:
%%opts Layout [fig_size=200]
# One trace plot per (j, k) entry of `beta`, each drawn together with the
# corresponding entry of the reference matrix in `muthen_results`.
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        trace = plot_trace(
            ps['beta'][:, j, k],
            true_value=muthen_results['beta'][j, k],
            title='Posterior distribution for beta(%s,%s)' % (j, k),
        )
        plots.append(trace.options(fig_inches=8, aspect=3))

layout = hv.Layout(plots).options(
    show_title=True,
    vspace=.3,
    absolute_scaling=False,
    normalize=False,  # use same y-range for all plots?
)

# Arrange the plots in two columns; this expression is the cell's output.
layout.cols(2)


In [18]:
def get_residuals(ps_u, by_axis, absval = True, sort=False):
    """
    Summarise standardised posterior residuals along one axis.

    Every entry of the posterior mean of `ps_u` is divided by its posterior
    standard deviation, and the (absolute) ratios are summed over `by_axis`.
    Entries whose posterior standard deviation is zero produce inf/nan.

    params
    ps_u: posterior samples of u, with the MCMC draws along axis 0
    by_axis: axis of the mean/std matrix that is summed over
        (0=residuals per item, 1=residuals per subject; this assumes the
        draws are stored as (draw, subject, item) -- TODO confirm)
    absval: if True sum absolute standardised residuals, else signed ones
    sort: if True order the rows by decreasing residual

    returns
    DataFrame with the columns 'index' and 'residual'
    """
    mean_u = np.mean(ps_u, axis=0)  # mean posterior residual matrix
    # np.std centres the draws before squaring. The previous
    # sqrt(E[x^2] - mean^2) form loses all precision (or goes negative -> NaN)
    # when the mean is large relative to the spread.
    std_u = np.std(ps_u, axis=0)  # std of posterior residual matrix

    standardised = mean_u / std_u
    if absval:
        standardised = np.abs(standardised)

    res = pd.DataFrame(np.sum(standardised, axis=by_axis)).reset_index()
    res.columns = ['index', 'residual']

    if sort:
        res = res.sort_values('residual', ascending=False)
    return res

# First rows of the sorted absolute residual sums (summed over axis 1).
get_residuals(ps['uu'], by_axis=1, absval=True, sort=True).head()


Unnamed: 0,index,residual
300,300,44.046158
414,414,42.996121
364,364,39.619497
386,386,37.751126
308,308,36.488856


In [19]:
# %%opts Bars {+axiswise} [width=1000, height=300, ] 
# Sorted absolute residual sums; the first 20 rows are the largest ones.
res = get_residuals(ps['uu'], by_axis=1, absval=True, sort=True)

top20_bars = hv.Bars(
    res[:20],
    hv.Dimension('index'),
    'residual',
    label='Top 20 Residuals',
)
top20_bars.options(color='blue', xrotation=90).options(fig_inches=8, aspect=3)


In [10]:
# %%output info=True 
# Trace plots for every off-diagonal (j, k) entry of `Phi_cov`.
plots = []
for j in range(data['K']):
    for k in range(data['K']):
        if j == k:
            continue  # diagonal entries are not plotted
        trace = plot_trace(
            ps['Phi_cov'][:, j, k],
            title='Posterior distribution for Phi(%s,%s)' % (j, k),
        )
        plots.append(trace.options(fig_inches=10, aspect=3))

layout = hv.Layout(plots).options(
    show_title=True,
    vspace=.5,
    absolute_scaling=False,
    fig_size=100,
)
# Arrange the plots in two columns; this expression is the cell's output.
layout.cols(2)

In [12]:

# Run directories of the two models compared in the cells below.
log_dir0 = "./log/20190908_141720_women-model0/"
log_dir2 = "./log/20190908_141635_women-model2/"

# Only the 'ps' object of each run is loaded (model0 first, then model2);
# the 'data' objects of these runs are not loaded here.
ps0, ps2 = (load_obj('ps', run_dir) for run_dir in (log_dir0, log_dir2))



In [13]:
from numpy.linalg import det, inv
def ff2(yy, model_mu, model_Sigma, p=15, q=5):
    """
    Discrepancy between the sample covariance of `yy` and `model_Sigma`.

    Computes
        (n - 1) * (log|Sigma| + tr(S Sigma^-1) - log|S| - p)
    where S = np.cov(yy, rowvar=False) and n = yy.shape[0].

    params
    yy: 2-d array with observations in rows and variables in columns
    model_mu: accepted for interface compatibility; not used in the formula
    model_Sigma: model-implied covariance matrix
    p: constant subtracted inside the bracket; it is NOT derived from `yy`
        (presumably the number of columns of `yy` -- pass it explicitly when
        `yy` does not have 15 columns)
    q: accepted for interface compatibility; not used

    returns
    the scalar discrepancy; nan if either matrix has a negative determinant
    """
    def _logdet(mat):
        # slogdet works on the log scale, so it neither underflows to
        # log(0) = -inf nor overflows the way log(det(mat)) does.
        sign, logabsdet = np.linalg.slogdet(mat)
        # Negative determinant: the log is undefined, as with np.log(det).
        return logabsdet if sign >= 0 else np.nan

    sample_S = np.cov(yy, rowvar=False)
    n_data = yy.shape[0]
    # tr(S Sigma^-1) == tr(Sigma^-1 S); solve avoids forming the inverse.
    trace_term = np.trace(np.linalg.solve(model_Sigma, sample_S))
    return (n_data - 1) * (_logdet(model_Sigma) + trace_term - _logdet(sample_S) - p)

In [14]:
def compute_D(post_samples, mcmc_iter, pred=True, y=None, random_state=None):
    """
    Evaluate the `ff2` discrepancy for one posterior draw.

    params
    post_samples: mapping with an 'alpha' entry and a covariance entry, both
        indexable by draw. The covariance key is chosen with the precedence
        'Sigma' > 'Marg_cov2' > 'Marg_cov'.
    mcmc_iter: index of the posterior draw to use
    pred: if truthy, the discrepancy is computed on data simulated from a
        multivariate normal with that draw's mean and covariance (as many
        rows as the observed data); otherwise on the observed data itself
    y: observed data matrix. When None, the notebook-level `data` dict is
        used, reading key 'y' if present and 'yy' otherwise.
        NOTE(review): `data` must belong to the same run as `post_samples`;
        this function cannot check that -- confirm at the call site.
    random_state: forwarded to `multivariate_normal.rvs`; pass a seed or
        generator for reproducible simulated data (None keeps the previous
        unseeded behaviour)

    returns
    the value returned by `ff2` for the chosen data and draw
    """
    keys = post_samples.keys()
    if 'Sigma' in keys:
        marg_cov = 'Sigma'
    elif 'Marg_cov2' in keys:
        marg_cov = 'Marg_cov2'
    else:
        marg_cov = 'Marg_cov'

    if y is None:
        # The data dict displayed in this notebook stores the observations
        # under 'yy'; a bare data['y'] lookup raises KeyError on it.
        y = data['y'] if 'y' in data else data['yy']

    draw_mean = post_samples['alpha'][mcmc_iter]
    draw_cov = post_samples[marg_cov][mcmc_iter]

    if pred:
        y_pred = multivariate_normal.rvs(mean=draw_mean,
                                         cov=draw_cov,
                                         size=y.shape[0],
                                         random_state=random_state)
        return ff2(y_pred, draw_mean, draw_cov)

    return ff2(y, draw_mean, draw_cov)



In [16]:
# For every draw in ps0: column 0 holds compute_D with pred=False,
# column 1 holds compute_D with pred=True.
mcmc_length = ps0['alpha'].shape[0]
Ds = np.empty((mcmc_length, 2))
for mcmc_iter in range(mcmc_length):
    observed_D = compute_D(ps0, mcmc_iter, pred=False)
    replicated_D = compute_D(ps0, mcmc_iter, pred=True)
    Ds[mcmc_iter] = observed_D, replicated_D

# Fraction of draws whose pred=True value exceeds the pred=False value.
exceeds = Ds[:, 1] > Ds[:, 0]
print(np.sum(exceeds) / mcmc_length)


0.141


In [17]:
# Take the number of draws from ps2 itself. Reusing the count computed from
# ps0 in another cell would over-run or silently truncate the loop whenever
# the two runs kept a different number of draws.
n_draws_ps2 = ps2['alpha'].shape[0]

# For every draw in ps2: column 0 holds compute_D with pred=False,
# column 1 holds compute_D with pred=True.
Ds = np.empty((n_draws_ps2, 2))
for mcmc_iter in range(n_draws_ps2):
    Ds[mcmc_iter, 0] = compute_D(ps2, mcmc_iter, pred=False)
    Ds[mcmc_iter, 1] = compute_D(ps2, mcmc_iter, pred=True)


# Fraction of draws whose pred=True value exceeds the pred=False value.
print(np.sum(Ds[:,0] < Ds[:,1]) / n_draws_ps2)


0.0
