In [3]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from codebase.plot import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

In [32]:
# Simulate observations from a linear factor model with J items and K correlated factors.
np.random.seed(121)  # seed the global NumPy RNG so the draws below are reproducible

nsim_data = 500
J, K = 6, 2

# Item intercepts and the J x K loading matrix
# (rows 0-2 load only on column 0, rows 3-5 only on column 1).
alpha = np.array([1, 2, .3, -.8, 1, -1.4])
beta = np.array(
    [[1, 0],
     [.2, 0],
     [.6, 0],
     [0, 1],
     [0, .5],
     [0, .8]],
    dtype=float,
)

# Factor covariance: V = diag(sigma_z) @ V_corr @ diag(sigma_z), off-diagonal correlation rho.
sigma_z = np.array([1.2, .7])
rho = .5
V_corr = np.eye(K)
V_corr[0, 1] = V_corr[1, 0] = rho
V = np.diag(sigma_z) @ V_corr @ np.diag(sigma_z)

# Diagonal residual covariance with sigma_e**2 on the diagonal.
sigma_e = np.array([1, 1.2, .9, .8, 1, 1.4])
Sigma_e = np.diag(np.square(sigma_e))

# Marginal covariance of the observations, then nsim_data multivariate normal draws.
Omega = beta @ V @ beta.T + Sigma_e
yy = multivariate_normal.rvs(mean=alpha, cov=Omega, size=nsim_data)


In [33]:
# Bundle the simulation sizes, the generating parameter values and the observations.
data = {
    'N': nsim_data,
    'K': K,
    'J': J,

    'alpha': alpha,

    'beta': beta,
    'sigma_z': sigma_z,

    'V_corr': V_corr,
    'V': V,
    'Omega': Omega,

    'Sigma_e': Sigma_e,
    'sigma_e': sigma_e,

    # Store a copy: the in-place assignment below would otherwise also
    # overwrite the clean draws held in `yy`, because both names would
    # refer to the same array.
    'y': yy.copy(),
}

# Overwrite the last 50 rows of y with the constant 3 in every column.
data['y'][-50:] = 3


In [34]:
# Preview only the first rows instead of rendering the whole N-row matrix.
pd.DataFrame(data['y']).head(10)

Unnamed: 0,0,1,2,3,4,5
0,0.847462,1.339901,1.157198,-1.049360,1.901608,-0.932000
1,0.103027,1.558951,-2.512203,-1.756038,-0.121707,-2.986154
2,-0.622366,3.494829,-0.093137,0.751460,2.031040,-2.024757
3,0.958071,2.461128,-0.363061,-2.068263,1.155293,-0.505880
4,5.420059,4.477596,2.882532,1.945838,0.169548,-0.505785
5,1.227060,1.678254,-0.622812,-2.474361,1.657812,-2.052192
6,4.322130,1.754413,1.411386,1.257683,4.335446,-0.000792
7,0.560558,0.164566,-0.155826,-1.666053,0.621957,-1.895722
8,1.904568,3.117869,0.479808,-1.376979,0.970956,0.865345
9,2.039378,3.035403,0.460558,-1.154521,-0.427551,-2.879767


In [30]:
# Per-column summary statistics (count, mean, std, quantiles) of the observation matrix.
pd.DataFrame(data['y']).describe()

Unnamed: 0,0,1,2,3,4,5
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,1.248901,2.108041,0.565994,-0.376365,1.186823,-0.912246
std,1.560896,1.186209,1.362553,1.477317,1.168208,1.970585
min,-3.234283,-2.060644,-3.407843,-3.88542,-1.891538,-6.291473
25%,0.167336,1.283287,-0.396454,-1.356465,0.349128,-2.325549
50%,1.412551,2.235323,0.476792,-0.58143,1.096698,-1.064942
75%,2.391945,3.0,1.391724,0.173985,2.011489,0.149941
max,6.540248,5.788249,3.717173,3.0,4.335446,3.171218


In [4]:
# Build the data dict from a DataFrame `df` instead of the simulated values.
# NOTE(review): this rebinds `data`, so any dict built under that name earlier is discarded.
# NOTE(review): `df` is not defined in any cell visible here — presumably it is loaded in a
# cell that was removed or lives elsewhere; confirm before relying on Restart & Run All.
data = dict()
data['N'] = df.shape[0]  # number of rows of df
data['K'] = 5  # hard-coded; used below as the second dimension of the J x K beta matrix
data['J'] = df.shape[1]  # number of columns of df
data['y'] = df.values  # observations as a plain array

In [6]:
# Sampler settings: chain count, warmup length and number of retained draws.
num_chains, num_warmup, num_samples = 1, 1000, 1200
# Total iterations is warmup plus retained draws.
num_iter = num_warmup + num_samples

In [7]:
# Directory from which the saved objects of this run are read.
log_dir = "./log/muthem_women/"
# Only `ps` is loaded; the loads of `sm` and `fit` are left disabled.
# sm = load_obj('sm', log_dir)
# fit = load_obj('fit', log_dir)
# NOTE(review): load_obj comes from codebase.file_utils; if it unpickles the file,
# it must only be pointed at trusted paths — confirm.
ps = load_obj('ps', log_dir)



In [8]:
# Reference loading matrix (J x K): each column gets one block of three non-zero rows.
# The last block uses an open-ended slice, so it only fits when exactly three rows remain.
mb = np.zeros((data['J'], data['K']))
mb[0:3, 0] = [0.772, 0.575, 0.503]
mb[3:6, 1] = [0.704, 0.657, 0.548]
mb[6:9, 2] = [0.685, 0.702, 0.622]
mb[9:12, 3] = [0.791, 0.736, 0.695]
mb[12:, 4] = [0.780, 0.738, 0.660]


In [9]:
# Reference values used as `true_value` in the beta trace plots.
muthen_results = {'beta': mb}


In [10]:
# Posterior mean of beta: average over the first (draw) axis of the samples.
our_results = {'beta': np.mean(ps['beta'], axis=0)}

In [11]:
%%opts Layout [fig_size=200]
# Build one trace plot per beta(j, k) entry, marking the reference value from
# muthen_results on each plot.
# NOTE(review): `hv` and `plot_trace` are not imported by name — presumably they come
# from `from codebase.plot import *`; confirm.
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['beta'][:,j,k],
             true_value=muthen_results['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
# Collect all plots in a single layout; scaling/normalization across plots is disabled.
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # open question: use the same y-range for all plots?

# Arrange the layout in two columns; as the last expression this is what the cell displays.
layout.cols(2)


In [15]:
def get_residuals(ps_u, by_axis, absval = True, sort=False):
    """
    Aggregate standardized posterior residuals along one axis.

    Each entry of the residual matrix is standardized as
    (posterior mean) / (posterior standard deviation), both taken over the
    draws (axis 0 of `ps_u`), and the standardized entries are then summed
    along `by_axis`.

    params
    ps_u     posterior samples of u; axis 0 indexes the draws
             (presumably shaped (draws, subjects, items) -- confirm against the model)
    by_axis  0=residuals per item, 1=residuals per subject
    absval   if True sum absolute standardized residuals, otherwise signed ones
    sort     if True order the rows by decreasing residual

    returns
    DataFrame with columns ['index', 'residual'].

    Entries whose posterior standard deviation is zero produce inf/nan,
    following NumPy's division semantics.
    """
    ps_u = np.asarray(ps_u)
    mean_u = np.mean(ps_u, axis=0)  # mean posterior residual matrix
    # Population std (ddof=0) computed from deviations about the mean. The
    # algebraically equal sqrt(E[x^2] - mean^2) cancels catastrophically when
    # the spread is small relative to the mean and can return 0 or NaN.
    std_u = np.std(ps_u, axis=0)

    standardized = mean_u / std_u
    if absval:
        standardized = np.abs(standardized)

    res = pd.DataFrame(np.sum(standardized, axis=by_axis)).reset_index()
    res.columns = ['index', 'residual']

    if sort:
        # Non-mutating sort; the original row labels are kept.
        res = res.sort_values('residual', ascending=False)
    return res

# First rows of the sorted result: by_axis=1, absolute values, largest residual first.
get_residuals(ps['uu'], 1, True, True).head()


Unnamed: 0,index,residual
414,414,46.868892
300,300,46.457694
364,364,40.67129
386,386,40.036613
308,308,38.051297


In [16]:
# %%opts Bars {+axiswise} [width=1000, height=300, ]
# Summed absolute standardized residuals along axis 1, largest first.
res = get_residuals(ps['uu'], by_axis=1, absval=True, sort=True)

# Bar chart of the 20 largest residuals.
(
    hv.Bars(res.head(20), hv.Dimension('index'), 'residual', label='Top 20 Residuals')
    .options(color='blue', xrotation=90)
    .options(fig_inches=8, aspect=3)
)


In [17]:
# %%output info=True
# One trace plot for every off-diagonal entry (j != k) of V_corr.
plots = []
for j in range(data['K']):
    for k in range(data['K']):
        if j == k:
            continue  # diagonal entries are skipped
        plots.append(
            plot_trace(
                ps['V_corr'][:, j, k],
                title='Posterior distribution for V_corr(%s,%s)' % (j, k),
            ).options(fig_inches=10, aspect=3)
        )

# Combine the plots into one layout and show it in two columns.
layout = hv.Layout(plots).options(
    show_title=True,
    vspace=.5,
    absolute_scaling=False,
    fig_size=100,
)
layout.cols(2)