In [1]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from codebase.plot import * 
from codebase.file_utils import save_obj, load_obj
from codebase.post_process import * 

%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
# Read the raw responses, recode the value -9 to NaN so that any row
# containing it is dropped, then cast the remaining complete rows to int.
# The original row labels are kept (no index reset).
df = (
    pd.read_csv("../dat/muthen_men.csv")
    .replace(-9, np.nan)
    .astype(float)
    .dropna()
    .astype(int)
)
df.head()

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15
0,4,4,6,7,7,6,7,7,7,7,4,7,7,3,7
1,3,4,6,6,5,5,3,5,5,6,7,5,5,6,6
2,2,3,4,5,5,3,3,3,2,6,3,4,5,6,4
3,7,6,6,5,5,5,5,5,3,3,3,3,5,6,6
4,6,6,5,5,6,6,4,4,4,7,5,6,3,4,4


In [3]:
# Bundle the cleaned responses with their dimensions.
data = {
    'N': df.shape[0],   # number of rows of df
    'K': 5,             # hard-coded here; used below as the column count of the loading matrix
    'J': df.shape[1],   # number of columns of df
    'y': df.values,     # response matrix as a NumPy array
}

In [4]:
# Sampler settings: the total iteration count is warm-up plus retained draws.
num_chains = 1
num_warmup = 1000
num_samples = 1200
num_iter = num_warmup + num_samples

In [5]:
# Directory from which the saved object for this run is read.
log_dir = "./log/muthen_men/"
# Load the object stored under the name 'ps'; the cells below index it like a
# dict of arrays (ps['beta'], ps['Omega_beta'], ps['uu']) whose first axis is
# reduced over when summarising.
# NOTE(review): load_obj is a project helper — if it unpickles, only point it
# at trusted files.
ps = load_obj('ps', log_dir)



In [6]:
# Reference loading matrix (J x K): every column is non-zero on one run of
# three consecutive rows and zero elsewhere.
# NOTE(review): values are presumably the estimates reported for the Muthen
# analysis (they are stored as muthen_results['beta'] below) — confirm source.
mb = np.zeros((data['J'], data['K']))
mb[0:3, 0] = [0.842, 0.394, 0.479]
mb[3:6, 1] = [0.683, 0.078, 0.579]
mb[6:9, 2] = [0.748, 0.754, 0.575]
mb[9:12, 3] = [0.801, 0.708, 0.613]
mb[12:, 4] = [0.732, 0.672, 0.651]  # open-ended slice: only fits when J == 15


In [7]:
# Reference results, keyed by parameter name.
muthen_results = {'beta': mb}


In [8]:
# Our estimate of beta: the average of ps['beta'] over its first axis.
our_results = {'beta': np.average(ps['beta'], axis=0)}

In [15]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['beta'][:,j,k],
             true_value=muthen_results['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [16]:
# Autocorrelation of the Omega_beta draws, restricted to the top-left 2x2
# block of entries (widen the two ranges to cover more entries).
plots = [
    plot_trace(
        acf(ps['Omega_beta'][:, i, j]),
        title='Autocorrelation of Omega(%s,%s)' % (i, j),
    ).options(fig_inches=8, aspect=3)
    for i in range(2)
    for j in range(2)
]
layout = hv.Layout(plots).options(
    show_title=True,
    vspace=.3,
    absolute_scaling=False,
    normalize=False,  # use same y-range for all plots?
)
layout.cols(2)

In [9]:
def get_residuals(ps_u, by_axis, absval = True, sort=False):
    """
    Summarise standardized posterior residuals along one axis.

    Every entry of the residual matrix is standardized as its posterior mean
    divided by its posterior standard deviation (both taken over axis 0 of
    `ps_u`), and the standardized values are then summed along `by_axis`.

    params
    ps_u    posterior samples of u; axis 0 indexes the draws
    by_axis axis of the standardized matrix that is summed out:
            0=residuals per item, 1=residuals per subject
    absval  if True, sum absolute standardized residuals so that opposite
            signs cannot cancel; if False, sum the signed values
    sort    if True, order the rows by decreasing residual

    returns
    DataFrame with columns 'index' (position along the kept axis) and
    'residual' (the summed standardized residual).
    """
    mean_u = np.mean(ps_u, axis=0)  # mean posterior residual matrix
    # np.std subtracts the mean before squaring. The one-pass form
    # sqrt(E[x^2] - E[x]^2) loses precision through cancellation and can even
    # produce a negative radicand (NaN) when the spread is small relative to
    # the mean.
    std_u = np.std(ps_u, axis=0)  # std of posterior residual matrix

    standardized = mean_u / std_u
    if absval:
        standardized = np.abs(standardized)

    res = pd.DataFrame(np.sum(standardized, axis=by_axis)).reset_index()
    res.columns = ['index', 'residual']

    if sort:
        res = res.sort_values('residual', ascending=False)
    return res

# Summed absolute standardized residual per row of ps['uu'], largest first.
get_residuals(ps['uu'], by_axis=1, absval=True, sort=True)


Unnamed: 0,index,residual
276,276,45.611230
436,436,45.101763
55,55,43.988874
351,351,43.818801
248,248,38.258317
524,524,35.740988
484,484,35.668521
163,163,34.199850
269,269,32.839398
559,559,30.037078


In [24]:
# Row-level outlier score: average the standardized residuals (posterior mean
# over posterior std) across axis 1, then z-score those averages across rows.
ps_u = ps['uu']
mean_u = np.mean(ps_u, axis=0)  # mean posterior residual matrix
# np.std subtracts the mean before squaring; sqrt(E[x^2] - E[x]^2) can lose
# precision through cancellation and yield NaN from a negative radicand.
std_u = np.std(ps_u, axis=0)  # std of posterior residual matrix
r = np.mean(mean_u / std_u, axis=1)
z = (r - np.mean(r)) / np.std(r)
z.shape

(578,)

In [33]:
# Rank rows by |z| and chart the 20 largest.
res = pd.DataFrame(np.abs(z)).reset_index()
res.columns = ['subj_id', 'residual']
res = res.sort_values('residual', ascending=False)
# The key dimension must name an existing column: the frame was relabelled to
# 'subj_id' above, so 'index' is no longer available.
hv.Bars(res[:20], hv.Dimension('subj_id'), 'residual',
        label='Top 20 Residuals').options(color='blue', xrotation=90).options(fig_inches=8, aspect=3)


In [10]:
# %%opts Bars {+axiswise} [width=1000, height=300, ] 
res = get_residuals(ps['uu'], 1, True, True)

hv.Bars(res[:20], hv.Dimension('index'), 'residual',\
        label='Top 20 Residuals').options(color='blue', xrotation=90).options(fig_inches=8, aspect=3)


In [16]:
# Raw responses for the four rows with the largest summed residuals in the
# ranking above. The ranking's 'index' is positional, hence iloc; the row
# labels shown in the output differ because rows were dropped without
# resetting the index.
df.iloc[[276, 55, 436, 351]]

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15
284,7,1,1,1,7,1,1,1,7,1,1,7,1,1,1
55,7,1,1,3,7,1,1,1,7,1,1,6,1,1,2
445,7,1,1,1,7,1,1,1,7,1,1,7,1,1,1
359,7,1,1,1,7,1,1,1,7,1,1,7,1,1,1


In [15]:
# Raw responses for the row at position 180.
# NOTE(review): the reason for picking this row is not shown in the notebook —
# presumably a comparison case; confirm.
df.iloc[[180]]

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15
186,6,5,6,6,5,6,6,3,4,5,3,5,7,6,6
