In [1]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from codebase.plot import * 
from codebase.file_utils import save_obj, load_obj
from codebase.post_process import * 

%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
# Read the raw responses, treat the sentinel -9 as missing, keep only complete
# rows, and store the remaining answers as integers.
df = (
    pd.read_csv("../dat/muthen_women.csv")
    .replace(-9, np.nan)
    .astype(float)
    .dropna()
    .astype(int)
)
df.head()

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15
0,4,3,3,4,7,7,3,2,4,7,1,5,3,4,6
1,5,2,5,3,4,5,2,4,4,4,4,5,3,4,5
2,4,4,4,6,4,5,6,4,5,4,3,4,4,4,5
3,4,5,7,1,3,7,5,3,1,5,1,3,1,2,4
4,5,6,6,7,7,5,4,4,2,6,2,4,6,7,6


In [3]:
# Model inputs: N rows and J columns are taken from `df`, K is fixed at 5.
data = dict()
data['N'] = df.shape[0]
data['K'] = 5
data['J'] = df.shape[1]

# Take an independent, writable copy of the responses. `df.values` may be a
# view on the frame's own buffer (so the overwrite below would silently change
# `df`) or, with pandas Copy-on-Write, a read-only array (so it would raise).
data['y'] = df.values.copy()

# Overwrite the last 10% of rows with the constant 7 in every column.
# NOTE(review): 7 looks like the top of the response scale - confirm.
n_overwritten = int(data['N'] * .1)
if n_overwritten > 0:
    # Explicit start index: a `-0:` slice would select (and overwrite) every row.
    data['y'][data['N'] - n_overwritten:] = 7

In [4]:
# Sampler settings; the total iteration count is warm-up plus retained samples.
num_chains, num_warmup, num_samples = 1, 1000, 1200
num_iter = num_warmup + num_samples

In [5]:
# Directory passed to `load_obj` for the stored outputs of this experiment.
log_dir = "./log/muthen_women_exp2/"
# Two further stored objects ('sm' and 'fit') can be loaded the same way; they
# are left disabled, and the visible cells below only read `ps`.
# sm = load_obj('sm', log_dir)
# fit = load_obj('fit', log_dir)
# `ps` is indexed by parameter name further down (ps['beta'], ps['uu']).
# NOTE(review): if load_obj unpickles, only load files from a trusted source.
ps = load_obj('ps', log_dir)


In [6]:
# Show one stored beta matrix (entry 180 along the first axis), rounded to 2 decimals.
draw_idx = 180
np.round(ps['beta'][draw_idx], decimals=2)

array([[ 1.  , -0.05,  0.  ,  0.07,  0.1 ],
       [ 0.8 ,  0.01,  0.13,  0.02,  0.18],
       [ 1.32, -0.01,  0.03, -0.05,  0.06],
       [ 0.01,  1.  ,  0.02, -0.17,  0.22],
       [-0.23,  1.34,  0.04, -0.05, -0.12],
       [-0.09,  1.27, -0.1 , -0.09,  0.04],
       [-0.04,  0.12,  1.  , -0.04, -0.08],
       [ 0.02,  0.12,  0.7 , -0.05,  0.08],
       [ 0.11, -0.2 ,  0.87,  0.08,  0.06],
       [-0.26,  0.21,  0.11,  1.  , -0.08],
       [-0.01, -0.  ,  0.02,  1.08,  0.05],
       [ 0.03, -0.03,  0.09,  0.96,  0.04],
       [-0.12,  0.05,  0.04, -0.08,  1.  ],
       [ 0.02, -0.17,  0.02,  0.1 ,  0.9 ],
       [-0.02,  0.09,  0.1 , -0.04,  0.7 ]])

In [7]:
# Reference loading matrix (J x K): every column gets three consecutive
# non-zero rows, all other entries stay zero.
# (row slice, column, values) for each non-zero block.
loading_blocks = (
    (slice(None, 3), 0, [0.772, 0.575, 0.503 ]),
    (slice(3, 6), 1, [0.704, 0.657, 0.548 ]),
    (slice(6, 9), 2, [0.685, 0.702, 0.622]),
    (slice(9, 12), 3, [0.791, 0.736, 0.695]),
    (slice(12, None), 4, [0.780, 0.738, 0.660]),
)

mb = np.zeros((data['J'], data['K']))
for row_slice, col, values in loading_blocks:
    mb[row_slice, col] = np.array(values)


In [8]:
# Reference estimates, keyed by parameter name.
muthen_results = {'beta': mb}


In [9]:
# Our estimate of beta: the average of the stored draws over their first axis.
our_results = {'beta': np.average(ps['beta'], axis=0)}


In [10]:
# %%opts Layout [fig_size=200]
# plots = []
# for j in range(data['J']):
#     for k in range(data['K']):
#         plots.append(plot_trace(ps['beta'][:,j,k],
#              true_value=muthen_results['beta'][j,k],
#              title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
#                      options(fig_inches=8, aspect=3))
# layout = hv.Layout(plots).options(show_title = True,
#                                   vspace = .3,
#                                   absolute_scaling=False,
#                                   normalize=False) # use same y-range for all plots?

# layout.cols(2)


In [11]:
def get_residuals(ps_u, by_axis, absval = True, sort=False):
    """
    Aggregate standardized posterior residuals.

    The posterior mean of every entry of u is divided by its posterior
    standard deviation (both taken over axis 0 of `ps_u`), and the resulting
    matrix is summed along `by_axis`.

    params
    ps_u     posterior samples of u (array-like); axis 0 indexes the draws
    by_axis  axis of the standardized matrix that is summed out:
             0=residuals per item, 1=residuals per subject
             (assuming draws are laid out as draw x subject x item)
    absval   if True, sum absolute standardized residuals; otherwise sum the
             signed values
    sort     if True, order rows by decreasing residual; the original index
             labels are kept

    returns
    DataFrame with columns 'subj_id' (position along the remaining axis; the
    same column name is used for both values of by_axis) and 'residual'.

    Entries whose posterior standard deviation is zero give inf/nan terms.
    """
    ps_u = np.asarray(ps_u, dtype=float)
    mean_u = ps_u.mean(axis=0)  # mean posterior residual matrix
    # np.std works on deviations from the mean. The shortcut
    # sqrt(E[u^2] - E[u]^2) cancels catastrophically when the spread is small
    # relative to the mean and can yield a wrong (or NaN) standard deviation.
    std_u = ps_u.std(axis=0)  # std of posterior residual matrix

    standardized = mean_u / std_u
    if absval:
        standardized = np.abs(standardized)

    res = pd.DataFrame(np.sum(standardized, axis=by_axis)).reset_index()
    res.columns = ['subj_id', 'residual']

    if sort:
        res = res.sort_values('residual', ascending=False)
    return res

# Largest signed residual sums (summed along axis 1), sorted in decreasing order.
get_residuals(ps['uu'], by_axis=1, absval=False, sort=True).head()


Unnamed: 0,subj_id,residual
219,219,1.503446
431,431,1.405511
488,488,1.337766
460,460,1.290733
14,14,1.285837


In [12]:
# %%opts Bars {+axiswise} [width=1000, height=300, ] 
# Absolute residual sums, largest first.
res = get_residuals(ps['uu'], by_axis=1, absval=True, sort=True)

# Bar chart of the first 20 rows of the sorted table.
top_rows = res.iloc[:20]
top_bars = hv.Bars(top_rows, hv.Dimension('subj_id'), 'residual',
                   label='Top 20 Residuals')
top_bars.options(color='blue', xrotation=90).options(fig_inches=8, aspect=3)


In [13]:
# N minus the 10% tail that was overwritten when `data['y']` was built.
data['N'] - int(.1 * data['N'])

610

In [14]:
# Signed residual sums in original row order.
res = get_residuals(ps['uu'], by_axis=1, absval=False)

# Rows in the overwritten tail (last 10% of ids) are tagged red, the rest blue.
first_red_id = data['N'] - int(data['N'] * .1)
res['color'] = 'blue'
red_index = res.index[res.subj_id >= first_red_id]
res.loc[red_index, 'color'] = 'red'

# Rank by residual, largest first, and renumber rows so the index is the rank.
res = res.sort_values('residual', ascending=False).reset_index(drop=True)
res

Unnamed: 0,subj_id,residual,color
0,219,1.503446,blue
1,431,1.405511,blue
2,488,1.337766,blue
3,460,1.290733,blue
4,14,1.285837,blue
5,39,1.274952,blue
6,78,1.212298,blue
7,437,1.195110,blue
8,27,1.192709,blue
9,346,1.169195,blue


In [15]:
%%opts Overlay [fig_size=300]
plots = []
for color in ['blue', 'red']:
    tmp = res[res.color==color]
    plots.append(hv.Scatter((tmp.index,tmp.residual),
        ).\
                 options(fig_inches=4, aspect=2.5, s=10, color=color))
layout = hv.Overlay(plots).options(show_title = True,
                                  normalize=False) # use same y-range for all plots?

layout