In [1]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from codebase.plot import * 
from codebase.file_utils import save_obj, load_obj
from codebase.post_process import * 

%matplotlib inline

%load_ext autoreload
%autoreload 2

  from pandas.core import datetools


In [2]:
# Read the response data (path is relative to the notebook) and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../dat/muthen_women.csv")
df.head(5)

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15
0,4,3,3,4,7,7,3,2,4,7,1,5,3,4,6
1,5,2,5,3,4,5,2,4,4,4,4,5,3,4,5
2,4,4,4,6,4,5,6,4,5,4,3,4,4,4,5
3,4,5,7,1,3,7,5,3,1,5,1,3,1,2,4
4,5,6,6,7,7,5,4,4,2,6,2,4,6,7,6


In [3]:
# Inputs handed to the model: N and J are the row and column counts of df,
# y is the raw value matrix, and K is fixed at 5 (it is used below as the
# number of columns of the loading matrix).
data = dict(
    N=df.shape[0],
    K=5,
    J=df.shape[1],
    y=df.values,
)

In [6]:
# Sampler settings: one chain, 1000 warm-up draws followed by 1000 kept draws.
num_chains, num_samples, num_warmup = 1, 1000, 1000
# Total iterations per chain = warm-up + kept samples.
num_iter = num_warmup + num_samples

In [4]:
# Directory holding the objects saved by an earlier run.
log_dir = "./log/20190425_231717_CFA_NN_marg_muthen_women/"
# Load the model first, then the fit (same order as saved names 'sm', 'fit').
# NOTE(review): presumably load_obj unpickles; PyStan fits typically need their
# model loaded beforehand, so keep this order - confirm against load_obj.
sm, fit = [load_obj(obj_name, log_dir) for obj_name in ('sm', 'fit')]



In [7]:
# Names of the quantities to pull out of the fit.
param_names = ['Omega_beta', 'beta', 'V_corr', 'V' , 'uu', 'alpha', 'sigma', 'sigma_z']

# Unpermuted extraction; returns a dictionary of arrays keyed by parameter name.
stan_samples = fit.extract(pars=param_names, permuted=False)

if num_chains != 1:
    # Several chains: keep the extracted arrays untouched.
    ps = stan_samples
else:
    # Single chain: np.squeeze drops every length-1 axis of each array.
    ps = {name: np.squeeze(stan_samples[name]) for name in param_names}

In [8]:
# Reference loading matrix (J x K): each column holds three non-zero loadings
# on consecutive rows; every other entry stays at zero.
reference_blocks = [
    (slice(None, 3), [0.772, 0.575, 0.503 ]),
    (slice(3, 6), [0.704, 0.657, 0.548 ]),
    (slice(6, 9), [0.685, 0.702, 0.622]),
    (slice(9, 12), [0.791, 0.736, 0.695]),
    (slice(12, None), [0.780, 0.738, 0.660]),
]

mb = np.zeros((data['J'], data['K']))
for factor, (rows, loadings) in enumerate(reference_blocks):
    mb[rows, factor] = np.array(loadings)


In [9]:
# Reference values used as `true_value` in the trace plots further down.
muthen_results = {'beta': mb}


In [10]:
# Point estimate from this fit: average of the beta draws over the first (draw) axis.
our_results = {'beta': np.average(ps['beta'], axis=0)}

In [11]:
# Inspect a single draw (position 100 along the first axis) of beta.
ps['beta'][100, :, :]

array([[ 1.        ,  0.01403815, -0.07181508, -0.13030168,  0.07816128],
       [ 0.44033758, -0.01684166,  0.00781829,  0.02069125,  0.03868355],
       [ 1.05286765,  0.06892391,  0.0707616 ,  0.08484166,  0.08047892],
       [ 0.03636747,  1.        ,  0.11342031, -0.0746454 ,  0.15941153],
       [-0.03926159,  1.26986649, -0.11326213, -0.23172321, -0.13071592],
       [-0.03920422,  1.69881387, -0.0391429 , -0.04315386,  0.03767175],
       [-0.21389102, -0.00393688,  1.        ,  0.13188621,  0.07378138],
       [ 0.11208013,  0.13400863,  0.83411605, -0.03487476,  0.04948643],
       [-0.00894383, -0.11574149,  0.87952915, -0.23124923, -0.01084516],
       [-0.05646555,  0.10801942,  0.03834092,  1.        ,  0.06776832],
       [-0.0373146 , -0.02297198, -0.12911195,  0.85266927, -0.04920121],
       [ 0.04325194, -0.08085073,  0.01506691,  0.48945662, -0.10901462],
       [-0.04269204, -0.08415683,  0.02917592, -0.1104946 ,  1.        ],
       [-0.10608376, -0.03796795, -0.0

In [14]:
%%opts Curve {+axiswise} [width=600, height=200, ] 
# One trace plot per beta(i, j) entry, with the reference loading overlaid as
# the "true" value; the plots are stacked in a single column.
plots = [
    plot_trace(ps['beta'][:, i, j],
               true_value=muthen_results['beta'][i, j],
               title='Posterior distribution for beta(%s,%s)' % (i, j))
    for i in range(data['J'])
    for j in range(data['K'])
]
layout = hv.Layout(plots)
layout.cols(1)

In [13]:
%%opts Curve {+axiswise} [width=600, height=200, ] 
# One trace plot per sigma(i); no reference value is available for these.
plots = [
    plot_trace(ps['sigma'][:, i],
               true_value=None,
               title='Posterior distribution for sigma(%s)' % (i))
    for i in range(data['J'])
]
layout = hv.Layout(plots)
layout.cols(1)

In [16]:
def get_residuals(ps_u, by_axis, absval = True, sort=False):
    """
    Sum standardised posterior residuals along one axis.

    Every entry of the posterior-mean residual matrix is divided by its
    posterior standard deviation, and the resulting scores are summed
    along `by_axis`.

    params
    ps_u : posterior samples of u; the first axis indexes the draws
    by_axis : axis of the draw-averaged residual matrix to sum over;
        0=residuals per item, 1=residuals per subject
        (assuming ps_u is laid out as draws x subjects x items)
    absval : if True (default) sum absolute standardised residuals,
        otherwise sum the signed values
    sort : if True, order the rows by decreasing residual

    returns
    DataFrame with columns 'index' (position along the kept axis) and
    'residual' (the summed score). Entries whose posterior standard
    deviation is zero yield inf/nan, as numpy division does.
    """
    mean_u = np.mean(ps_u, axis=0)  # posterior mean residual matrix
    # Population std over the draws. np.std is used instead of
    # sqrt(E[x^2] - E[x]^2), which loses precision through cancellation and
    # can turn slightly negative (-> NaN) when the spread is small
    # relative to the mean.
    std_u = np.std(ps_u, axis=0)

    scores = mean_u / std_u
    if absval:
        scores = np.abs(scores)

    res = pd.DataFrame(np.sum(scores, axis=by_axis)).reset_index()
    res.columns = ['index', 'residual']

    if sort:
        # Non in-place sort; the original row labels are kept.
        res = res.sort_values('residual', ascending=False)
    return res

# Preview the rows with the largest summed absolute residuals (summed over axis 1).
get_residuals(ps['uu'], by_axis=1, absval=True, sort=True).head()


Unnamed: 0,index,residual
307,307,39.35647
421,421,39.112538
509,509,37.606949
315,315,33.75557
371,371,33.427889


In [17]:
%%opts Bars {+axiswise} [width=1000, height=300, ] 
# Summed absolute residuals (axis 1), sorted in decreasing order.
res = get_residuals(ps['uu'], 1, True, True)

# Bar chart of the first 20 rows of the sorted table.
hv.Bars(
    res.iloc[:20],
    hv.Dimension('index'),
    'residual',
    label='Top 20 Residuals',
).options(color='blue', xrotation=90)


In [18]:
%%opts BoxWhisker {+axiswise} [width=1000, height=300, ] 

# Compare, column by column, the posterior residual draws of the two rows
# with the largest summed residuals in the table above (421 and 307).
# Labels are derived from the row indices so they cannot drift out of sync
# with the data being plotted.
highlighted = [(421, 'blue'), (307, 'red')]

plots = []

for j in range(data['J']):
    for subject, colour in highlighted:
        plots.append(
            hv.BoxWhisker((j, ps['uu'][:, subject, j]), 'Index', 'Value',
                          label='Subject %s' % subject)
            .options(box_color=colour, box_alpha=0.4, xrotation=90))
layout = hv.Overlay(plots)
# NOTE(review): `.cols` is normally a Layout method - confirm that it has the
# intended effect on an Overlay.
layout.cols(1)

In [19]:
%%opts Curve {+axiswise} [width=600, height=200, ] 
# Trace plots for the strictly upper-triangular entries (i < j) of V_corr.
n_factors = data['K']
plots = [
    plot_trace(ps['V_corr'][:, i, j],
               true_value=None,
               title='Posterior distribution for V-correlations(%s,%s)' % (i, j))
    for i in range(n_factors)
    for j in range(i + 1, n_factors)
]
layout = hv.Layout(plots)
layout.cols(1)