In [8]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from codebase.plot import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
df = pd.read_csv("../dat/muthen_women.csv")
df = df.replace(-9, np.nan).astype(float)
df.dropna(inplace=True)
df = df.astype(int)
df.head()

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15
0,4,3,3,4,7,7,3,2,4,7,1,5,3,4,6
1,5,2,5,3,4,5,2,4,4,4,4,5,3,4,5
2,4,4,4,6,4,5,6,4,5,4,3,4,4,4,5
3,4,5,7,1,3,7,5,3,1,5,1,3,1,2,4
4,5,6,6,7,7,5,4,4,2,6,2,4,6,7,6


In [10]:
data = dict()
data['N'] = df.shape[0]
data['K'] = 5
data['J'] = df.shape[1]
data['y'] = df.values

In [11]:
num_chains = 1
num_samples = 1200
num_warmup = 1000
num_iter = num_samples + num_warmup

In [7]:
# log_dir = "./log/muthem_women/"
# # sm = load_obj('sm', log_dir)
# # fit = load_obj('fit', log_dir)
# ps = load_obj('ps', log_dir)



In [5]:
existing_directory = None
task_handle = 'muthen-women'

if existing_directory is None:
    print("\n\nCreating directory")
    nowstr = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_') # ISO 8601 format
    log_dir =  "./log/"+nowstr+"%s/" % task_handle

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)



Creating directory


In [7]:
with open('./codebase/stan_code/cont/CFA/marg_m.stan', 'r') as file:
        model_code = file.read()
print("\n\nCompiling model")
sm = pystan.StanModel(model_code=model_code, verbose=False)
print("\n\nSaving compiled model in directory %s"%log_dir)
save_obj(sm, 'sm', log_dir)

stan_data = dict(N = data['N'], K = data['K'], J = data['J'], yy = data['y'])

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_cd2dd7bc9309e7dd45d1c4288d47a1ff NOW.




Compiling model


Saving compiled model in directory ./log/20190708_111211_muthen-women/


In [8]:
print_model = True
if bool(print_model):
    print(model_code)


data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  matrix[N,J] yy;
}

transformed data{
  real<lower=0> c = 1;
  vector[J] zeros = rep_vector(0, J);
  cov_matrix[J] I_c = diag_matrix(rep_vector(c, J));
}

parameters {
  vector<lower=0>[J] sigma;
  vector<lower=0>[K] sigma_z;
  vector[J] alpha;
  matrix[2,K] beta_free; // 2 free eleements per factor
  matrix[J-3,K] beta_zeros; // 3 zero elements per factor
  cholesky_factor_corr[K] V_corr_chol;
  matrix[N,J] uu;
  cov_matrix[J] Omega;
}

transformed parameters{
  cov_matrix[J] Theta;
  matrix[J,K] beta;
  cov_matrix [K] Phi_cov ;
  cov_matrix[J] Marg_cov;
  
  Theta = diag_matrix(square(sigma));
  Phi_cov = multiply_lower_tri_self_transpose(diag_pre_multiply(sigma_z, V_corr_chol));

  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }
  
  // set ones
  for (k in 1:K) beta[1+3*(k-1), k] = 1;

  // set the free elements
  for (k in 1:K) beta[2+3*(k-1) : 3+3*(k-1), k] = beta_free[1:2,k];

  // set the zero elements
  beta[4

In [9]:
print("\n\nFitting model.... \n\n")

fit_run = sm.sampling(data=stan_data,
    iter=num_samples + num_warmup,
    warmup=num_warmup, chains=num_chains)

print("\n\nSaving fitted model in directory %s"%log_dir)
save_obj(fit_run, 'fit', log_dir)



Fitting model.... 




To run all diagnostics call pystan.check_hmc_diagnostics(fit)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)




Saving fitted model in directory ./log/20190708_111211_muthen-women/


Saving posterior samples in ./log/20190708_111211_muthen-women/


ValueError: No parameter Phi

In [17]:
print("\n\nSaving posterior samples in %s"%log_dir)
param_names = ['Marg_cov', 'beta', 'Phi_cov', 'alpha', 'sigma', "Theta", 'Marg_cov2', 'uu']

stan_samples= fit_run.extract(permuted=False, pars=param_names)  # return a dictionary of arrays

if num_chains ==1:
    ps = dict()
    for name in param_names:
        ps[name] = np.squeeze(stan_samples[name])
else:
    ps = stan_samples
save_obj(ps, 'ps', log_dir)




Saving posterior samples in ./log/20190708_111211_muthen-women/


In [25]:
ps = load_obj("ps", "./log/fabian_runs/20190712_005038_model0/")


In [26]:
mb = np.zeros((data['J'], data['K']))
mb[:3,0] = np.array([0.772, 0.575, 0.503 ])
mb[3:6,1] = np.array([0.704, 0.657, 0.548 ])
mb[6:9,2] = np.array([0.685, 0.702, 0.622])
mb[9:12,3] = np.array([0.791, 0.736, 0.695])
mb[12:,4] = np.array([0.780, 0.738, 0.660])


In [27]:
muthen_results = dict()
muthen_results['beta'] = mb


In [28]:
std_betas = np.empty_like(ps['beta'])
for j in range(num_samples):
    b = ps['beta'][j]
    si = np.diag(ps['sigma_z'][j]**(-1))
    si2 = np.diag(np.std(data['y'], axis=0)**(-1))
    std_betas[j] = si2 @ b @ si

In [30]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['beta'][:,j,k],
             true_value=muthen_results['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [18]:
def get_residuals(ps_u, by_axis, absval = True, sort=False):
    """
    params
    ps_u posterior samples of u
    by_axis 0=residuals per iterm, 1=residuals per subject
    """
    mean_u = np.mean(ps_u, axis=0) #mean posterior residual matrix
    std_u = np.sqrt(np.mean(ps_u**2, axis=0)- mean_u**2) #std of posterior residual matrix
    if absval:
        res = pd.DataFrame(np.sum(np.abs(mean_u/std_u), axis=by_axis)).reset_index()
    else:
        res = pd.DataFrame(np.sum(mean_u/std_u, axis=by_axis)).reset_index()
    res.columns = ['index', 'residual']
    
    if sort:
        res.sort_values('residual', ascending=False, inplace=True)
    return res

get_residuals(ps['uu'], 1, True, True).head()


Unnamed: 0,index,residual
300,300,44.046158
414,414,42.996121
364,364,39.619497
386,386,37.751126
308,308,36.488856


In [19]:
# %%opts Bars {+axiswise} [width=1000, height=300, ] 
res = get_residuals(ps['uu'], 1, True, True)

hv.Bars(res[:20], hv.Dimension('index'), 'residual',\
        label='Top 20 Residuals').options(color='blue', xrotation=90).options(fig_inches=8, aspect=3)


In [22]:
# %%output info=True 
plots = []
for j in range(data['K']):
    for k in range(data['K']):
        if j!=k:
            plots.append(plot_trace(ps['Phi_cov'][:,j,k],
            title = 'Posterior distribution for Phi(%s,%s)'%(j,k)).options(fig_inches=10, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
#     tight = True, tight_padding=10,
    vspace = .5,
    absolute_scaling=False, fig_size=100)
layout.cols(2)

In [23]:
mle_est = dict()
mle_est['Sigma'] = np.cov(data['y'], rowvar=False)
mle_est['mu'] = np.mean(data['y'], axis=0)
def compute_D1(yy):   
    return multivariate_normal.logpdf(yy, mean=mle_est['mu'], cov=mle_est['Sigma'])

In [24]:
def compute_D2(yy, mcmc_iter):    
    D2 = multivariate_normal.logpdf(yy,
                               mean= ps['alpha'][mcmc_iter],
                               cov = ps['Marg_cov'][mcmc_iter])
    return D2

def compute_D(mcmc_iter, pred=True):
    if pred == True:
        y_pred=multivariate_normal.rvs(mean= ps['alpha'][mcmc_iter],
                        cov=ps['Marg_cov'][mcmc_iter],
                       size = data['y'].shape[0])
        return compute_D1(y_pred) - compute_D2(y_pred, mcmc_iter)

    else:
        return compute_D1(data['y']) - compute_D2(data['y'], mcmc_iter)
    
    
compute_D(100, False)

array([ 1.38000689e+00, -2.31209945e-01, -3.80924522e-01,  7.18602979e+00,
       -1.33741615e+00,  2.50268869e-01, -6.29550884e-01,  1.04554605e+00,
       -7.22497180e-01, -6.94663179e-01, -8.39881312e-01,  4.85514823e-01,
        1.07632688e+00,  2.47011711e+00,  2.48284156e+00,  1.01117077e+00,
        1.71173078e+00,  1.35877871e-02, -1.19627319e+00,  1.25604262e+00,
       -4.97069985e-02, -1.88251743e-01, -1.43389727e+00,  2.70252065e-01,
        1.62535799e-01,  9.26410873e-01, -5.42017551e-01,  2.55153507e+00,
        3.41772154e-01,  2.65776239e+00,  1.28898914e+00,  1.36323812e+01,
        2.74715231e+00,  1.37913208e-01, -1.03224867e+00, -1.10862143e+00,
       -9.47025435e-01,  9.46019134e-02,  7.81000821e+00,  1.19661709e+00,
       -1.10457685e+00, -3.70619180e-01, -1.14803343e+00,  5.50668515e-01,
       -1.29231424e+00, -3.98385780e-01, -5.07838025e-02, -8.55087111e-01,
       -6.49224627e-01,  5.78428893e-02,  3.36231451e-02,  2.07766799e-01,
        2.40198895e+00, -

In [25]:
mcmc_length = 1000
Ds = np.empty((mcmc_length,2))
for mcmc_iter in range(mcmc_length):
    Ds[mcmc_iter,0] = -2*np.sum(compute_D(mcmc_iter, pred=False))
    Ds[mcmc_iter,1] = -2*np.sum(compute_D(mcmc_iter, pred=True))

In [28]:
Ds[:,1]

array([ 774.47603536,  748.42044138,  668.02198707,  850.37493993,
        773.12522139,  843.94373592,  812.82011808,  859.43992283,
        862.18186065,  817.60953389,  941.37196381,  815.39851426,
        830.78097616,  836.95688356,  749.88723904,  801.22126836,
        814.09755598,  786.70351187,  808.12096509,  869.77143281,
        866.85575686,  846.32752795,  618.80064119,  747.62028955,
        752.81220012,  813.40574663,  758.09387012,  797.09104783,
        650.53503349,  689.84612171,  767.36279265,  859.11225733,
        694.31168659,  753.32428952,  711.27430293,  730.46010973,
        689.57570544,  705.33128208,  669.61645008,  622.01585039,
        747.90507817,  873.12900544,  931.11298885,  845.82376564,
        870.30750999,  951.66970993,  984.69645933, 1209.11339287,
       1027.96427774,  983.47084193, 1026.22534253, 1037.35444582,
        950.86983227,  790.92410627,  813.25397389,  755.68306016,
        574.8442504 ,  671.46341294,  657.0864447 ,  549.23842