In [1]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from codebase.plot import * 
from codebase.file_utils import save_obj, load_obj
from codebase.post_process import * 

%matplotlib inline

%load_ext autoreload
%autoreload 2

  from pandas.core import datetools


In [2]:
# Read the dataset from a path relative to the notebook's working directory.
df = pd.read_csv("../dat/muthen_men.csv")
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15
0,4,4,6,7,7,6,7,7,7,7,4,7,7,3,7
1,3,4,6,6,5,5,3,5,5,6,7,5,5,6,6
2,2,3,4,5,5,3,3,3,2,6,3,4,5,6,4
3,7,6,6,5,5,5,5,5,3,3,3,3,5,6,6
4,6,6,5,5,6,6,4,4,4,7,5,6,3,4,4


In [3]:
# Model inputs gathered in one literal:
#   N - number of rows of df
#   K - fixed at 5 (the Stan program loops over k in 1:K)
#   J - number of columns of df
#   y - the raw values of df as an array
data = {
    'N': df.shape[0],
    'K': 5,
    'J': df.shape[1],
    'y': df.values,
}

In [4]:
# Keys follow the names declared in the Stan `data` block (N, K, J, yy).
stan_data = {
    'N': data['N'],
    'K': data['K'],
    'J': data['J'],
    'yy': data['y'],
}

In [10]:
# Read the Stan program source from disk and echo it for reference.
with open('./codebase/stan_code/cont/CFA/marg_m.stan') as stan_file:
    model_code = stan_file.read()
print(model_code)

data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  matrix[N,J] yy;
}

transformed data{
  real<lower=0> c = 1;
  vector[J] zeros = rep_vector(0, J);
  cov_matrix[J] I_c = diag_matrix(rep_vector(c, J));
}

parameters {
  vector<lower=0>[J] sigma;
  vector<lower=0>[K] sigma_z;
  vector[J] alpha;
  matrix[2,K] beta_free; // 2 free eleements per factor
  matrix[J-3,K] beta_zeros; // 3 zero elements per factor
  cholesky_factor_corr[K] V_corr_chol;
  matrix[N,J] uu;
  cov_matrix[J] Sigma_u;
}

transformed parameters{
  cov_matrix[J] Sigma_epsilon;
  matrix[J,K] beta;
  cov_matrix [K] V ;
  cov_matrix[J] Omega;
  
  Sigma_epsilon = diag_matrix(square(sigma));
  V = multiply_lower_tri_self_transpose(diag_pre_multiply(sigma_z, V_corr_chol));

  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }
  
  // set ones
  for (k in 1:K) beta[1+3*(k-1), k] = 1;

  // set the free elements
  for (k in 1:K) beta[2+3*(k-1) : 3+3*(k-1), k] = beta_free[1:2,k];

  // set the zero elements
  bet

In [11]:
# Compile the Stan program read above (the log output shows the C++ compilation step).
sm = pystan.StanModel(model_code=model_code, verbose=False)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_9498c524cd6a14b9b17833cb43c87e84 NOW.


In [15]:
# Per-run output directory of the form ./log/<YYYYmmdd_HHMMSS>_<task_id>/
task_id = 'CFA_NN_marg_muthen_men'
nowstr = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_')  # compact timestamp, trailing underscore included
log_dir = "./log/%s%s/" % (nowstr, task_id)

In [16]:
# Create the run directory (and any missing parents). exist_ok=True makes this
# a no-op when the directory is already there, without a separate existence
# check that could race with another process creating it.
os.makedirs(log_dir, exist_ok=True)

In [12]:
# Run Stan's optimizer on the same data; fit_opt is not referenced again in the cells shown here.
fit_opt = sm.optimizing(data=stan_data)


In [17]:
# Sampler budget. num_iter is the total per chain (warmup + kept draws),
# which is what gets passed as `iter` to the sampler.
num_warmup = 1000
num_samples = 1000
num_iter = num_samples + num_warmup
num_chains = 1

In [18]:
# Fixed seed so that a fresh "Restart & Run All" reproduces the same draws;
# without it every run of this cell produces a different posterior sample.
sampling_seed = 20190421
fit_run = sm.sampling(data=stan_data, iter=num_iter, warmup=num_warmup,
                      chains=num_chains, seed=sampling_seed)

To run all diagnostics call pystan.check_hmc_diagnostics(fit)


In [19]:
# Persist the compiled model first, then the fit: PyStan warns that the model
# must be pickled along with the fit and unpickled before it.
save_obj(sm, 'sm', log_dir)
save_obj(fit_run, 'fit', log_dir)
# Alias used by the post-processing cells below.
fit=fit_run

The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


In [20]:
# Record where this run's artifacts were written.
print(log_dir)

./log/20190421_092254_CFA_NN_marg_muthen_men/


In [21]:
# Optional: uncomment to reload a previously saved model and fit instead of
# re-running the sampler. The model (sm) is loaded before the fit, as PyStan's
# pickling warning requires.
# log_dir = "./log/20190417_184609_CFA_NN_aug_hier_muthen_men/"
# sm = load_obj('sm', log_dir)
# fit = load_obj('fit', log_dir)



In [22]:
# Quantities pulled out of the fit for post-processing.
param_names = ['Omega_beta', 'beta', 'V', 'uu', 'alpha', 'sigma', 'sigma_z']

# With permuted=False and explicit `pars`, PyStan returns a dict of arrays laid
# out as (iterations, chains, *parameter_dims), keeping the draw order.
stan_samples = fit.extract(permuted=False, pars=param_names)

if num_chains == 1:
    # Drop ONLY the chain axis. A bare np.squeeze would also collapse any other
    # length-1 dimension (e.g. a parameter dimension of size 1 or a single
    # draw), silently changing the array rank the cells below index into.
    ps = {name: np.squeeze(stan_samples[name], axis=1) for name in param_names}
else:
    # NOTE(review): with several chains the chain axis is kept, so the
    # [:, i, j] indexing used by the plotting cells below would need adapting.
    ps = stan_samples

In [23]:
# Reference loading matrix (J x K), zero except for one block of three rows per
# factor. NOTE(review): values presumably come from Muthen's published
# estimates - confirm the source.
reference_loadings = [
    (slice(0, 3), [0.842, 0.394, 0.479]),
    (slice(3, 6), [0.683, 0.078, 0.579]),
    (slice(6, 9), [0.748, 0.754, 0.575]),
    (slice(9, 12), [0.801, 0.708, 0.613]),
    (slice(12, None), [0.732, 0.672, 0.651]),  # open-ended: runs to the last row
]

mb = np.zeros((data['J'], data['K']))
for factor, (rows, loadings) in enumerate(reference_loadings):
    mb[rows, factor] = np.array(loadings)


In [24]:
# Reference values, keyed by parameter name, used as `true_value` in the trace plots.
muthen_results = {'beta': mb}


In [25]:
# Posterior mean of the loadings: average over the draw axis (axis 0).
our_results = {'beta': np.mean(ps['beta'], axis=0)}

In [26]:
# First column of the reference loadings (nonzero only in rows 0-2).
muthen_results['beta'][:,0]

array([0.842, 0.394, 0.479, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   ])

In [27]:
# Shape of the posterior-mean loading matrix.
our_results['beta'].shape

(15, 5)

In [28]:
# Sanity check on the first draw: the anchor loading of each factor
# (row 3*i, column i) is fixed to 1 in the Stan program, so every line should
# print 1.0. The loop bound comes from data['K'] rather than a literal 5 so it
# stays in sync with the model configuration.
for i in range(data['K']):
    print("%d,%d= %.1f" % (3*i, i, ps['beta'][0, 3*i, i]))

0,0= 1.0
3,1= 1.0
6,2= 1.0
9,3= 1.0
12,4= 1.0


In [30]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['J']):
    for j in range(data['K']):
            plots.append(plot_trace(ps['Omega_beta'][:,i,j],
                     true_value=None,
                     title = 'Posterior distribution for Omega_beta(%s,%s)'%(i,j)))
layout = hv.Layout(plots)
layout.cols(1)

In [31]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['J']):
    for j in range(data['K']):
            plots.append(plot_trace(acf(ps['Omega_beta'][:,i,j]),
                     title = 'Autocorrelation of Omega(%s,%s)'%(i,j)))
layout = hv.Layout(plots)
layout.cols(1)

In [32]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['J']):
    for j in range(data['K']):
            plots.append(plot_trace(ps['beta'][:,i,j],
                     true_value=muthen_results['beta'][i,j],
                     title = 'Posterior distribution for beta(%s,%s)'%(i,j)))
layout = hv.Layout(plots)
layout.cols(1)

In [16]:
# Disabled: trace plots for alpha. The `data` dict built in this notebook has no
# 'alpha' key, so a reference value is needed before this can be re-enabled
# (the title string also says "mu" rather than "alpha").
# %%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
# plots = []
# for i in range(data['J']):
#             plots.append(plot_trace(ps['alpha'][:,i],
#                      true_value=data['alpha'][i],
#                      title = 'Posterior distribution for mu(%s)'%(i)))
# layout = hv.Layout(plots)
# layout.cols(1)

In [17]:
# Disabled: autocorrelation plots for each component of alpha.
# %%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
# plots = []
# for i in range(data['J']):
#             plots.append(plot_trace(acf(ps['alpha'][:,i]),
#                      title = 'Autocorrelation of alpha(%s)'%(i)))
# layout = hv.Layout(plots)
# layout.cols(1)

In [8]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['J']):
            plots.append(plot_trace(ps['sigma'][:,i],
                     true_value=None,
                     title = 'Posterior distribution for sigma(%s)'%(i)))
layout = hv.Layout(plots)
layout.cols(1)

In [22]:
# Posterior mean and standard deviation of every uu entry across draws (axis 0).
mean_u = np.mean(ps['uu'], axis=0)
# np.std (population std, ddof=0) is the same quantity as sqrt(E[u^2] - E[u]^2)
# but is computed from centered values, so rounding can never make the
# argument of the square root slightly negative and yield NaN.
std_u = np.std(ps['uu'], axis=0)

In [26]:
# Per-row sum over columns of |posterior mean / posterior std| of uu.
np.sum(np.abs(mean_u/std_u), axis=1)

array([13.67469331,  9.73316263, 11.00438124, 13.10064953,  9.02322801,
       20.05580995,  8.59292705,  8.32331132,  4.63494106, 31.98576231,
       19.30256228,  1.77980745, 16.95309988, 16.72073664, 13.50480088,
       24.43395243,  4.45530801, 19.2361424 ,  2.24217994,  4.72769276,
       22.18340528, 10.24295799,  3.98546383,  3.55763526,  6.58895758,
        5.81402984, 12.7769797 ,  8.94203084,  4.45435066, 16.75139933,
        3.17268113, 20.16544921, 13.70906448,  9.48974067, 15.67420605,
        4.84887374, 15.19452891, 22.25435851, 15.4290182 ,  7.42618081,
       19.77492916,  4.42490119,  2.38441356,  2.01715775,  2.90038219,
       15.02575065, 17.78115131,  5.89291592, 13.64022797, 14.69076084,
       17.9187109 , 16.94938767,  5.16438092,  7.90268531, 14.56867968,
       55.30924237, 15.64125251, 22.61830588,  5.86372281, 13.54850871,
        9.38642278, 10.47030283, 16.76167766,  7.34962746,  1.65988992,
       15.08425805, 11.99086926, 16.82660858,  5.77112174, 12.56

In [33]:
# Signed (not absolute) per-row sum of the standardized uu means, followed by
# the count of rows whose total exceeds 10 in magnitude.
standardized_u = mean_u / std_u
outl = standardized_u.sum(axis=1)
outl[np.abs(outl) > 10].shape

(163,)

In [18]:
%%opts BoxWhisker {+axiswise} [width=1000, height=300, tools=['hover']] 

plots = []
for i in range(15):
    plots.append(hv.BoxWhisker((i, ps['uu'][:,2,i]), 'Index', 'Value',\
                               label='Standard Data').options(box_color='blue', xrotation=90))
layout = hv.Overlay(plots)
layout.cols(1)

In [11]:
# Disabled: trace plots for each component of sigma_z (note the title string
# still says "sigma", not "sigma_z").
# %%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
# plots = []
# for i in range(data['K']):
#             plots.append(plot_trace(ps['sigma_z'][:,i],
#                      true_value=None,
#                      title = 'Posterior distribution for sigma(%s)'%(i)))
# layout = hv.Layout(plots)
# layout.cols(1)

In [20]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['K']):
    for j in range(data['K']):
            plots.append(plot_trace(ps['V'][:,i,j],
                     true_value=None,
                     title = 'Posterior distribution for V(%s,%s)'%(i,j)))
layout = hv.Layout(plots)
layout.cols(1)