In [1]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from codebase.plot import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
df = pd.read_csv("../dat/muthen_women.csv")
df = df.replace(-9, np.nan).astype(float)
df.dropna(inplace=True)
df = df.astype(int)
df.head()

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15
0,4,3,3,4,7,7,3,2,4,7,1,5,3,4,6
1,5,2,5,3,4,5,2,4,4,4,4,5,3,4,5
2,4,4,4,6,4,5,6,4,5,4,3,4,4,4,5
3,4,5,7,1,3,7,5,3,1,5,1,3,1,2,4
4,5,6,6,7,7,5,4,4,2,6,2,4,6,7,6


In [3]:
data = dict()
data['N'] = df.shape[0]
data['K'] = 5
data['J'] = df.shape[1]
data['y'] = df.values

In [4]:
num_chains = 1
num_samples = 1200
num_warmup = 1000
num_iter = num_samples + num_warmup

In [7]:
log_dir = "./log/muthem_women_no-u/"
# # sm = load_obj('sm', log_dir)
# # fit = load_obj('fit', log_dir)
# ps = load_obj('ps', log_dir)



In [5]:
existing_directory = None
task_handle = 'muthen-women-no-u'

if existing_directory is None:
    print("\n\nCreating directory")
    nowstr = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_') # ISO 8601 format
    log_dir =  "./log/"+nowstr+"%s/" % task_handle

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)



Creating directory


In [8]:
with open('./codebase/stan_code/cont/CFA/marg_m_nou.stan', 'r') as file:
        model_code = file.read()
print("\n\nCompiling model")
sm = pystan.StanModel(model_code=model_code, verbose=False)
print("\n\nSaving compiled model in directory %s"%log_dir)
save_obj(sm, 'sm', log_dir)

stan_data = dict(N = data['N'], K = data['K'], J = data['J'], yy = data['y'])

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3830fedd5a2bf8c20209d031a4a7f98d NOW.




Compiling model


Saving compiled model in directory ./log/20190708_110357_muthen-women-no-u/


In [9]:
print_model = True
if bool(print_model):
    print(model_code)


data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  matrix[N,J] yy;
}

transformed data{
  real<lower=0> c = 1;
  vector[J] zeros = rep_vector(0, J);
  cov_matrix[J] I_c = diag_matrix(rep_vector(c, J));
}

parameters {
  vector<lower=0>[J] sigma;
  vector<lower=0>[K] sigma_z;
  vector[J] alpha;
  matrix[2,K] beta_free; // 2 free eleements per factor
  matrix[J-3,K] beta_zeros; // 3 zero elements per factor
  cholesky_factor_corr[K] Phi_corr_chol;
}

transformed parameters{
  cov_matrix[J] Theta;
  matrix[J,K] beta;
  cov_matrix [K] Phi_cov ;
  cov_matrix[J] Marg_cov;

  Theta = diag_matrix(square(sigma));
  Phi_cov = multiply_lower_tri_self_transpose(diag_pre_multiply(sigma_z, Phi_corr_chol));

  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }

  // set ones
  for (k in 1:K) beta[1+3*(k-1), k] = 1;

  // set the free elements
  for (k in 1:K) beta[2+3*(k-1) : 3+3*(k-1), k] = beta_free[1:2,k];

  // set the zero elements
  beta[4:J, 1] = beta_zeros[1:(J-3), 1];
  for (k

In [10]:
print("\n\nFitting model.... \n\n")

fit_run = sm.sampling(data=stan_data,
    iter=num_samples + num_warmup,
    warmup=num_warmup, chains=num_chains)

print("\n\nSaving fitted model in directory %s"%log_dir)
save_obj(fit_run, 'fit', log_dir)




Fitting model.... 








Saving fitted model in directory ./log/20190708_110357_muthen-women-no-u/


Saving posterior samples in ./log/20190708_110357_muthen-women-no-u/


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


ValueError: No parameter Phi

In [11]:
print("\n\nSaving posterior samples in %s"%log_dir)
param_names = ['Marg_cov', 'beta', 'Phi_cov', 'alpha', 'sigma']

stan_samples= fit_run.extract(permuted=False, pars=param_names)  # return a dictionary of arrays

if num_chains ==1:
    ps = dict()
    for name in param_names:
        ps[name] = np.squeeze(stan_samples[name])
else:
    ps = stan_samples
save_obj(ps, 'ps', log_dir)




Saving posterior samples in ./log/20190708_110357_muthen-women-no-u/


In [12]:
mb = np.zeros((data['J'], data['K']))
mb[:3,0] = np.array([0.772, 0.575, 0.503 ])
mb[3:6,1] = np.array([0.704, 0.657, 0.548 ])
mb[6:9,2] = np.array([0.685, 0.702, 0.622])
mb[9:12,3] = np.array([0.791, 0.736, 0.695])
mb[12:,4] = np.array([0.780, 0.738, 0.660])


In [13]:
muthen_results = dict()
muthen_results['beta'] = mb


In [14]:
our_results=dict()
our_results['beta'] = np.average(ps['beta'], axis=0)

In [15]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['beta'][:,j,k],
             true_value=muthen_results['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [18]:
# %%output info=True 
plots = []
for j in range(data['K']):
    for k in range(data['K']):
        if j!=k:
            plots.append(plot_trace(ps['Phi_cov'][:,j,k],
            title = 'Posterior distribution for Phi(%s,%s)'%(j,k)).options(fig_inches=10, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
#     tight = True, tight_padding=10,
    vspace = .5,
    absolute_scaling=False, fig_size=100)
layout.cols(2)

In [22]:
mle_est = dict()
mle_est['Sigma'] = np.cov(data['y'], rowvar=False)
mle_est['mu'] = np.mean(data['y'], axis=0)
def compute_D1(yy):   
    return multivariate_normal.logpdf(yy, mean=mle_est['mu'], cov=mle_est['Sigma'])

In [24]:
def compute_D2(yy, mcmc_iter):    
    D2 = multivariate_normal.logpdf(yy,
                               mean= ps['alpha'][mcmc_iter],
                               cov = ps['Marg_cov'][mcmc_iter])
    return D2

def compute_D(mcmc_iter, pred=True):
    if pred == True:
        y_pred=multivariate_normal.rvs(mean= ps['alpha'][mcmc_iter],
                        cov=ps['Marg_cov'][mcmc_iter],
                       size = data['y'].shape[0])
        return compute_D1(y_pred) - compute_D2(y_pred, mcmc_iter)

    else:
        return compute_D1(data['y']) - compute_D2(data['y'], mcmc_iter)
    
    
compute_D(100, False)

array([ 3.84649755e-01,  5.24387135e-01,  7.04612764e-01,  2.58697657e+00,
       -4.97992968e-01,  1.37286499e+00, -7.86442694e-02,  6.79697893e-01,
        2.62806396e-01,  1.08496388e+00,  3.43503770e-01,  3.43548484e-01,
        3.94092553e-01, -2.10010970e-01,  1.34446059e-01,  1.42939746e-01,
       -5.01666532e-01,  2.36796957e-02,  1.63980834e-01,  1.75548840e-01,
       -7.73404513e-02,  1.33936299e-01, -1.04122103e+00, -2.80247980e-02,
        8.71329902e-01,  3.19687778e-01,  3.15366554e-01,  8.69921949e-01,
        7.35551361e-01,  1.45681822e+00,  6.16266549e-01,  1.05777617e+00,
        4.73762063e-01,  1.43146144e-01, -3.87782776e-01,  3.86606636e-01,
        3.14846422e-01,  2.71405600e-01,  7.60629863e-01,  9.97407702e-01,
        4.29507046e-01,  1.41661539e-01,  3.22562799e-01,  1.80862696e-01,
       -1.00234204e-02,  1.24881287e+00,  8.19738953e-01,  6.32704478e-01,
        2.01720152e-01,  1.70768491e-01,  3.53514201e-01,  4.66935491e-01,
        1.44810030e+00,  

In [28]:
np.sum(compute_D(mcmc_iter, pred=True))

-184.8582159852987

In [25]:
mcmc_length = 1000
Ds = np.empty((mcmc_length,2))
for mcmc_iter in range(mcmc_length):
    Ds[mcmc_iter,0] = -2*np.sum(compute_D(mcmc_iter, pred=False))
    Ds[mcmc_iter,1] = -2*np.sum(compute_D(mcmc_iter, pred=True))

In [26]:
Ds[:,0] < Ds[:,1]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,