In [2]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from codebase.plot import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
df = pd.read_csv("../dat/muthen_women.csv")
df = df.replace(-9, np.nan).astype(float)
df.dropna(inplace=True)
df = df.astype(int)
df.head()

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15
0,4,3,3,4,7,7,3,2,4,7,1,5,3,4,6
1,5,2,5,3,4,5,2,4,4,4,4,5,3,4,5
2,4,4,4,6,4,5,6,4,5,4,3,4,4,4,5
3,4,5,7,1,3,7,5,3,1,5,1,3,1,2,4
4,5,6,6,7,7,5,4,4,2,6,2,4,6,7,6


In [8]:
from sklearn import preprocessing

x = df.values #returns a numpy array
min_max_scaler = preprocessing.StandardScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)

In [11]:
data = dict()
data['N'] = df.shape[0]
data['K'] = 5
data['J'] = df.shape[1]
data['y'] = df.values

In [13]:
num_chains = 1
num_samples = 1200
num_warmup = 1000
num_iter = num_samples + num_warmup

In [14]:
# log_dir = "./log/muthem_women_no-u/"
# # sm = load_obj('sm', log_dir)
# # fit = load_obj('fit', log_dir)
# ps = load_obj('ps', log_dir)



In [15]:
existing_directory = None
task_handle = 'muthen-women-no-u-std'

if existing_directory is None:
    print("\n\nCreating directory")
    nowstr = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_') # ISO 8601 format
    log_dir =  "./log/"+nowstr+"%s/" % task_handle

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)



Creating directory


In [16]:
with open('./codebase/stan_code/cont/CFA/marg_m_nou.stan', 'r') as file:
        model_code = file.read()
print("\n\nCompiling model")
sm = pystan.StanModel(model_code=model_code, verbose=False)
print("\n\nSaving compiled model in directory %s"%log_dir)
save_obj(sm, 'sm', log_dir)

stan_data = dict(N = data['N'], K = data['K'], J = data['J'], yy = data['y'])

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3830fedd5a2bf8c20209d031a4a7f98d NOW.




Compiling model


Saving compiled model in directory ./log/20190709_122313_muthen-women-no-u-std/


In [17]:
print_model = True
if bool(print_model):
    print(model_code)


data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  matrix[N,J] yy;
}

transformed data{
  real<lower=0> c = 1;
  vector[J] zeros = rep_vector(0, J);
  cov_matrix[J] I_c = diag_matrix(rep_vector(c, J));
}

parameters {
  vector<lower=0>[J] sigma;
  vector<lower=0>[K] sigma_z;
  vector[J] alpha;
  matrix[2,K] beta_free; // 2 free eleements per factor
  matrix[J-3,K] beta_zeros; // 3 zero elements per factor
  cholesky_factor_corr[K] Phi_corr_chol;
}

transformed parameters{
  cov_matrix[J] Theta;
  matrix[J,K] beta;
  cov_matrix [K] Phi_cov ;
  cov_matrix[J] Marg_cov;

  Theta = diag_matrix(square(sigma));
  Phi_cov = multiply_lower_tri_self_transpose(diag_pre_multiply(sigma_z, Phi_corr_chol));

  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }

  // set ones
  for (k in 1:K) beta[1+3*(k-1), k] = 1;

  // set the free elements
  for (k in 1:K) beta[2+3*(k-1) : 3+3*(k-1), k] = beta_free[1:2,k];

  // set the zero elements
  beta[4:J, 1] = beta_zeros[1:(J-3), 1];
  for (k

In [18]:
print("\n\nFitting model.... \n\n")

fit_run = sm.sampling(data=stan_data,
    iter=num_samples + num_warmup,
    warmup=num_warmup, chains=num_chains)

print("\n\nSaving fitted model in directory %s"%log_dir)
save_obj(fit_run, 'fit', log_dir)




Fitting model.... 








Saving fitted model in directory ./log/20190709_122313_muthen-women-no-u-std/


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


In [37]:
print("\n\nSaving posterior samples in %s"%log_dir)
param_names = ['Marg_cov', 'beta', 'Phi_cov', 'alpha', 'sigma', 'sigma_z']

stan_samples= fit_run.extract(permuted=False, pars=param_names)  # return a dictionary of arrays

if num_chains ==1:
    ps = dict()
    for name in param_names:
        ps[name] = np.squeeze(stan_samples[name])
else:
    ps = stan_samples
save_obj(ps, 'ps', log_dir)




Saving posterior samples in ./log/20190709_122313_muthen-women-no-u-std/


In [20]:
mb = np.zeros((data['J'], data['K']))
mb[:3,0] = np.array([0.772, 0.575, 0.503 ])
mb[3:6,1] = np.array([0.704, 0.657, 0.548 ])
mb[6:9,2] = np.array([0.685, 0.702, 0.622])
mb[9:12,3] = np.array([0.791, 0.736, 0.695])
mb[12:,4] = np.array([0.780, 0.738, 0.660])


In [21]:
muthen_results = dict()
muthen_results['beta'] = mb


In [22]:
our_results=dict()
our_results['beta'] = np.average(ps['beta'], axis=0)

In [55]:
std_betas = np.empty_like(ps['beta'])
for j in range(num_samples):
    b = ps['beta'][j]
    si = np.diag(ps['sigma'][j])
    std_betas[j] = si**(.5) @ b 

In [56]:
ps['sigma'][j].shape

(15,)

In [57]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(std_betas[:,j,k],
             true_value=muthen_results['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [23]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['beta'][:,j,k],
             true_value=muthen_results['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [24]:
# %%output info=True 
plots = []
for j in range(data['K']):
    for k in range(data['K']):
        if j!=k:
            plots.append(plot_trace(ps['Phi_cov'][:,j,k],
            title = 'Posterior distribution for Phi(%s,%s)'%(j,k)).options(fig_inches=10, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
#     tight = True, tight_padding=10,
    vspace = .5,
    absolute_scaling=False, fig_size=100)
layout.cols(2)

In [25]:
mle_est = dict()
mle_est['Sigma'] = np.cov(data['y'], rowvar=False)
mle_est['mu'] = np.mean(data['y'], axis=0)
def compute_D1(yy):   
    return multivariate_normal.logpdf(yy, mean=mle_est['mu'], cov=mle_est['Sigma'])

In [33]:
compute_D1(data['y'])

array([-28.04547917, -17.20985675, -15.28774196, -30.11112972,
       -20.67372316, -21.73869155, -17.63511756, -18.29145674,
       -15.9531816 , -18.15009415, -17.28485007, -18.78594561,
       -18.74188307, -20.15929467, -22.97869331, -18.74583268,
       -25.15730069, -17.71298573, -16.06124594, -33.05238922,
       -17.55918133, -17.25931337, -30.60952725, -19.52895158,
       -21.09775898, -16.39599191, -15.24440594, -19.73970287,
       -16.82219879, -26.23419998, -19.27499497, -22.75104918,
       -15.83475007, -17.98326347, -16.24930056, -15.41259953,
       -22.88969271, -15.21182072, -25.51009495, -16.22629119,
       -14.64006202, -18.67199255, -18.21109023, -15.91599694,
       -16.47397918, -20.74068006, -18.83739789, -20.78176556,
       -13.31995639, -18.51142932, -18.4967459 , -18.54812532,
       -18.16063739, -16.06380685, -23.45394397, -16.40997299,
       -21.17687093, -15.30513748, -16.28594219, -14.87737513,
       -15.78578406, -13.46137797, -16.86314987, -21.20

In [30]:
def compute_D2(yy, mcmc_iter):    
    D2 = multivariate_normal.logpdf(yy,
                               mean= ps['alpha'][mcmc_iter],
                               cov = ps['Marg_cov'][mcmc_iter])
    return D2

def compute_D(mcmc_iter, pred=True):
    if pred == True:
        y_pred=multivariate_normal.rvs(mean= ps['alpha'][mcmc_iter],
                        cov=ps['Marg_cov'][mcmc_iter],
                       size = data['y'].shape[0])
        return compute_D1(y_pred) - compute_D2(y_pred, mcmc_iter)

    else:
        return compute_D1(data['y']) - compute_D2(data['y'], mcmc_iter)
    
    
compute_D2(100, True)

array([-4.54261143e-01,  1.03668454e-01, -5.13913775e-01, -2.78696687e-01,
        3.07934116e-01, -1.38714085e+00,  3.77094722e-01,  5.33146592e-03,
        2.86188250e-01, -2.35683121e+00, -1.06952114e-01,  6.30274681e-02,
        1.32413074e+00, -5.49981112e-02, -3.92836913e-01,  2.43410890e-01,
        1.62632185e-01,  2.82006410e-01, -3.92819335e-01, -1.39313476e-01,
       -3.57245581e-01,  5.63681940e-01, -1.77820416e-01, -8.49482692e-01,
       -5.13793052e-01, -1.57015295e+00, -2.07264267e+00, -4.90484867e-01,
        1.12040200e+00,  9.20657744e-01, -8.08856468e-01,  2.34960342e-01,
       -1.10034301e-01, -2.69467521e-02,  2.77851322e-01,  5.20551403e-01,
       -8.32253274e-02,  1.08929442e+00,  8.42342247e-01,  3.50603468e-01,
       -2.69788264e-01, -4.61868205e-01,  7.21224006e-01,  7.04421223e-02,
       -9.92592540e-02,  1.31283290e-01,  1.52543116e-01, -2.08559052e-01,
       -1.94053854e-01, -7.04690355e-01,  2.03462359e-01,  9.44904623e-02,
        3.27252899e-01,  

In [32]:
y_pred=multivariate_normal.rvs(mean= ps['alpha'][mcmc_iter],
                    cov=ps['Marg_cov'][mcmc_iter],
                   size = data['y'].shape[0])
compute_D2(y_pred, mcmc_iter)

array([-20.70674728, -20.38807757, -21.72764511, -19.92713057,
       -22.06924335, -26.66111093, -22.13759396, -19.84543564,
       -28.40328056, -18.13035236, -17.38104117, -19.09341707,
       -20.84005491, -19.29149103, -19.58424319, -23.86137033,
       -17.07320488, -17.44646914, -16.95959206, -16.20699329,
       -17.68652556, -18.8977809 , -23.39582514, -17.95604103,
       -18.11777331, -19.81171999, -20.98484751, -24.00272235,
       -18.85865136, -18.17576883, -21.66146088, -20.6680214 ,
       -17.27399589, -23.78593181, -22.0067608 , -23.2259973 ,
       -23.52835375, -21.22538996, -18.22971278, -23.51537646,
       -15.72239901, -19.86908585, -18.52913116, -19.86866821,
       -17.36357148, -16.75280567, -18.89500643, -21.09609634,
       -22.59497554, -20.83486554, -18.70853365, -24.1410758 ,
       -21.08059048, -17.6233595 , -23.82466244, -18.18684142,
       -21.91824488, -22.88004148, -18.49107492, -19.65258535,
       -22.52519676, -18.20337039, -23.34616021, -20.47

In [28]:
mcmc_length = 1000
Ds = np.empty((mcmc_length,2))
for mcmc_iter in range(mcmc_length):
    Ds[mcmc_iter,0] = -2*np.sum(compute_D(mcmc_iter, pred=False))
    Ds[mcmc_iter,1] = -2*np.sum(compute_D(mcmc_iter, pred=True))

In [29]:
Ds[:,0] < Ds[:,1]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,