In [1]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma
from statsmodels.tsa.stattools import acf
import datetime
import sys
import os

from codebase.plot import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
df = pd.read_csv("../dat/muthen_women.csv")
df = df.replace(-9, np.nan).astype(float)
df.dropna(inplace=True)
df = df.astype(int)
df.head()

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15
0,4,3,3,4,7,7,3,2,4,7,1,5,3,4,6
1,5,2,5,3,4,5,2,4,4,4,4,5,3,4,5
2,4,4,4,6,4,5,6,4,5,4,3,4,4,4,5
3,4,5,7,1,3,7,5,3,1,5,1,3,1,2,4
4,5,6,6,7,7,5,4,4,2,6,2,4,6,7,6


In [3]:
from sklearn import preprocessing

x = df.values #returns a numpy array
min_max_scaler = preprocessing.StandardScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)

In [4]:
data = dict()
data['N'] = df.shape[0]
data['K'] = 5
data['J'] = df.shape[1]
data['y'] = df.values

In [5]:
num_chains = 1
num_samples = 1200
num_warmup = 1000
num_iter = num_samples + num_warmup

In [14]:
# log_dir = "./log/muthem_women_no-u/"
# # sm = load_obj('sm', log_dir)
# # fit = load_obj('fit', log_dir)
# ps = load_obj('ps', log_dir)



In [6]:
existing_directory = None
task_handle = 'muthen-women-no-u-exact-zeros-std'

if existing_directory is None:
    print("\n\nCreating directory")
    nowstr = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_') # ISO 8601 format
    log_dir =  "./log/"+nowstr+"%s/" % task_handle

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)



Creating directory


In [7]:
with open('./codebase/stan_code/cont/CFA/marg_m_nou2.stan', 'r') as file:
        model_code = file.read()
print("\n\nCompiling model")
sm = pystan.StanModel(model_code=model_code, verbose=False)
print("\n\nSaving compiled model in directory %s"%log_dir)
save_obj(sm, 'sm', log_dir)

stan_data = dict(N = data['N'], K = data['K'], J = data['J'], yy = data['y'])

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_9722d40fa83ef1f0a3cdfd894cdec683 NOW.




Compiling model


Saving compiled model in directory ./log/20190710_140217_muthen-women-no-u-exact-zeros-std/


In [8]:
print_model = True
if bool(print_model):
    print(model_code)


data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  matrix[N,J] yy;
}

transformed data{
  real<lower=0> c = 1;
  vector[J] zeros = rep_vector(0, J);
  cov_matrix[J] I_c = diag_matrix(rep_vector(c, J));
}

parameters {
  vector<lower=0>[J] sigma;
  vector<lower=0>[K] sigma_z;
  vector[J] alpha;
  matrix[2,K] beta_free; // 2 free eleements per factor
  cholesky_factor_corr[K] Phi_corr_chol;
}

transformed parameters{
  cov_matrix[J] Theta;
  matrix[J,K] beta;
  cov_matrix [K] Phi_cov ;
  cov_matrix[J] Marg_cov;

  Theta = diag_matrix(square(sigma));
  Phi_cov = multiply_lower_tri_self_transpose(diag_pre_multiply(sigma_z, Phi_corr_chol));

  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }

  // set ones
  for (k in 1:K) beta[1+3*(k-1), k] = 1;

  // set the free elements
  for (k in 1:K) beta[2+3*(k-1) : 3+3*(k-1), k] = beta_free[1:2,k];

  Marg_cov = beta * Phi_cov * beta'+ Theta;
}

model {
  to_vector(beta_free) ~ normal(0, 1);
  to_vector(alpha) ~ normal(0, 1);
  sig

In [9]:
print("\n\nFitting model.... \n\n")

fit_run = sm.sampling(data=stan_data,
    iter=num_samples + num_warmup,
    warmup=num_warmup, chains=num_chains)

print("\n\nSaving fitted model in directory %s"%log_dir)
save_obj(fit_run, 'fit', log_dir)






Fitting model.... 




Saving fitted model in directory ./log/20190710_140217_muthen-women-no-u-exact-zeros-std/


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


In [10]:
print("\n\nSaving posterior samples in %s"%log_dir)
param_names = ['Marg_cov', 'beta', 'Phi_cov', 'alpha', 'sigma', 'sigma_z']

stan_samples= fit_run.extract(permuted=False, pars=param_names)  # return a dictionary of arrays

if num_chains ==1:
    ps = dict()
    for name in param_names:
        ps[name] = np.squeeze(stan_samples[name])
else:
    ps = stan_samples
save_obj(ps, 'ps', log_dir)




Saving posterior samples in ./log/20190710_140217_muthen-women-no-u-exact-zeros-std/


In [11]:
mb = np.zeros((data['J'], data['K']))
mb[:3,0] = np.array([0.772, 0.575, 0.503 ])
mb[3:6,1] = np.array([0.704, 0.657, 0.548 ])
mb[6:9,2] = np.array([0.685, 0.702, 0.622])
mb[9:12,3] = np.array([0.791, 0.736, 0.695])
mb[12:,4] = np.array([0.780, 0.738, 0.660])


In [12]:
muthen_results = dict()
muthen_results['beta'] = mb


In [13]:
our_results=dict()
our_results['beta'] = np.average(ps['beta'], axis=0)

In [14]:
std_betas = np.empty_like(ps['beta'])
for j in range(num_samples):
    b = ps['beta'][j]
    si = np.diag(ps['sigma'][j])
    std_betas[j] = si**(.5) @ b 

In [17]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(std_betas[:,j,k],
             true_value=muthen_results['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [18]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['beta'][:,j,k],
             true_value=muthen_results['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [19]:
# %%output info=True 
plots = []
for j in range(data['K']):
    for k in range(data['K']):
        if j!=k:
            plots.append(plot_trace(ps['Phi_cov'][:,j,k],
            title = 'Posterior distribution for Phi(%s,%s)'%(j,k)).options(fig_inches=10, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
#     tight = True, tight_padding=10,
    vspace = .5,
    absolute_scaling=False, fig_size=100)
layout.cols(2)

In [20]:
mle_est = dict()
mle_est['Sigma'] = np.cov(data['y'], rowvar=False)
mle_est['mu'] = np.mean(data['y'], axis=0)
def compute_D1(yy):   
    return multivariate_normal.logpdf(yy, mean=mle_est['mu'], cov=mle_est['Sigma'])

In [36]:
mcmc_iter = 22
y_pred=multivariate_normal.rvs(mean= ps['alpha'][mcmc_iter],
        cov=ps['Marg_cov'][mcmc_iter],
       size = data['y'].shape[0])
compute_D(100,False)

array([ 1.97371193e+00,  1.66740201e+00,  1.75407498e+00, -1.76272138e+00,
       -1.54241480e+00,  2.32217009e+00,  8.91237406e-01,  1.21997306e+00,
        6.46390540e-01,  1.39872973e+00,  1.02846341e+00,  2.56436672e-01,
       -5.28216237e-01,  9.19779088e-01,  1.88362205e+00,  1.48175898e+00,
       -5.25147157e+00,  6.06264694e-01,  4.25119680e-01,  1.37480833e+00,
        8.30804113e-01,  6.62937997e-01,  3.56636675e-01, -2.01253533e-01,
        1.08871246e+00,  3.37852780e-01,  7.76401192e-01,  1.74693542e+00,
        4.99266601e-01,  1.72018725e+00,  2.16994407e+00,  2.83784139e+00,
        6.11840223e-01,  3.13025983e-01, -4.78882131e-01,  7.89262379e-01,
       -1.28274732e+00, -5.89116456e-01, -1.21746989e+00,  2.71366928e+00,
        6.98850131e-01,  1.33620163e+00,  7.30816114e-01,  1.02944137e+00,
        5.56900303e-01,  1.48373316e+00,  1.96398276e+00,  1.68157587e+00,
        5.52770053e-01,  8.63105634e-01, -1.64118630e-01,  1.67962388e+00,
        2.51082802e+00,  

In [47]:
def compute_D2(yy, mcmc_iter):    
    D2 = multivariate_normal.logpdf(yy,
                               mean= ps['alpha'][mcmc_iter],
                               cov = ps['Marg_cov'][mcmc_iter])
    return D2

def compute_D(mcmc_iter, pred):
    if pred == True:
        y_pred=multivariate_normal.rvs(mean= ps['alpha'][mcmc_iter],
                        cov=ps['Marg_cov'][mcmc_iter],
                       size = data['y'].shape[0])
        return compute_D1(y_pred) - compute_D2(y_pred, mcmc_iter)

    else:
        return compute_D1(data['y']) - compute_D2(data['y'], mcmc_iter)
    
    
compute_D(100, True).shape

(677,)

In [50]:
np.arange(0,mcmc_length, 10)

array([   0,   10,   20,   30,   40,   50,   60,   70,   80,   90,  100,
        110,  120,  130,  140,  150,  160,  170,  180,  190,  200,  210,
        220,  230,  240,  250,  260,  270,  280,  290,  300,  310,  320,
        330,  340,  350,  360,  370,  380,  390,  400,  410,  420,  430,
        440,  450,  460,  470,  480,  490,  500,  510,  520,  530,  540,
        550,  560,  570,  580,  590,  600,  610,  620,  630,  640,  650,
        660,  670,  680,  690,  700,  710,  720,  730,  740,  750,  760,
        770,  780,  790,  800,  810,  820,  830,  840,  850,  860,  870,
        880,  890,  900,  910,  920,  930,  940,  950,  960,  970,  980,
        990, 1000, 1010, 1020, 1030, 1040, 1050, 1060, 1070, 1080, 1090,
       1100, 1110, 1120, 1130, 1140, 1150, 1160, 1170, 1180, 1190])

In [62]:
mcmc_length = num_samples
iterations_to_use = np.arange(0,mcmc_length, 10)
iterations_to_use_length = iterations_to_use.shape[0]
Ds = np.empty((iterations_to_use_length,2))
for i in range(iterations_to_use_length):
    Ds[i,0] = -2*np.sum(compute_D(iterations_to_use[i], pred=False))
    Ds[i,1] = -2*np.sum(compute_D(iterations_to_use[i], pred=True))

In [64]:
Ds[:,0] < Ds[:,1]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])