In [1]:
import numpy as np
import pandas as pd
import pystan
import datetime
import sys
import os
from scipy.stats import norm, multivariate_normal
from codebase.data import gen_data
from codebase.file_utils import save_obj, load_obj

In [8]:
existing_directory = None
task_handle = 'ppp-sim'
random_seed = 98

In [9]:

if existing_directory is None:
    print("\n\nCreating directory")
    nowstr = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_') # ISO 8601 format
    log_dir =  "./log/"+nowstr+"%s/" % task_handle

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    print("\n\nGenerating data")
    data = gen_data(500, 6, 2, random_seed = random_seed)
    print("\n\nN = %d, J= %d, K =%d"%(data['N'],data['J'], data['K'] ))
    print("\nSaving data\n\n")
    save_obj(data, 'data', log_dir)

    with open('./codebase/stan_code/cont/CFA/marg_simulation.stan', 'r') as file:
        model_code = file.read()
    print("\n\nCompiling model")
    sm = pystan.StanModel(model_code=model_code, verbose=False)
    print("\n\nSaving compiled model in directory %s"%log_dir)
    save_obj(sm, 'sm', log_dir)

else:
    log_dir = args.existing_directory
    if log_dir[-1] != "/":
        print("\n\nAppending `/`-character at the end of directory")
        log_dir = log_dir+ "/"
    print("\n\nLoading existing data from %s"%log_dir)
    data = load_obj('data', log_dir)
    print("\n\nReading existing compiled model from directory %s"%log_dir)
    sm = load_obj('sm', log_dir)


stan_data = dict(N = data['N'], K = data['K'], J = data['J'], yy = data['y'])

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_bd55769ba2d57dbaac4f60241e1cf869 NOW.




Creating directory


Generating data


N = 500, J= 6, K =2

Saving data




Compiling model


Saving compiled model in directory ./log/20190707_103018_ppp-sim/


In [10]:
print_model = True
num_warmup = 1200
num_samples = 1000
num_chains = 1 

In [11]:
if bool(print_model):
    print(model_code)

print("\n\nFitting model.... \n\n")

fit_run = sm.sampling(data=stan_data,
    iter=num_samples + num_warmup,
    warmup=num_warmup, chains=num_chains)

print("\n\nSaving fitted model in directory %s"%log_dir)
save_obj(fit_run, 'fit', log_dir)

print("\n\nSaving posterior samples in %s"%log_dir)
param_names = ['Omega_beta', 'beta', 'V_corr', 'V' , 'alpha', 'sigma', 'sigma_z', 'uu']

stan_samples= fit_run.extract(permuted=False, pars=param_names)  # return a dictionary of arrays

if num_chains ==1:
    ps = dict()
    for name in param_names:
        ps[name] = np.squeeze(stan_samples[name])
else:
    ps = stan_samples
save_obj(ps, 'ps', log_dir)


data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  matrix[N,J] yy;
}

transformed data{
  real<lower=0> c = 1;
  vector[J] zeros = rep_vector(0, J);
  cov_matrix[J] I_c = diag_matrix(rep_vector(c, J));
}

parameters {
  vector<lower=0>[J] sigma;
  vector<lower=0>[K] sigma_z;
  vector[J] alpha;
  matrix[2,K] beta_free; // 2 free eleements per factor
  matrix[J-3,K] beta_zeros; // 3 zero elements per factor
  cholesky_factor_corr[K] V_corr_chol;
  matrix[N,J] uu;
  cov_matrix[J] Sigma_u;
}

transformed parameters{
  cov_matrix[J] Sigma_epsilon;
  matrix[J,K] beta;
  cov_matrix [K] V ;
  cov_matrix[J] Omega;

  Sigma_epsilon = diag_matrix(square(sigma));
  V = multiply_lower_tri_self_transpose(diag_pre_multiply(sigma_z, V_corr_chol));

  // set ones
  for (k in 1:K) beta[1+3*(k-1), k] = 1;

  // set the free elements
  for (k in 1:K) beta[2+3*(k-1) : 3+3*(k-1), k] = beta_free[1:2,k];

  // set the zero elements
  beta[4:J, 1] = beta_zeros[1:(J-3), 1];
  beta[1:(J-3), K] = beta

To run all diagnostics call pystan.check_hmc_diagnostics(fit)




Saving fitted model in directory ./log/20190707_103018_ppp-sim/


Saving posterior samples in ./log/20190707_103018_ppp-sim/


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


NameError: name 'args' is not defined

In [16]:
from codebase.plot import * 

%matplotlib inline

%load_ext autoreload
%autoreload 2

In [18]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['beta'][:,j,k],
             true_value=data['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [20]:
# %%output info=True 
plots = []
for j in range(data['K']):
    for k in range(data['K']):
        if j!=k:
            plots.append(plot_trace(ps['V_corr'][:,j,k],
                                    true_value = data['V_corr'][j,k],
            title = 'Posterior distribution for V_corr(%s,%s)'%(j,k)).options(fig_inches=10, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
#     tight = True, tight_padding=10,
    vspace = .5,
    absolute_scaling=False, fig_size=100)
layout.cols(2)