In [1]:
import numpy as np
import pandas as pd
import datetime
import sys
import os

from tqdm.notebook import tqdm
from codebase.plot import * 
from codebase.data import * 
from codebase.file_utils import save_obj, load_obj

from modelresultsbinary import *
%matplotlib inline

%load_ext autoreload
%autoreload 2

## Data description

In this notebook we collect experiments run on the real dataset FND (Fagerstrom Nicotine Dependence). There are 6 questions (J=6), all binary (either originally or after dichotomization). The assumed factor structure has two factors (K=2): the first factor loads on questions 1, 2, 3 and the second factor loads on questions 4, 5, 6.


The model structure, together with the related literature on this dataset and other factor-analysis papers written on it, is described in *"A confirmatory factor analysis of the Fagerstrom Test for Nicotine Dependence"* by
Chris G. Richardson and Pamela A. Ratner.


In [21]:
# Directory holding the saved artefacts of a single model run.
log_dir = "./log/20200416_143921_m7_s1m7//"

# Load the stored data dictionary first, then the posterior samples.
data, ps = [load_obj(name, log_dir) for name in ('data', 'ps')]

# Show which sample arrays were saved for this run.
ps.keys()


dict_keys(['alpha', 'yy', 'beta', 'Omega_cov'])

In [22]:
# Display the loaded data dictionary (keys 'D', 'N', 'J', 'K') as the cell output.
data

{'D': array([[1, 0, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [1, 0, 0, 1, 1, 1],
        ...,
        [1, 1, 1, 1, 0, 0],
        [1, 0, 0, 1, 0, 1],
        [1, 0, 1, 1, 0, 0]]), 'N': 565, 'J': 6, 'K': 2}

In [23]:
# Print the model source that was saved alongside this run.
# A context manager guarantees the file handle is closed; the original
# `open(...).read()` left it open until garbage collection.
# os.path.join also works when log_dir has no trailing slash.
with open(os.path.join(log_dir, 'model.txt')) as model_file:
    print(model_file.read())

data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  int<lower=0, upper=1> DD[N, J];
}

transformed data{
  vector[J] zeros_J = rep_vector(0, J);
  cov_matrix[J] I_J = diag_matrix(rep_vector(1, J));
}

parameters {
  vector[J] alpha;
  vector[J-1] beta_free;
  vector[N] zz;
  matrix[N,J] uu;
  cov_matrix[J] Omega_cov;
}

transformed parameters{
  matrix[N,J] yy;
  vector[J] beta;
  // set ones
  beta[1] = 1;
  // set the free elements
  beta[2 : J] = beta_free[1:(J-1)];
  for (n in 1:N) yy[n,] = to_row_vector(alpha) + to_row_vector(zz[n]*beta) + uu[n,];
}

model {
  to_vector(beta_free) ~ normal(0, 1);
  to_vector(alpha) ~ normal(0, 10);
  to_vector(zz) ~ normal(0, 1);
  Omega_cov ~ inv_wishart(J+6, I_J);
  for (n in 1:N) uu[n,] ~ multi_normal(zeros_J, Omega_cov);
  for (j in 1:J) DD[, j] ~ bernoulli_logit(yy[, j]);
}



In [24]:
# Bookkeeping about the sampler run, derived from the saved draws.
num_chains = 1
# Both counts are taken from the leading dimension of the saved alpha samples.
# NOTE(review): this treats the warmup length as equal to the number of kept
# draws — confirm against the actual sampler settings for this run.
num_samples = num_warmup = ps['alpha'].shape[0]
num_iter = num_warmup + num_samples

In [25]:

# Resolve the sign indeterminacy of the loadings: every posterior draw is
# oriented so that its first loading is non-negative.
#
# Changes relative to the original per-draw loop:
#   * the whole draw is flipped, including the first loading itself, so the
#     draw is consistently oriented and re-running this cell is a no-op
#     (the original left beta[i, 0] negative and flipped the rest back and
#     forth on every re-run);
#   * a first loading of exactly 0 leaves the draw untouched instead of
#     multiplying the remaining loadings by np.sign(0) == 0;
#   * the operation is vectorised over all draws in ps['beta'].
first_loading = ps['beta'][:, 0]
flip = np.where(first_loading < 0, -1, 1)
# flip[:, None] broadcasts one sign per draw across that draw's loadings.
ps['beta'] *= flip[:, None]
    

In [26]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
    plots.append(plot_trace(ps['beta'][:,j],
                            true_value=0,
         title = 'Posterior distribution for beta(%s)'%(j)).\
                 options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [27]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
        plots.append(plot_trace(ps['alpha'][:,j],
             title = 'Posterior distribution for alpha(%s)'%(j)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [28]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['K']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['Phi_cov'][:,j,k],
             title = 'Posterior distribution for Phi_cov(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=True) # use same y-range for all plots?

layout.cols(2)


KeyError: 'Phi_cov'

In [29]:
# Number of simulations requested from get_PPP.
nsim = 200
PPP_vals, Dy, Dystr = get_PPP(data, ps, nsim)

# Percentage (relative to nsim) of rows whose first column is strictly
# smaller than the second column.
first_below_second = PPP_vals[:, 0] < PPP_vals[:, 1]
100 * np.sum(first_below_second) / nsim

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




37.5