In [1]:
import datetime
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma, bernoulli, expon
from statsmodels.tsa.stattools import acf
from tqdm.notebook import tqdm

from codebase.plot import *
from codebase.data import *
from codebase.file_utils import save_obj, load_obj

from modelresultsbinary import *
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
# Sampler settings. The total iteration count is derived from the warmup and
# retained-draw counts so the three numbers cannot drift apart.
num_chains = 1
num_warmup = 1000
num_samples = 1000
num_iter = num_warmup + num_samples

In [3]:
# Directory of a previously saved run; the objects named 'data' and 'ps' are
# read back from it with load_obj.
log_dir = "./log/bin_sim_2factor/20191218_192648_bin2fsim0m1/"
data = load_obj('data', log_dir)
ps = load_obj('ps', log_dir)
# Show which quantities `ps` contains.
ps.keys()

dict_keys(['beta', 'alpha', 'zz', 'Phi_cov', 'yy'])

In [4]:
# Average ps['beta'] over its first axis and round to two decimals.
np.round(np.mean(ps['beta'], axis=0), decimals=2)

array([[1.  , 0.  ],
       [1.03, 0.  ],
       [1.04, 0.  ],
       [0.  , 1.  ],
       [0.  , 0.97],
       [0.  , 1.66]])

In [5]:
# Print the model definition stored next to the saved run.
# A context manager closes the file handle deterministically, and os.path.join
# builds the path whether or not log_dir ends with a separator.
with open(os.path.join(log_dir, 'model.txt')) as model_file:
    print(model_file.read())

data {
  int<lower=1> N;
  int<lower=1> K;
  int<lower=1> J;
  int<lower=0, upper=1> DD[N, J];
}

transformed data{
  vector[K] zeros_K = rep_vector(0, K);
  cov_matrix[K] I_K = diag_matrix(rep_vector(1, K));
}

parameters {
  vector[J] alpha;
  matrix[2,K] beta_free; // 2 free eleements per factor
  cov_matrix [K] Phi_cov;
  matrix[N,K] zz;
}

transformed parameters{
  matrix[J,K] beta;
  matrix[N,J] yy;

  for(j in 1:J) {
    for (k in 1:K) beta[j,k] = 0;
  }
  
  // set ones
  for (k in 1:K) beta[1+3*(k-1), k] = 1;
  // set the free elements
  for (k in 1:K) beta[2+3*(k-1) : 3+3*(k-1), k] = beta_free[1:2,k];

  for (n in 1:N) yy[n,] = to_row_vector(alpha) + zz[n,] * beta';
}
  
model {
  to_vector(beta_free) ~ normal(0, 1);
  to_vector(alpha) ~ normal(0, 10);
  Phi_cov ~ inv_wishart(J+4, I_K);
  for (n in 1:N) to_vector(zz[n,]) ~ multi_normal(zeros_K, Phi_cov);
  for (j in 1:J) DD[, j] ~ bernoulli_logit(yy[, j]);
}



In [20]:
# Bernoulli pmf of the pattern '000000' under the probabilities returned by
# get_avg_probs, averaged over axis 0, multiplied across the remaining axis
# and scaled by data['N'].
# NOTE(review): presumably the expected frequency of the all-zero response
# pattern - confirm against get_avg_probs / to_nparray_data.
piavg = get_avg_probs(data, ps, 0)
a = bernoulli.pmf(k=to_nparray_data('000000'), p=piavg)
data['N'] * np.prod(np.mean(a, axis=0))
# Disabled alternatives kept from the original cell:
# data_ptrn = to_str_pattern(data['D'])
# get_Ey(data_ptrn, piavg, data['N'])
# get_Oy(data_ptrn)


14.384899896494776

In [22]:
# nsim_N is the third argument handed to get_PPP.
# NOTE(review): presumably the number of simulated replications - confirm
# against get_PPP's signature.
nsim_N = 100
# get_PPP returns three objects; only PPP_vals is used by the cells below.
PPP_vals, Dy, Dystr = get_PPP(data, ps, nsim_N)


HBox(children=(IntProgress(value=0), HTML(value='')))




In [23]:
# Share of rows of PPP_vals whose first column is smaller than the second.
# Taking the mean of the boolean comparison divides by the actual number of
# rows, so the result stays correct if the number of simulations changes
# (the original divided by a hard-coded 100).
# NOTE(review): presumably this is the posterior predictive p-value - confirm
# which column holds the observed vs. the replicated discrepancy.
np.mean(PPP_vals[:, 0] < PPP_vals[:, 1])

0.49

In [40]:
# Display the raw PPP_vals array (the recorded output shows two columns per row).
PPP_vals

array([[46.24040459, 43.56531956],
       [47.20398258, 42.15950731],
       [47.34874835, 48.95267567],
       [46.71120249, 55.42549867],
       [46.51517744, 28.94442427],
       [46.37464064, 47.8031552 ],
       [47.40106609, 43.44838765],
       [46.76040232, 49.38523909],
       [47.34331449, 49.16118705],
       [47.88149749, 46.25791591],
       [46.71281275, 62.26376136],
       [49.03370194, 65.82416735],
       [49.47332408, 52.02047501],
       [47.67363075, 41.65922408],
       [47.69936941, 50.26606575],
       [49.0623365 , 61.20354034],
       [47.11150085, 46.28663174],
       [46.84561457, 55.75317054],
       [45.65033209, 45.34526645],
       [46.33605798, 65.78872285],
       [46.20296572, 50.67365824],
       [45.7638606 , 58.00953312],
       [48.51129372, 59.43284397],
       [46.69771864, 50.54783114],
       [46.73373188, 35.25370078],
       [46.27652534, 38.55449864],
       [46.091068  , 41.97649655],
       [45.51711523, 47.1566883 ],
       [48.61065417,

In [77]:
# Generate a data set with gen_data_binary_1factor and tabulate how often each
# response pattern (string-encoded by to_str_pattern) occurs.
# NOTE(review): this rebinds `data`, which earlier in the notebook holds the
# object loaded from log_dir - later cells that pair `data` with `ps` see the
# new object when the notebook is run top to bottom.
data = gen_data_binary_1factor(
    10000,
    c=0.001,
    random_seed=86,
)
data_ptrn = to_str_pattern(data['D'])
data_ptrn.value_counts()

011111    186
010101    184
000001    183
111110    182
100011    179
100010    175
101111    173
111000    172
001001    171
101000    171
011011    171
001100    170
000111    170
100100    169
100000    168
110010    164
101001    164
101010    163
111011    162
001101    162
101110    162
110001    160
011100    159
111001    159
101100    159
001111    158
011110    157
110011    156
100110    156
111100    156
         ... 
001110    154
101101    154
001010    153
110000    153
010010    153
010000    152
010011    152
010111    151
001011    151
011010    150
000100    150
000101    149
110111    149
100001    147
101011    147
000000    146
011101    146
111111    144
011000    143
110100    143
001000    142
000010    142
110101    142
010001    141
010110    140
010100    139
111101    137
100101    134
100111    129
000110    127
Length: 64, dtype: int64

In [74]:
# Per-column mean of data['D'] (average taken over the first axis).
np.mean(data['D'], axis=0)

array([0.501 , 0.4959, 0.4998, 0.4928, 0.4946, 0.4921])

In [73]:
# Per-column mean of expit(data['y']) (average taken over the first axis).
# NOTE(review): `expit` is not among the names imported explicitly in the first
# cell; presumably it is supplied by one of the star imports - confirm.
np.mean(expit(data['y']), axis=0)

array([0.49596576, 0.49704002, 0.49393729, 0.4959057 , 0.49336905,
       0.49482536])

In [84]:
# Generate a data set with gen_data_binary and tabulate how often each
# response pattern (string-encoded by to_str_pattern) occurs.
# NOTE(review): this rebinds `data` again; cells below that combine `data`
# with `ps` will use this object when the notebook is run top to bottom.
data = gen_data_binary(
    10000,
    c=0.0001,
    random_seed=8976,
)
data_ptrn = to_str_pattern(data['D'])
data_ptrn.value_counts()

011001    186
010110    178
011011    173
100111    171
101100    170
010111    170
110011    170
110001    170
000011    168
000111    168
000110    168
111000    166
010011    165
011110    164
110110    164
011111    164
010100    164
100011    162
100001    162
110000    162
101110    162
011010    161
000001    159
101001    159
011000    158
000101    158
111100    158
111001    158
001100    158
101000    157
         ... 
000000    154
010000    154
001111    154
001000    154
001010    153
111111    153
000010    153
001011    153
110111    153
111110    153
100100    152
100000    152
000100    151
101101    151
001001    151
010010    150
100101    149
101111    148
010101    147
111011    146
101010    146
100110    143
001101    141
010001    141
101011    141
111010    140
110100    139
110010    138
011101    135
111101    132
Length: 64, dtype: int64

In [46]:
# Tabulate the response patterns of data['D'] with get_Oy (the recorded output
# is a dict mapping each pattern string to a count).
# NOTE(review): in top-to-bottom order `data` is the object produced by
# gen_data_binary(10000, ...) in the preceding cell, while `ps` was loaded
# from log_dir - confirm the two are meant to be used together here.
m = 0
# piavg is only consumed by the disabled get_Ey call below.
piavg = get_avg_probs(data, ps, m)
data_ptrn = to_str_pattern(data['D'])
# Disabled alternatives (predictive data instead of observed data, and the
# get_Ey counterpart of get_Oy):
# ppdata = get_prob_pred_data(data, ps, m)
# data_ptrn = to_str_pattern(ppdata)
# get_Ey(data_ptrn, piavg, data['N'])
get_Oy(data_ptrn)


{'000000': 31,
 '000001': 13,
 '000010': 21,
 '000011': 21,
 '000100': 12,
 '000101': 14,
 '000110': 15,
 '000111': 20,
 '001000': 21,
 '001001': 7,
 '001010': 7,
 '001011': 13,
 '001100': 7,
 '001101': 20,
 '001110': 12,
 '001111': 17,
 '010000': 17,
 '010001': 21,
 '010010': 15,
 '010011': 11,
 '010100': 11,
 '010101': 8,
 '010110': 12,
 '010111': 21,
 '011000': 15,
 '011001': 9,
 '011010': 12,
 '011011': 11,
 '011100': 13,
 '011101': 11,
 '011110': 10,
 '011111': 21,
 '100000': 20,
 '100001': 15,
 '100010': 15,
 '100011': 9,
 '100100': 16,
 '100101': 13,
 '100110': 10,
 '100111': 19,
 '101000': 18,
 '101001': 14,
 '101010': 15,
 '101011': 12,
 '101100': 16,
 '101101': 14,
 '101110': 15,
 '101111': 18,
 '110000': 17,
 '110001': 17,
 '110010': 16,
 '110011': 19,
 '110100': 16,
 '110101': 18,
 '110110': 15,
 '110111': 17,
 '111000': 23,
 '111001': 12,
 '111010': 18,
 '111011': 17,
 '111100': 20,
 '111101': 22,
 '111110': 14,
 '111111': 31}

In [95]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['beta'][:,j,k],
             true_value=data['beta'][j,k],
             title = 'Posterior distribution for beta(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [96]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['K']):
    for k in range(data['K']):
        plots.append(plot_trace(ps['Phi_cov'][:,j,k],
             true_value=data['Phi_cov'][j,k],
             title = 'Posterior distribution for Phi_cov(%s,%s)'%(j,k)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [17]:
%%opts Layout [fig_size=200]
plots = []
for j in range(data['J']):
        plots.append(plot_trace(ps['alpha'][:,j],
             true_value=data['alpha'][j],
             title = 'Posterior distribution for alpha(%s)'%(j)).\
                     options(fig_inches=8, aspect=3))
layout = hv.Layout(plots).options(show_title = True,
                                  vspace = .3,
                                  absolute_scaling=False,
                                  normalize=False) # use same y-range for all plots?

layout.cols(2)


In [None]:
# Average the 'uu' draws over their first axis and show the result as a heatmap.
# NOTE(review): the ps.keys() output recorded earlier in this notebook lists
# only beta, alpha, zz, Phi_cov and yy - confirm that this `ps` carries 'uu'.
mu = np.mean(ps['uu'], axis=0)
plt.figure(figsize=(6, 10))
ax = sns.heatmap(mu, cbar=True)

In [None]:
# Column means of mu (scaled by 1e4), followed by a bar chart of the column
# means of |mu|.
print(np.round(np.mean(mu, axis=0) * 1e4, decimals=2))
(
    hv.Bars(np.mean(np.abs(mu), axis=0))
    .options(color='blue', xrotation=90)
    .options(fig_inches=8, aspect=3)
)

In [None]:
# Overall mean of mu (scaled by 1e3) for the first 900 rows and for the rest.
print(
    "Avg 0-900 = %.2f \nAvg 901-1000 = %.2f"
    % (np.mean(mu[:900]) * 1e3, np.mean(mu[900:]) * 1e3)
)

In [None]:
# Average the 'uu' draws of ps1 over their first axis and show them as a heatmap.
# NOTE(review): `ps1` is not defined in any cell shown in this notebook -
# confirm where it is loaded.
mu1 = np.mean(ps1['uu'], axis=0)
plt.figure(figsize=(6, 10))
ax = sns.heatmap(mu1, cbar=True)

In [None]:
# Column means of mu1 (scaled by 1e4), followed by a bar chart of the column
# means of |mu1|.
print(np.round(np.mean(mu1, axis=0) * 1e4, decimals=2))
(
    hv.Bars(np.mean(np.abs(mu1), axis=0))
    .options(color='blue', xrotation=90)
    .options(fig_inches=8, aspect=3)
)

In [None]:
# Overall mean of mu1 (scaled by 1e3) for the first 900 rows and for the rest.
# This cell closes the mu1 sequence (heatmap, column summary, split means),
# so it reports mu1; the original repeated the `mu` statistics printed earlier.
# NOTE(review): confirm mu1 was the intended array.
print(
    "Avg 0-900 = %.2f \nAvg 901-1000 = %.2f"
    % (np.mean(mu1[:900]) * 1e3, np.mean(mu1[900:]) * 1e3)
)

In [None]:
# Column means of the first 900 rows of mu, scaled by 1e3.
np.mean(mu[:900], axis=0) * 1e3

In [None]:
# Column means of the rows of mu from index 900 onward, scaled by 1e3.
np.mean(mu[900:], axis=0) * 1e3

In [None]:
# Rows from index 900 onward and columns from index 3 onward of data['y'].
data['y'][900:, 3:]

In [None]:
# Plot the standard exponential pdf between its 1st and 99th percentiles.
# `plt` and `expon` come from the notebook's import cell, so this cell (and the
# next one, which also uses `expon`) no longer depends on mid-notebook imports.
fig, ax = plt.subplots(1, 1)
x = np.linspace(expon.ppf(0.01), expon.ppf(0.99), 100)
ax.plot(x, expon.pdf(x), 'r-', lw=5, alpha=0.6, label='expon pdf')
ax.legend();  # the curve is labelled, so draw the legend; ';' hides the repr

In [None]:
# Draw 1000 exponential variates with scale 1/5 and report their 0th and 95th
# percentiles. random_state pins the draw so the numbers are reproducible on
# Restart & Run All (the original draw was unseeded).
x = expon.rvs(loc=0, scale=1/5, size=1000, random_state=0)
np.percentile(x, q=[0, 95])

## Residual Analysis

In [None]:
# Average the 'uu' draws of ps1 over their first axis, take absolute values,
# average across axis 1 to get one value per row, and plot the 20 largest.
# Built as a single chain instead of mutating `res` with inplace=True.
# NOTE(review): `ps1` is not defined in any cell shown in this notebook -
# confirm where it is loaded.
res = (
    pd.DataFrame(
        np.mean(np.abs(np.mean(ps1['uu'], axis=0)), axis=1),
        columns=['avg_u'],
    )
    .reset_index()  # keeps the original row number as an 'index' column
    .sort_values('avg_u', ascending=False)
)
hv.Bars(res[:20]).options(color='blue', xrotation=90).options(fig_inches=8, aspect=3)
