In [1]:
import numpy as np
import pandas as pd
import pystan
from scipy.stats import norm, multivariate_normal, invwishart, invgamma, bernoulli
from scipy.special import expit
from statsmodels.tsa.stattools import acf


import datetime
import sys
import os

from codebase.plot import * 
from codebase.post_process import * 
from codebase.file_utils import save_obj, load_obj

%matplotlib inline

%load_ext autoreload
%autoreload 2

  from pandas.core import datetools


In [2]:
df = pd.read_csv("../dat/LSAT.csv")
df.head()

Unnamed: 0,x0,x1,x2,x3,x4
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,1
4,0,0,0,0,1


In [3]:
np.random.seed(121)
nsim_data = 100
yy = np.zeros((nsim_data, df.shape[1]))
DD = bernoulli.rvs(p=expit(yy)); DD


array([[0, 0, 0, 0, 1],
       [0, 1, 1, 0, 1],
       [1, 0, 0, 1, 0],
       [1, 0, 1, 0, 1],
       [0, 0, 0, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 0, 1, 1, 0],
       [0, 0, 0, 1, 1],
       [1, 0, 1, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1],
       [1, 1, 1, 1, 1],
       [1, 0, 1, 1, 1],
       [1, 0, 0, 1, 0],
       [1, 1, 1, 1, 1],
       [0, 1, 0, 1, 1],
       [0, 0, 0, 0, 1],
       [1, 1, 1, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 1, 1, 1],
       [0, 0, 1, 0, 0],
       [1, 1, 1, 1, 1],
       [1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0],
       [0, 0, 1, 1, 1],
       [0, 0, 1, 1, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1],
       [1, 1, 1, 1, 0],
       [1, 1, 0, 0, 1],
       [0, 1, 1, 0, 1],
       [1, 1, 0, 0, 0],
       [1, 0, 1, 1, 1],
       [1, 0, 0, 0, 1],
       [0, 1, 1, 0, 0],
       [1, 0, 0, 1, 1],
       [1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0],
       [0, 0, 1,

In [4]:
data = dict()
data['N'] = df.shape[0]
data['J'] = df.shape[1]
data['K'] = 1
data['D'] = np.vstack([df[:-100].values, DD])

In [5]:
stan_data = dict(N = data['N'], K = data['K'], J = data['J'], DD = data['D'])


In [6]:
with open('./codebase/stan_code/model6_aug.stan', 'r') as file:
    model_code = file.read()


In [7]:
sm = pystan.StanModel(model_code=model_code, verbose=False)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_7af366de083ee8d4a8b307ab27daa81d NOW.


In [8]:
nowstr = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_') # ISO 8601 format
nowstr

'20190326_141909_'

In [9]:
task_id = 'LSAT1_exp1'
log_dir =  "./log/"+nowstr+"%s/" % task_id
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

In [10]:
num_chains = 1
num_samples = 1000
num_warmup = 1000
num_iter = num_samples + num_warmup

In [11]:
fit = sm.sampling(data=stan_data, iter=num_iter, chains=num_chains)


To run all diagnostics call pystan.check_hmc_diagnostics(fit)


In [12]:
save_obj(sm, 'sm', log_dir)
save_obj(fit, 'fit', log_dir)


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


In [13]:
# sm = load_obj('sm', './log/20190325_160644_LSAT1/')
# fit = load_obj('fit', './log/20190325_160644_LSAT1/')


In [14]:
param_names = ['Omega', 'uu', 'beta', 'mu']

stan_samples= fit.extract(permuted=False, pars=param_names)  # return a dictionary of arrays

if num_chains ==1:
    ps = dict()
    for name in param_names:
        ps[name] = np.squeeze(stan_samples[name])
else: 
    ps = stan_samples

In [31]:
# Check that u's cover zero
get_non_zeros(ps['uu'], 10, 90)

(array([], dtype=int64), array([], dtype=int64))

In [16]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in [80,950]:
    for j in range(2):
            plots.append(plot_trace(ps['uu'][:,i,j],
                     title = 'Posterior distribution for u(%s,%s)'%(i,j)))
layout = hv.Layout(plots)
layout.cols(1)

In [32]:
plot_density(ps['uu'][:,:900,0],
                     title = 'u1') * \
    plot_density(ps['uu'][:,901,0],
                     title = 'u2') * \
    plot_density(ps['uu'][:,902,0],
                     title = 'u3') 

In [26]:
a = np.mean(ps['uu'][:,:900,0]); a
b = np.mean(ps['uu'][:,901:,0]); b
print(a, b)

0.0012160226258571142 -0.010412617474843058


In [98]:
%%opts BoxWhisker {+axiswise} [width=1000, height=300, tools=['hover']] 

plots = []
for i in range(20):
    plots.append(hv.BoxWhisker((i, ps['uu'][:,i,0]), 'Index', 'Value',\
                               label='Standard Data').options(box_color='blue', xrotation=90))
for i in range(901,920):
    plots.append(hv.BoxWhisker((i, ps['uu'][:,i,0]), 'Index', 'Value',\
                               label='Distorted').options(box_color='red', xrotation=90))
layout = hv.Overlay(plots)
layout.cols(1)

In [19]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['J']):
    for j in range(data['J']):
            plots.append(plot_trace(ps['Omega'][:,i,j],
                     title = 'Posterior distribution for Omega(%s,%s)'%(i,j)))
layout = hv.Layout(plots)
layout.cols(1)

In [20]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['J']):
    for j in range(data['J']):
            plots.append(plot_trace(acf(ps['Omega'][:,i,j]),
                     title = 'Autocorrelation of Omega(%s,%s)'%(i,j)))
layout = hv.Layout(plots)
layout.cols(1)

In [21]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['J']):
            plots.append(plot_trace(ps['mu'][:,i],
                     title = 'Posterior distribution for mu(%s)'%(i)))
layout = hv.Layout(plots)
layout.cols(1)

In [22]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['J']):
            plots.append(plot_trace(acf(ps['mu'][:,i]),
                     title = 'Autocorrelation of mu(%s)'%(i)))
layout = hv.Layout(plots)
layout.cols(1)

In [23]:
gamma = np.empty((num_samples, data['J'], data['K']))

for i in range(num_samples):
    e_set = get_topn_eig(np.outer(ps['beta'][i], ps['beta'][i]), #ps['beta'][i] @ ps['beta'][i].T,
                         data['K'])
    gamma[i] = e_set['P']
#     lambdas[i] = e_set['L']
    
ps['gamma']= gamma
# ps['lambda'] = lambdas

In [24]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['J']):
    for j in range(data['K']):
            plots.append(plot_trace(ps['gamma'][:,i,j],
                     title = 'Posterior distribution for gamma(%s,%s)'%(i,j)))
layout = hv.Layout(plots)
layout.cols(1)

In [25]:
%%opts Curve {+axiswise} [width=600, height=200, tools=['hover']] 
plots = []
for i in range(data['J']):
    for j in range(data['K']):
            plots.append(plot_trace(acf(ps['gamma'][:,i,j]),
                     title = 'Autocorrelation of Omega(%s,%s)'%(i,j)))
layout = hv.Layout(plots)
layout.cols(1)