In [None]:
import numpy as np
import pandas as pd
import scipy.linalg as linalg
import matplotlib.pyplot as plt
import math
import pickle as pkl
import time
%matplotlib inline

In [None]:
import sys
sys.path.insert(0, '..')
from dhmc.dhmc_sampler import DHMCSampler

### Load SECOM data

In [None]:
# Read the whitespace-delimited SECOM feature table and the first column of the
# outcome file; neither file has a header row.
# The separator is written as a raw string: '\s' inside a plain string literal is
# an invalid escape sequence, which newer Python versions warn about.
secom = pd.read_table('./secom_features.txt', sep=r'\s+', header=None)
y = pd.read_table('./secom_outcome.txt', sep=r'\s+', header=None)[0]

In [None]:
# Remove predictors with too many na's
max_na_pred = 20
na_per_feature = secom.isnull().sum(axis=0)
# Select the columns by label: DataFrame.drop is label-based, so positional
# indices (as returned by np.where) are only correct while labels happen to
# coincide with positions.
index_many_na = secom.columns[na_per_feature > max_na_pred]
secom = secom.drop(index_many_na, axis=1)
print('{:d} features were dropped.'.format(index_many_na.size))

# Remove incomplete cases
# Row labels are taken from secom's own index and applied to both secom and y.
index_drop = secom.index[secom.isnull().any(axis=1)]
secom = secom.drop(index_drop, axis=0)
y = y.drop(index_drop)

In [None]:
# DataFrame.as_matrix() no longer exists in current pandas; to_numpy() is the
# supported way to obtain the underlying array.
X = secom.to_numpy()
# Per-feature variance, computed once and reused for the count and the filter.
feature_var = np.var(X, 0)
print('Removing additional {:d} features for identifiability.'.format(np.sum(feature_var == 0)))
X = X[:, feature_var > 0]
# Standardize every remaining feature to zero mean and unit standard deviation.
X = (X - np.mean(X, 0)) / np.std(X, 0)
X = np.hstack((np.ones((X.shape[0], 1)), X)) # Intercept
# np.asarray accepts both a pandas Series and an ndarray, so re-running this
# cell after y has already been converted does not fail.
y = np.asarray(y, dtype=float)

n_param = X.shape[1]
n_disc = n_param # No smooth conditional densities.

### Load the (cleaned) SECOM data.

In [None]:
y = np.load('secom_outcome.npy')
X = np.load('secom_design_matrix.npy') # With intercept.

# Derive the dimensions from the loaded design matrix so that later cells find
# n_param and n_disc on a fresh kernel even when only this load path was run.
n_param = X.shape[1]
n_disc = n_param # No smooth conditional densities.

### Define functions to compute the posterior.

A function to compute the log posterior density and its gradient.  

In [None]:
# Computes the log posterior density and its gradient. 
def f(theta, req_grad=True):
    """Evaluate the log posterior density at ``theta``.

    The value is ``-sum(theta ** 2) / 2`` (prior term) plus the number of
    observations for which ``y`` and the linear predictor ``X @ theta`` have a
    strictly positive product. ``X`` and ``y`` are read from the notebook
    namespace.

    Parameters
    ----------
    theta : array of coefficients, one per column of ``X``.
    req_grad : accepted for interface compatibility; this implementation does
        not read it.

    Returns
    -------
    logp : the log density value described above.
    grad : an all-zero vector of length ``len(theta)``, returned in the
        gradient slot.
    aux : tuple ``(loglik, y_hat)`` holding the agreement count and the linear
        predictor, in the form ``f_update`` expects.
    """
    beta = theta

    # Contribution from the prior.
    logp = - np.sum(beta ** 2) / 2

    # Contribution from the likelihood: count of sign agreements.
    y_hat = np.dot(X, beta)
    loglik = np.count_nonzero(y * y_hat > 0)
    logp += loglik

    aux = (loglik, y_hat)
    return logp, np.zeros(len(theta)), aux

A function to compute the difference in the log conditional density for a given parameter index.

In [None]:
def f_update(beta, dbeta, index, aux):
    """Change in log posterior density when ``beta[index]`` moves by ``dbeta``.

    ``aux`` is the ``(loglik, y_hat)`` pair describing the current state. The
    function returns ``(logp_diff, aux_new)`` where ``aux_new`` is the
    corresponding pair for the shifted state. Neither ``beta`` nor the arrays
    inside ``aux`` are modified. ``X`` and ``y`` are read from the notebook
    namespace.
    """
    prev_count, prev_fit = aux

    # Only column `index` of X contributes to the change in the linear predictor.
    shifted_fit = prev_fit + X[:, index] * dbeta

    # Change in the prior term.
    prior_diff = (beta[index] ** 2 - (beta[index] + dbeta) ** 2) / 2

    # Change in the likelihood term: difference in sign-agreement counts.
    n_agree = np.count_nonzero(y * shifted_fit > 0)

    return prior_diff + (n_agree - prev_count), (n_agree, shifted_fit)

#### Initial state for MCMC.

In [None]:
# Start the first coefficient at the log-odds of the empirical frequency of
# y == 1 and every other coefficient at zero.
intercept0 = np.log(np.mean(y == 1) / (1 - np.mean(y == 1)))
beta0 = np.zeros(X.shape[1])
beta0[0] = intercept0
# theta0 is the same array object as beta0 (no copy is made here).
theta0 = beta0

#### Test the gradient and updating function.

In [None]:
# Build the sampler from f and f_update, passing an all-ones scale vector.
scale = np.ones(n_param)
dhmc = DHMCSampler(f, f_update, n_disc, n_param, scale)
# The trailing semicolon suppresses the notebook echo of the return value.
dhmc.test_cont_grad(theta0, sd=.01, n_test=10);
# NOTE: this rebinds the kernel-global name `theta`, which later cells read.
_, theta, logp_fdiff, logp_diff = \
    dhmc.test_update(theta0, sd=10, n_test=100)

In [None]:
# Evaluate f once to obtain the aux tuple that f_update needs, then time a full
# evaluation against a single-coordinate update.
logp, _, aux = f(theta0)
%timeit f(theta0)
%timeit f_update(theta0, .1, 1, aux)

In [None]:
# Line-by-line profile of one call to f; needs the line_profiler extension.
%load_ext line_profiler
%lprun -f f f(theta0)

In [None]:
# NOTE(review): f as defined in this notebook has the signature
# f(theta, req_grad=True) and no `opt` parameter, so both calls below raise
# TypeError on a fresh kernel. Presumably left over from an earlier version of f
# -- confirm and either restore that version or remove this cell.
%timeit f(theta0, opt=True)
%timeit f(theta0, opt=False)

### Run samplers

In [None]:
# Eigenvalues of the Gram matrix X'X, plotted on a log10 scale.
# eigvalsh returns them in ascending order, so the curve is non-decreasing.
Phi = np.dot(X.T, X)
eig_val = np.linalg.eigvalsh(Phi)
plt.plot(np.log10(eig_val))
plt.show()

In [None]:
%%prun
# Run the DHMC sampler under the profiler.
seed = 1
n_burnin = 10 ** 2
n_sample = 10 ** 3
# dt and nstep are two-element; presumably (min, max) ranges for the step size
# and the number of steps -- confirm against DHMCSampler.run_sampler.
dt = .8 * np.array([.7, 1]) 
nstep = [10, 15] # [60, 75]
samples, logp_samples, accept_prob, nfevals_per_itr, time_elapsed = \
    dhmc.run_sampler(theta0, dt, nstep, n_burnin, n_sample, seed=seed)
    
# Drop the first n_burnin rows/entries of the returned draws and log densities.
samples = samples[n_burnin:, :]
logp_samples = logp_samples[n_burnin:]
# Independent copy, kept because later cells rebind `samples`.
dhmc_samples = samples.copy()

In [None]:
thin = 10

# Bundle the DHMC run for later cells. Only 'samples' is thinned (every
# `thin`-th row); 'logp' keeps every entry of logp_samples and 'emp_cov' is the
# covariance of the unthinned samples.
mcmc_output = {
    'samples': samples[::thin, :],
    'logp': logp_samples,
    'accept_prob': accept_prob,
    'nfevals_per_itr': nfevals_per_itr,
    'time': time_elapsed,
    'n_burnin': n_burnin,
    'seed': seed,
    'theta0': theta0,
    'thin': thin,
    'emp_cov': np.cov(samples.T)
}

# Written to the current working directory; later cells read the same file name.
filename = 'pac_bayes_dhmc_output.pkl'
with open(filename, 'wb') as file:
    pkl.dump(mcmc_output, file)

### Run Metropolis.

In [None]:
from adaptive_metropolis import adap_RWMH, RWMH

In [None]:
def f_logp(theta):
    """Return only the log posterior density at ``theta``.

    Thin wrapper around ``f`` that discards the gradient and aux outputs.
    ``f`` accepts ``(theta, req_grad)`` only, so no other keyword is passed;
    an extra ``opt`` keyword would raise TypeError.
    """
    logp, _, _ = f(theta, req_grad=False)
    return logp

In [None]:
%%prun
# Random-walk Metropolis run under the profiler.
# NOTE: this cell rebinds n_sample, thin, seed, samples and time_elapsed, which
# the DHMC cells also use.
n_warmup = 0 # 10 ** 4
n_sample = 2 * 10 ** 2
thin = 100
seed = 1

# pickle.load can execute arbitrary code from the file; only load output that
# this notebook wrote itself.
filename = 'pac_bayes_dhmc_output.pkl'
with open(filename, 'rb') as file:
    mcmc_output = pkl.load(file)
# Start from the last stored DHMC draw and reuse the DHMC empirical covariance.
theta0_rwmh = mcmc_output['samples'][-1, :]
Sigma = mcmc_output['emp_cov']
# Step size scaled as 2.38 / sqrt(number of parameters).
stepsize = 2.38 / math.sqrt(n_param)

# Run MH with a fixed covariance.
samples, accept_rate, stepsize_seq, time_elapsed = \
    RWMH(f_logp, theta0_rwmh, stepsize, n_warmup, n_sample, Sigma, seed, thin)

### Run Metropolis-within-Gibbs

In [None]:
# pickle.load can execute arbitrary code from the file; only load output that
# this notebook wrote itself.
filename = 'pac_bayes_dhmc_output.pkl'
with open(filename, 'rb') as file:
    mcmc_output = pkl.load(file)
theta0_rwmh = mcmc_output['samples'][-1, :]
Sigma = mcmc_output['emp_cov']
# Diagonal of the inverse covariance raised to the power -1/2. For a Gaussian
# with covariance Sigma this is each coordinate's conditional standard
# deviation given all the others.
cond_sd = np.diag(np.linalg.inv(Sigma)) ** -.5

In [None]:
def adap_metropolis_gibbs(theta, prop_sd, aux, n_adap, n_per_adap=10):
    """Run ``n_adap`` adaptation rounds of Metropolis-within-Gibbs.

    Round ``k`` (counting from 1) calls ``adap_metropolis_gibbs_step`` with
    adaptation rate ``1 / k`` and ``n_per_adap`` sweeps, feeding each round's
    state, proposal scales and aux into the next.

    Returns ``(theta, accept_rate, prop_sd, aux)`` where ``accept_rate`` has
    one row per round and one column per parameter (``n_param`` is read from
    the notebook namespace).
    """
    accept_rate = np.zeros((n_adap, n_param))
    for round_no in range(1, n_adap + 1):
        # Adaptation rate decays as the reciprocal of the round number.
        round_out = adap_metropolis_gibbs_step(
            theta, prop_sd, aux, round_no ** -1, n_per_adap)
        theta, prop_sd, accept_rate[round_no - 1, :], aux = round_out
    return theta, accept_rate, prop_sd, aux

def adap_metropolis_gibbs_step(theta, prop_sd, aux, adapt_rate, n_per_adap):
    """Run ``n_per_adap`` Gibbs sweeps, then rescale the proposal scales once.

    Each coordinate's scale is multiplied by
    ``exp(adapt_rate * (accept_rate - 0.441))``, so it grows when the observed
    acceptance rate over these sweeps is above 0.441 and shrinks when below.

    The rescaled scales are returned as a new array; the ``prop_sd`` array
    passed in by the caller is left unchanged.

    Returns ``(theta, prop_sd, accept_rate, aux)`` with ``accept_rate`` the
    per-coordinate mean acceptance probability over the sweeps. ``n_param`` is
    read from the notebook namespace.
    """
    accept_prob = np.zeros((n_per_adap, n_param))
    for i in range(n_per_adap):
        theta, accept_prob[i, :], aux \
            = metropolis_gibbs_step(theta, prop_sd, aux)
    accept_rate = np.mean(accept_prob, 0)
    # Out-of-place multiply: an in-place `*=` would silently modify the array
    # owned by the caller.
    prop_sd = prop_sd * np.exp(adapt_rate * (accept_rate - .441))
    return theta, prop_sd, accept_rate, aux

def metropolis_gibbs_step(theta, prop_sd, aux):
    """Perform one Gibbs sweep: a Metropolis update of every coordinate in order.

    The sweep works on a private copy of ``theta``. The coordinate update
    modifies its argument in place, so without the copy a caller that passes a
    view such as ``samples[i - 1, :]`` would have that stored draw overwritten
    by the new state.

    Returns ``(theta, accept_prob, aux)`` where ``theta`` is the new state,
    ``accept_prob`` holds one acceptance probability per coordinate and ``aux``
    is the aux value after the last coordinate update. ``n_param`` is read from
    the notebook namespace.
    """
    theta = np.array(theta, dtype=float)  # always a fresh array, never a view
    accept_prob = np.zeros(n_param)
    for index in range(n_param):
        theta, accept_prob[index], aux = \
            cond_metropolis_update(theta, index, prop_sd, aux)
    return theta, accept_prob, aux

def cond_metropolis_update(theta, index, prop_sd, aux):
    """Metropolis update of the single coordinate ``theta[index]``.

    A Gaussian increment with standard deviation ``prop_sd[index]`` is
    proposed, ``f_update`` supplies the log-density difference, and the move is
    accepted with probability ``min(1, exp(logp_diff))``.

    On acceptance ``theta`` is modified IN PLACE and the new aux is returned;
    on rejection both are returned unchanged. Random numbers come from NumPy's
    global generator (one normal draw, then one uniform draw).

    Returns ``(theta, accept_prob, aux)``.
    """
    # Sample from the conditional distribution imitating the optimal
    # Metropolis proposal standard deviation.
    dtheta = prop_sd[index] * np.random.randn()
    logp_diff, aux_new = f_update(theta, dtheta, index, aux)
    # Cap the exponent at 0 before exponentiating: math.exp raises
    # OverflowError for large positive arguments, and any non-negative
    # difference means acceptance probability 1 anyway.
    accept_prob = math.exp(min(0.0, logp_diff))
    if accept_prob > np.random.uniform():
        theta[index] += dtheta
        aux = aux_new
    return theta, accept_prob, aux

In [None]:
n_adap = 10 ** 3
# Initial proposal scales: 2.40 times cond_sd for each coordinate.
prop_sd = 2.40 * cond_sd
# NOTE(review): `theta` is not assigned in this cell; it is whatever the kernel
# currently holds. The only earlier assignment visible in this notebook is the
# dhmc.test_update cell -- confirm the intended starting point.
_, _, aux = f(theta)
theta, accept_rate, prop_sd, aux \
    = adap_metropolis_gibbs(theta, prop_sd, aux, n_adap, n_per_adap=10)

In [None]:
%%prun
# Metropolis-within-Gibbs run under the profiler, using the prop_sd currently
# held in the kernel namespace.
seed = 1
n_burnin = 0
n_sample = 2 * 10 ** 2

# Seed NumPy's global generator, which cond_metropolis_update draws from.
np.random.seed(seed)
theta = theta0.copy()
_, _, aux = f(theta)
for i in range(n_burnin):
    theta, _, aux = metropolis_gibbs_step(theta, prop_sd, aux)
    
samples = np.zeros((n_sample, n_param))
accept_prob = np.zeros((n_sample, n_param))
# Row 0 holds the starting state; each later row is produced by one sweep
# started from the row before it.
samples[0, :] = theta
for i in range(1, n_sample):
    samples[i, :], accept_prob[i, :], aux \
        = metropolis_gibbs_step(samples[i - 1, :], prop_sd, aux)
# Row 0 of accept_prob is never filled, so it is excluded from the average.
accept_rate = np.mean(accept_prob[1:,:], 0)

In [None]:
# Plot accept_rate as left in the kernel by the most recently run sampler cell.
plt.plot(accept_rate)
plt.show()

### Examine the posterior.

In [None]:
# Reload the saved DHMC output so the posterior summaries below use the stored
# samples rather than whatever `samples` the last sampler cell left behind.
# pickle.load can execute arbitrary code from the file; only load output that
# this notebook wrote itself.
filename = 'pac_bayes_dhmc_output.pkl'
with open(filename, 'rb') as file:
    mcmc_output = pkl.load(file)
samples = mcmc_output['samples']

#### Check the summary statistics as well as their mixing.

In [None]:
# Linear predictor for every row of X, averaged over the stored samples.
y_hat = np.mean(np.dot(samples, X.T), 0)
# True where the averaged predictor has the same sign as the outcome.
agreement = (y * y_hat > 0)
# Fraction of sign agreement among rows with y == 1 and among rows with y == -1.
np.mean(agreement[y == 1]), np.mean(agreement[y == -1])

In [None]:
# Sample mean of every coefficient except index 0 (the intercept column of X).
plt.plot(samples.mean(axis=0)[1:])
plt.show()

In [None]:
# Trace plots of the samples projected onto the last ten eigenvectors of X'X.
# eigh returns eigenvalues in ascending order, so these are the ten directions
# with the largest eigenvalues.
Phi = np.dot(X.T, X)
d, V = np.linalg.eigh(Phi)
plt.plot(np.dot(samples, V[:, -10:]))
plt.show()

In [None]:
# NOTE(review): mono_seq_ess is neither defined nor imported anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel -- add the
# missing import.
ess_mono = mono_seq_ess(samples, normed=True)
# Trace plots of the ten coordinates with the smallest ess_mono values.
plot_index = np.argsort(ess_mono)[:10]
plt.plot(samples[:, plot_index])
plt.show()

In [None]:
# Trace of the log posterior density values stored with the DHMC output.
plt.plot(mcmc_output['logp'])
plt.show()

In [None]:
# NOTE(review): mono_seq_ess is neither defined nor imported anywhere in the
# visible notebook -- add the missing import.
ess_mono = mono_seq_ess(samples, normed=True)
# log10 of the value mono_seq_ess returns for each coordinate.
plt.plot(np.log10(ess_mono))
# plt.ylim(0, 1)
plt.show()

#### Take a look at the posterior covariance structure.

In [None]:
# Square roots of the eigenvalues of the sample covariance, in ascending order.
plt.plot(np.sqrt(np.linalg.eigvalsh(np.cov(samples.T))))

In [None]:
# Sigma is not assigned in this cell; it is read from the kernel namespace
# (earlier cells set it from mcmc_output['emp_cov']).
var = np.diag(Sigma)
# Rescale covariance to correlation: D^{-1/2} Sigma D^{-1/2} with D = diag(var).
corr_mat = np.dot(np.diag(var ** -.5), np.dot(Sigma, np.diag(var ** -.5)))
plt.figure(figsize=(16, 16))
plt.imshow(corr_mat, cmap='coolwarm')
# Fix the colour range to the full correlation range.
plt.clim(-1, 1)
plt.colorbar()
plt.show()

#### Plot a posterior conditional.

In [None]:
# NOTE(review): beta_hat is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel -- presumably a point estimate
# such as the posterior mean; confirm and define it.
# Position of the largest |beta_hat| entry, ignoring index 0.
index = 1 + np.argmax(np.abs(beta_hat[1:]))

In [None]:
# Histogram of the sampled values of the selected coordinate.
plt.hist(samples[:, index], bins=25)
plt.show()

In [None]:
index = 0 # Index for plotting
_, _, aux = f(beta_hat)
resol = 251
# `grid` holds OFFSETS from beta_hat[index], spanning one sample standard
# deviation on either side.
support = np.std(samples[:, index]) * np.array([-1, 1]) 
grid = np.linspace(support[0], support[1], resol)
# Log-density difference relative to beta_hat for each offset.
logp = np.array([f_update(beta_hat, dbeta, index, aux)[0] for dbeta in grid])
# Subtract the maximum before exponentiating, then normalise with a Riemann sum
# over the evenly spaced grid.
density = np.exp(logp - np.max(logp))
density /= np.sum(density * (grid[1] - grid[0]))

# Left panel: normalised conditional density; right panel: negative log density.
# The x-axis is shifted back to absolute coordinate values.
plt.figure(figsize=(14, 4.5))
plt.subplot(1, 2, 1)
plt.plot(beta_hat[index] + grid, density)
plt.subplot(1, 2, 2)
plt.plot(beta_hat[index] + grid, -logp)
plt.show()

In [None]:
_, _, aux = f(beta_hat)
resol = 251
support = np.std(samples[:, index]) * np.array([-5, 5])
# Offsets from beta_hat[index] at which the log posterior is evaluated.
offsets = np.linspace(support[0], support[1], resol)
# Absolute coordinate values; used only as the x-axis of the plot.
grid = beta_hat[index] + offsets
def f_shifted(beta, dbeta, index):
    """Evaluate f at a copy of ``beta`` whose ``index``-th entry is shifted by ``dbeta``."""
    beta = beta.copy()
    beta[index] += dbeta
    return f(beta)
# f_shifted adds its argument to beta_hat[index], so it must receive the
# offsets. Passing the absolute grid values would apply the shift twice and
# plot the curve against the wrong x positions.
logp = np.array([f_shifted(beta_hat, dbeta, index)[0] for dbeta in offsets])
density = np.exp(logp - np.max(logp))
plt.plot(grid, logp)
plt.show()