# Blocked Gibbs algorithm + logit stick-breaking (SB) mixing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.model_selection import train_test_split

# Utility to save files with Unix-like newlines
def save_np(filename, npobj):
    """Write ``npobj`` to ``filename`` as text, five decimals per value.

    The target is opened in binary mode, so the line endings produced by
    ``np.savetxt`` reach the file untouched by any platform-specific
    newline translation.
    """
    with open(filename, mode='wb') as out_file:
        np.savetxt(out_file, npobj, fmt='%1.5f')

# Sigmoid function
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    ``x`` may be a scalar or a NumPy array; arrays are handled elementwise.
    For very negative ``x`` the intermediate ``exp(-x)`` overflows to
    ``inf``, which still yields the correct limit ``0.0``, so the overflow
    warning NumPy would otherwise emit is suppressed.
    """
    # Only the overflow category is silenced; every other floating-point
    # error setting is left as the caller configured it.
    with np.errstate(over='ignore'):
        return 1.0 / (1 + np.exp(-x))

## Test 1

Initialize parameters:

In [None]:
# Covariate dimension and number of mixture components
dim = 2
n_clus = 3
# Covariate means, one per cluster, stored as the columns of a (dim, n_clus) array
x_centers = np.column_stack(([-5, 0], [5, 0], [0, 5]))
x_var = (1.5)**2
# Response means, one per cluster. The list is indexed by cluster label, so
# its length is tied to n_clus rather than to the covariate dimension.
y_centers = [4 * k for k in range(n_clus)]
y_var = 1
# Mixture weights, one per cluster
weights = [0.2, 0.2, 0.6]

Generate data:

In [None]:
n = 200
np.random.seed(20201124)
# Allocations: cluster c gets a block of consecutive labels whose length is
# its share weights[c] of the n points
cc = []
for c in range(n_clus):
    # round() instead of int(): a product that lands just below an integer
    # because of floating-point error would otherwise be truncated, leaving
    # cc shorter than n and breaking the cc[i] lookups below
    nn = round(weights[c] * n)
    cc.extend(nn * [c])
# Covariates: one draw per point from a Gaussian centered on the point's
# cluster center, with isotropic covariance x_var * I
xx = np.zeros((n, dim))
for i in range(n):
    xx[i, :] = np.random.multivariate_normal(mean=x_centers[:, cc[i]],
                                             cov=x_var*np.identity(dim))
# Data points
yy = np.zeros(n)
# np.random.normal expects a standard deviation in `scale`, so convert the
# variance before sampling
y_sd = np.sqrt(y_var)
for i in range(n):
    yy[i] = np.random.normal(loc=y_centers[cc[i]], scale=y_sd)

Split into train and test sets:

In [None]:
np.random.seed(20201124)
perc_test = 0.25
# shuffle=False: the split is not randomized
split = train_test_split(xx, yy, test_size=perc_test, shuffle=False)
xx_train, xx_test, yy_train, yy_test = split
# Save to file: training data and covariates, then the test ("grid") ones
for path, arr in (
    ("../resources/csv/in/logsb_cov_mix_1.csv", xx_train),
    ("../resources/csv/in/logsb_data_1.csv", yy_train),
    ("../resources/csv/in/logsb_grid_cov_mix_1.csv", xx_test),
    ("../resources/csv/in/logsb_grid_data_1.csv", yy_test),
):
    save_np(path, arr)

Run the executable:

In [None]:
# Command line for the executable, written as a single whitespace-separated
# string and tokenized below. No shell is involved, so each "" token reaches
# the program literally, as two quote characters.
cmd_line = (
    '../build/run ../algo_settings.asciipb '
    'NNIG ../resources/asciipb/nnig_ngg_prior.asciipb '
    'LogSB ../resources/asciipb/lsb_normal_prior.asciipb "" '
    '../resources/csv/in/logsb_data_1.csv ../resources/csv/in/logsb_grid_data_1.csv '
    '../resources/csv/out/logsb_dens_1.csv ../resources/csv/out/logsb_nclu_1.csv '
    '../resources/csv/out/logsb_clus_1.csv "" "" '
    '../resources/csv/in/logsb_cov_mix_1.csv '
    '../resources/csv/in/logsb_grid_cov_mix_1.csv'
)
cmd = cmd_line.split()
# stdout/stderr are captured into the returned CompletedProcess
subprocess.run(cmd, capture_output=True)

## Test 2

Initialize parameters:

In [None]:
# Covariate dimension and number of mixture components
dim2 = 3
n_clus2 = 4
# Coefficients of the n_clus2-1 stick-breaking logits: column h is dotted with
# a dim2-long covariate vector, so the array has shape (dim2, n_clus2-1)
alphas = np.eye(dim2, n_clus2 - 1)
x_mean = np.array(dim2*[1])
x_var2 = (1.5)**2
# Response means, one per cluster. The list is indexed by cluster label, so
# its length is tied to n_clus2 rather than to the covariate dimension.
y_centers2 = [4 * k for k in range(n_clus2)]
y_var2 = 1

Generate data:

In [None]:
np.random.seed(20201124)
n2 = 200
# Covariates: n2 draws from a Gaussian with mean x_mean and isotropic
# covariance x_var2 * I
xx2 = np.random.multivariate_normal(mean=x_mean,
    cov=x_var2*np.identity(dim2), size=n2)
# Data points
yy2 = np.zeros(n2)
# np.random.normal expects a standard deviation in `scale`, so convert the
# variance before sampling
y_sd2 = np.sqrt(y_var2)
for i in range(n2):
    ## Generate nu_h(xi): sigmoid of the linear predictor of each stick
    nu = np.zeros(n_clus2-1)
    for h in range(n_clus2-1):
        nu[h] = sigmoid(np.dot(alphas[:, h], xx2[i]))
    ## Generate weights: w_h = nu_h * prod_{k<h} (1 - nu_k); the last
    ## component takes whatever mass is left.
    ## Named weights2 (like every other Test 2 variable) so that Test 1's
    ## global `weights` list is not overwritten.
    weights2 = np.zeros(n_clus2)
    for h in range(n_clus2-1):
        weights2[h] = nu[h]
        for k in range(h):
            weights2[h] *= 1-nu[k]
    weights2[n_clus2-1] = 1 - weights2[:n_clus2-1].sum()
    ## Choose cluster and generate yi
    c = np.random.choice(n_clus2, p=weights2)
    yy2[i] = np.random.normal(loc=y_centers2[c], scale=y_sd2)

Split into train and test sets:

In [None]:
np.random.seed(20201124)
perc_test2 = 0.25
# shuffle=False: the split is not randomized
split2 = train_test_split(xx2, yy2, test_size=perc_test2, shuffle=False)
xx_train2, xx_test2, yy_train2, yy_test2 = split2
# Save to file: training data and covariates, then the test ("grid") ones
for path, arr in (
    ("../resources/csv/in/logsb_cov_mix_2.csv", xx_train2),
    ("../resources/csv/in/logsb_data_2.csv", yy_train2),
    ("../resources/csv/in/logsb_grid_cov_mix_2.csv", xx_test2),
    ("../resources/csv/in/logsb_grid_data_2.csv", yy_test2),
):
    save_np(path, arr)

Run the executable:

In [None]:
# Command line for the executable, written as a single whitespace-separated
# string and tokenized below. No shell is involved, so each "" token reaches
# the program literally, as two quote characters.
cmd_line = (
    '../build/run ../algo_settings.asciipb '
    'NNIG ../resources/asciipb/nnig_ngg_prior.asciipb '
    'LogSB ../resources/asciipb/lsb_normal_prior.asciipb "" '
    '../resources/csv/in/logsb_data_2.csv ../resources/csv/in/logsb_grid_data_2.csv '
    '../resources/csv/out/logsb_dens_2.csv ../resources/csv/out/logsb_nclu_2.csv '
    '../resources/csv/out/logsb_clus_2.csv "" "" '
    '../resources/csv/in/logsb_cov_mix_2.csv '
    '../resources/csv/in/logsb_grid_cov_mix_2.csv'
)
cmd = cmd_line.split()
# stdout/stderr are captured into the returned CompletedProcess
subprocess.run(cmd, capture_output=True)

## Plot density

In [None]:
# matr = np.genfromtxt("../resources/csv/out/logsb_dens.csv", delimiter=',')

In [None]:
# NOTE(review): neither `matr` nor `yy_grid` is assigned in the visible cells,
# so unless they are defined elsewhere this cell fails with a NameError.
# Presumably `matr` is meant to come from the commented-out np.genfromtxt
# call in the previous cell and `yy_grid` is the grid the densities were
# evaluated on -- TODO confirm and define both.
# Plot three individual rows of `matr` (indices 5, 29 and 400), exponentiated.
plt.plot(yy_grid, np.exp(matr[5,:]))
plt.plot(yy_grid, np.exp(matr[29,:]))
plt.plot(yy_grid, np.exp(matr[400,:]))
# Dashed line: exponential of the column-wise mean of `matr`.
# NOTE(review): if the rows of `matr` are log-densities, this is the
# exponential of the mean log-density, not the mean of the densities --
# confirm which one is intended.
plt.plot(yy_grid, np.exp(np.mean(matr, axis=0)), linestyle='--')
plt.title("Densities")

## Rand scores

In [None]:
# adjusted_rand_score(cl1.astype(int), cl2.astype(int))