# Blocked Gibbs algorithm + logit stick-breaking (SB) mixing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.model_selection import train_test_split
from google.protobuf.internal.decoder import _DecodeVarint32
import sys
sys.path.insert(0, '..')
from proto.py.algorithm_state_pb2 import AlgorithmState

# Utility to save arrays as text files with Unix-like newlines
def save_np(filename, npobj):
    """Write ``npobj`` to ``filename`` as text through ``np.savetxt``.

    The file is opened in binary mode, so the operating system performs no
    newline translation on what ``np.savetxt`` writes. Every value is
    formatted with the ``'%1.5f'`` pattern (five decimals).

    Parameters
    ----------
    filename : path of the output file (created or overwritten).
    npobj : array-like accepted by ``np.savetxt``.
    """
    with open(filename, mode='wb') as out_file:
        np.savetxt(out_file, npobj, fmt='%1.5f')

# Sigmoid function
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    The value is evaluated as ``exp(-log(1 + exp(-x)))`` through
    ``np.logaddexp`` instead of the direct formula, because ``exp(-x)``
    overflows (and triggers a RuntimeWarning) for large negative ``x``.

    Parameters
    ----------
    x : scalar or array-like; arrays are processed element-wise.

    Returns
    -------
    Sigmoid of ``x``, a NumPy float for scalar input or an array with the
    same shape as ``x``.
    """
    # log(1 + exp(-x)) == logaddexp(0, -x), computed without overflow
    return np.exp(-np.logaddexp(0.0, np.negative(x)))

# Utility to read file collector, courtesy of
# github.com/mberaha/utils/blob/master/proto_utils/py/recordio.py
def readManyFromFile(filename, msgType):
    """Read a sequence of length-delimited messages from a binary file.

    The whole file is loaded in memory and scanned as consecutive records,
    each made of a varint32 length prefix followed by that many bytes, which
    are handed to ``ParseFromString`` of a fresh ``msgType()`` object.

    Reading is best-effort: at the first record that cannot be parsed a
    warning is issued and the messages decoded up to that point are
    returned, instead of silently discarding the failure.

    Parameters
    ----------
    filename : path of the file to read.
    msgType : callable returning an empty message object that exposes
        ``ParseFromString``.

    Returns
    -------
    list
        The decoded messages, in file order.
    """
    import warnings

    with open(filename, "rb") as fp:
        buf = fp.read()

    out = []
    pos = 0
    while pos < len(buf):
        # Decode the length prefix; `pos` then points at the message payload
        msg_len, pos = _DecodeVarint32(buf, pos)
        try:
            msg = msgType()
            msg.ParseFromString(buf[pos:pos + msg_len])
        except Exception as err:
            warnings.warn(
                f"{filename}: stopped after {len(out)} message(s); the record "
                f"at byte offset {pos} could not be parsed ({err!r})",
                stacklevel=2,
            )
            break
        out.append(msg)
        pos += msg_len
    return out

## Test 1

Initialize parameters:

In [None]:
# Covariate dimension and number of clusters
dim1 = 2
n_clus1 = 3
# Covariate centers, one column per cluster
x_centers1 = np.array([[-5, 5, 0],
                       [0, 0, 5]])
x_var1 = 1.5 ** 2
# Response centers: 0, 4, 8, ...
y_centers1 = [4 * k for k in range(n_clus1)]
y_var1 = 1
# Fraction of the observations assigned to each cluster
weights1 = [0.2, 0.2, 0.6]

Generate data:

In [None]:
n1 = 200
np.random.seed(20201124)
# Allocations: cluster c gets int(weights1[c] * n1) consecutive observations
cc1 = []
for c in range(n_clus1):
    nn = int(weights1[c] * n1)
    cc1.extend(nn * [c])
# Covariates: isotropic Gaussian around the center of the assigned cluster
x_cov1 = x_var1 * np.identity(dim1)
xx1 = np.zeros((n1, dim1))
for i in range(n1):
    xx1[i, :] = np.random.multivariate_normal(mean=x_centers1[:, cc1[i]],
        cov=x_cov1)
# Data points: np.random.normal takes a standard deviation as `scale`,
# so the variance y_var1 is converted before use
y_sd1 = np.sqrt(y_var1)
yy1 = np.zeros(n1)
for i in range(n1):
    yy1[i] = np.random.normal(loc=y_centers1[cc1[i]], scale=y_sd1)

Split into train and test sets:

In [None]:
np.random.seed(20201125)
perc_train1 = 0.75
n_train1 = int(perc_train1 * n1)
# Draw the training indices without replacement
idxs1 = np.arange(n1)
train_idxs1 = np.random.choice(idxs1, size=n_train1, replace=False)
# Boolean mask: True for training observations, False for test ones
training_mask1 = np.zeros(n1, dtype=bool)
training_mask1[train_idxs1] = True
# Train and test sets (rows keep their original relative order)
xx_train1, yy_train1 = xx1[training_mask1], yy1[training_mask1]
xx_test1, yy_test1 = xx1[~training_mask1], yy1[~training_mask1]

Save to file:

In [None]:
# Training covariates and responses
save_np("../resources/csv/in/logsb_cov_mix_1.csv", xx_train1)
save_np("../resources/csv/in/logsb_data_1.csv", yy_train1)
# Only the first test covariate vector is written, as a single 2-D row
# (plain ndarray instead of the discouraged np.matrix; same file content)
save_np("../resources/csv/in/logsb_grid_cov_mix_1.csv",
    xx_test1[0].reshape(1, -1))
save_np("../resources/csv/in/logsb_grid_data_1.csv", yy_test1)

Run the executable:

In [None]:
# Argument vector for the executable, one token per list entry.
# NOTE(review): the two '""' entries are passed literally as two quote
# characters (no shell is involved) — presumably the executable treats them
# as "no file"; confirm.
cmd = [
    '../build/run',
    '../algo_settings.asciipb',
    'NNIG',
    '../resources/asciipb/nnig_ngg_prior.asciipb',
    'LogSB',
    '../resources/asciipb/lsb_normal_prior.asciipb',
    'test1.recordio',
    '../resources/csv/in/logsb_data_1.csv',
    '../resources/csv/in/logsb_grid_data_1.csv',
    '../resources/csv/out/logsb_dens_1.csv',
    '../resources/csv/out/logsb_nclu_1.csv',
    '../resources/csv/out/logsb_clus_1.csv',
    '""',
    '""',
    '../resources/csv/in/logsb_cov_mix_1.csv',
    '../resources/csv/in/logsb_grid_cov_mix_1.csv',
]
# The returned CompletedProcess (with captured stdout/stderr) is the cell value
subprocess.run(cmd, capture_output=True)

## Test 2

Initialize parameters:

In [None]:
# Covariate dimension and number of clusters
dim2 = 2
n_clus2 = 3
# Stick-breaking regression coefficients, one column per stick: column h is
# dotted with a dim2-dimensional covariate, so the shape must be
# (dim2, n_clus2 - 1). This equals the identity when dim2 == n_clus2 - 1.
alphas2 = np.eye(dim2, n_clus2 - 1)
x_mean2 = np.array(dim2 * [1])
x_var2 = 1.5 ** 2
# Response centers: 0, 4, 8, ...
y_centers2 = [4 * k for k in range(n_clus2)]
y_var2 = 1

Generate data:

In [None]:
np.random.seed(20201126)
n2 = 200
# Covariates
xx2 = np.random.multivariate_normal(mean=x_mean2,
    cov=x_var2*np.identity(dim2), size=n2)
# Data points: np.random.normal takes a standard deviation as `scale`,
# so the variance y_var2 is converted before use
y_sd2 = np.sqrt(y_var2)
yy2 = np.zeros(n2)
cc2 = np.zeros(n2, dtype=int)
for i in range(n2):
    ## Generate nu_h(xi), the covariate-dependent stick proportions
    nu = np.zeros(n_clus2-1)
    for h in range(n_clus2-1):
        nu[h] = sigmoid(np.dot(alphas2[:, h], xx2[i]))
    ## Generate weights: w_h = nu_h * prod_{k<h} (1 - nu_k)
    weights = np.zeros(n_clus2)
    for h in range(n_clus2-1):
        weights[h] = nu[h]
        for k in range(h):
            weights[h] *= 1-nu[k]
    ## The last weight takes the remaining mass
    weights[n_clus2-1] = 1 - weights[:n_clus2-1].sum()
    ## Choose cluster and generate yi
    c = np.random.choice(n_clus2, p=weights)
    cc2[i] = c
    yy2[i] = np.random.normal(loc=y_centers2[c], scale=y_sd2)

Split into train and test sets:

In [None]:
np.random.seed(20201127)
perc_train2 = 0.75
# Generate booleans; the training size follows perc_train2 rather than a
# repeated literal, so changing the percentage above takes effect
n_train2 = int(perc_train2 * n2)
idxs2 = np.arange(n2)
training_mask2 = np.zeros(n2, dtype=bool)
train_idxs2 = np.random.choice(idxs2, size=n_train2, replace=False)
training_mask2[train_idxs2] = True
# Train and test sets (rows keep their original relative order)
xx_train2 = xx2[training_mask2]
yy_train2 = yy2[training_mask2]
xx_test2 = xx2[~training_mask2]
yy_test2 = yy2[~training_mask2]

Save to file:

In [None]:
# Training covariates and responses
save_np("../resources/csv/in/logsb_cov_mix_2.csv", xx_train2)
save_np("../resources/csv/in/logsb_data_2.csv", yy_train2)
# Only the first test covariate vector is written, as a single 2-D row
# (plain ndarray instead of the discouraged np.matrix; same file content)
save_np("../resources/csv/in/logsb_grid_cov_mix_2.csv",
    xx_test2[0].reshape(1, -1))
save_np("../resources/csv/in/logsb_grid_data_2.csv", yy_test2)

Run the executable:

In [None]:
# Argument vector for the executable, one token per list entry.
# NOTE(review): the two '""' entries are passed literally as two quote
# characters (no shell is involved) — presumably the executable treats them
# as "no file"; confirm.
cmd = [
    '../build/run',
    '../algo_settings.asciipb',
    'NNIG',
    '../resources/asciipb/nnig_ngg_prior.asciipb',
    'LogSB',
    '../resources/asciipb/lsb_normal_prior.asciipb',
    'test2.recordio',
    '../resources/csv/in/logsb_data_2.csv',
    '../resources/csv/in/logsb_grid_data_2.csv',
    '../resources/csv/out/logsb_dens_2.csv',
    '../resources/csv/out/logsb_nclu_2.csv',
    '../resources/csv/out/logsb_clus_2.csv',
    '""',
    '""',
    '../resources/csv/in/logsb_cov_mix_2.csv',
    '../resources/csv/in/logsb_grid_cov_mix_2.csv',
]
# The returned CompletedProcess (with captured stdout/stderr) is the cell value
subprocess.run(cmd, capture_output=True)

## Plot densities on test sets

Read density matrices:

In [None]:
# Load the comma-separated density files of test 1 and test 2, in this order
matr1, matr2 = (
    np.genfromtxt(dens_path, delimiter=',')
    for dens_path in (
        "../resources/csv/out/logsb_dens_1.csv",
        "../resources/csv/out/logsb_dens_2.csv",
    )
)

Plot:

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Each panel plots, against the test responses, the exponential of the
# column-wise mean of the corresponding density matrix.
# NOTE(review): if the files hold log-densities (one row per iteration),
# exp(mean(.)) is a geometric mean of the densities — confirm whether
# mean(exp(.)) is what is intended.
ax1.scatter(yy_test1, np.exp(np.mean(matr1, axis=0)))
ax1.set_title("Test 1 density")

ax2.scatter(yy_test2, np.exp(np.mean(matr2, axis=0)))
ax2.set_title("Test 2 density")

## Compare clustering on train sets

Read posterior clusterings:

In [None]:
# Load the comma-separated clustering files of test 1 and test 2 (in this
# order) and cast the labels to integers
clust1, clust2 = (
    np.genfromtxt(clus_path, delimiter=',').astype(int)
    for clus_path in (
        "../resources/csv/out/logsb_clus_1.csv",
        "../resources/csv/out/logsb_clus_2.csv",
    )
)

Compute the Adjusted Rand Index of each estimated clustering against the true clustering of the training set:

In [None]:
# The training sets were extracted with the boolean masks, i.e. in increasing
# index order, whereas train_idxs1/train_idxs2 keep the random order drawn by
# np.random.choice. Selecting the true labels with the same masks keeps them
# aligned row by row with the training data (and hence, presumably, with the
# clustering returned for it).
true_clust1 = np.asarray(cc1)[training_mask1]
true_clust2 = np.asarray(cc2)[training_mask2]
ari1 = adjusted_rand_score(true_clust1, clust1)
ari2 = adjusted_rand_score(true_clust2, clust2)
print(ari1)
print(ari2)

## File collectors

Read chain from file collectors:

In [None]:
# Decode the stored AlgorithmState messages of both runs, test 1 first.
# NOTE(review): the run commands pass "test1.recordio"/"test2.recordio"
# without the "../" prefix used here — confirm where the files are written.
chain1, chain2 = (
    readManyFromFile(recordio_path, AlgorithmState)
    for recordio_path in ("../test1.recordio", "../test2.recordio")
)

For test 1, print the cluster means of every 100th stored state:

In [None]:
# Walk through every 100th stored state and print the `uni_ls_state.mean`
# of each of its cluster states
for state in chain1[::100]:
    means = [clus.uni_ls_state.mean for clus in state.cluster_states]
    print(means)

For test 2, print the cluster means of every 50th stored state:

In [None]:
# Walk through every 50th stored state and print the `uni_ls_state.mean`
# of each of its cluster states
for state in chain2[::50]:
    means = [clus.uni_ls_state.mean for clus in state.cluster_states]
    print(means)