# Performing tests

In [None]:
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from google.protobuf.internal.decoder import _DecodeVarint32
import sys
sys.path.insert(0, '..')
from proto.py.algorithm_state_pb2 import AlgorithmState
import arviz as az

# Resource locations used by the cells below: input CSVs, output CSVs and
# the serialized chains
CSVIN, CSVOUT, RECORDIO = (
    '../resources/csv/in/thesis',
    '../resources/csv/out/thesis',
    '../resources/recordio',
)
# Utility to read a file collector (a file holding a sequence of
# length-prefixed serialized messages), courtesy of
# github.com/mberaha/utils/blob/master/proto_utils/py/recordio.py
def readManyFromFile(filename, msgType):
    """Read every length-prefixed message stored in a binary file.

    The file is read as a sequence of records, each made of a varint-encoded
    byte length followed by that many bytes of a serialized message.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read; it is opened in binary mode.
    msgType : callable
        Zero-argument factory (e.g. a message class) whose instances expose
        ``ParseFromString(bytes)``.

    Returns
    -------
    list
        The decoded messages, in file order. Reading is best-effort: it
        stops at the first record that is truncated or cannot be decoded,
        returns what was read up to that point, and emits a
        ``RuntimeWarning`` saying where and why it stopped.
    """
    import warnings

    with open(filename, "rb") as fp:
        buf = fp.read()

    out = []
    total = len(buf)
    n = 0
    while n < total:
        # Each record starts with its payload length, encoded as a varint
        try:
            msg_len, start = _DecodeVarint32(buf, n)
        except Exception as err:
            warnings.warn(
                f"{filename}: unreadable length prefix at byte {n} "
                f"({err!r}); returning the {len(out)} message(s) read so far",
                RuntimeWarning, stacklevel=2)
            break
        end = start + msg_len
        # A slice would silently shrink a truncated payload, which could then
        # parse as a partial message: check the bounds explicitly instead
        if end > total:
            warnings.warn(
                f"{filename}: record at byte {n} declares {msg_len} byte(s) "
                f"but only {total - start} remain; returning the {len(out)} "
                f"message(s) read so far",
                RuntimeWarning, stacklevel=2)
            break
        try:
            msg = msgType()
            msg.ParseFromString(buf[start:end])
        except Exception as err:
            warnings.warn(
                f"{filename}: could not parse record at byte {n} ({err!r}); "
                f"returning the {len(out)} message(s) read so far",
                RuntimeWarning, stacklevel=2)
            break
        out.append(msg)
        n = end
    return out

## ```galaxy``` dataset

In [None]:
# Launch the thesis script on the galaxy dataset, capturing its output
cmd = ['../bash/thesis.sh', 'galaxy']
subprocess.run(cmd, capture_output=True)

In [None]:
# Load the density and clustering CSV files, indexed as [mixing][algorithm]
algos = ['Neal2', 'Neal3', 'Neal8']
mixs = ['DP', 'PY']
galaxy_dens = {}
galaxy_clus = {}
for mix in mixs:
    dens_by_algo = {}
    clus_by_algo = {}
    galaxy_dens[mix] = dens_by_algo
    galaxy_clus[mix] = clus_by_algo
    for algo in algos:
        dens_by_algo[algo] = np.genfromtxt(
            f"{CSVOUT}/galaxy_dens_{algo}_{mix}.csv", delimiter=',')
        clus_by_algo[algo] = np.genfromtxt(
            f"{CSVOUT}/galaxy_clus_{algo}_{mix}.csv", delimiter=',')

In [None]:
# Load the observations and the grid used for the density plots
data_path = f"{CSVIN}/galaxy.csv"
grid_path = f"{CSVIN}/galaxy_grid.csv"
galaxy_data = np.genfromtxt(data_path)
galaxy_grid = np.genfromtxt(grid_path)

In [None]:
# One panel per mixing; each panel overlays one curve per algorithm on top
# of the data histogram
fig, axes = plt.subplots(1, 2, figsize=(18,6))
for m, mix in enumerate(mixs):
    panel = axes[m]
    for algo in algos:
        # 1-in-2 thinning: keep every other row, average the kept rows
        # column-wise, then exponentiate
        thinned = galaxy_dens[mix][algo][0::2]
        dens = np.exp(thinned.mean(axis=0))
        panel.plot(galaxy_grid, dens)
    panel.hist(galaxy_data, density=True, color='lightgray')
    panel.set_title(f"galaxy densities with {mix}")
    panel.legend(algos)

In [None]:
# One panel per mixing; each algorithm gets its own horizontal row of points,
# coloured by the cluster values read from file
fig, axes = plt.subplots(1, 2, figsize=(18,4))
for m, mix in enumerate(mixs):
    panel = axes[m]
    for a, algo in enumerate(algos):
        clus = galaxy_clus[mix][algo]
        # Constant height `a` places this algorithm on row number `a`
        row_height = np.full_like(clus, a)
        panel.scatter(galaxy_data, row_height, c=clus, cmap='hsv')
    panel.set_ylim(-1, 3)
    panel.set_yticks([0, 1, 2])
    panel.set_yticklabels(algos)
    panel.set_title(f"galaxy clusterings with {mix}")

In [None]:
# Effective sample size of the number-of-clusters trace, indexed as
# [mixing][algorithm]
ESS = {}
for mix in mixs:
    ess_by_algo = {}
    ESS[mix] = ess_by_algo
    for algo in algos:
        # Load the saved states of this run
        chain = readManyFromFile(f"{RECORDIO}/galaxy_{algo}_{mix}.recordio",
                                 AlgorithmState)
        # One count per saved state: how many cluster states it holds
        n_clust = np.array([len(state.cluster_states) for state in chain])
        ess_by_algo[algo] = az.ess(n_clust)
# Display the results
ESS

## ```faithful``` dataset

In [None]:
# Launch the thesis script on the faithful dataset, capturing its output
cmd = ['../bash/thesis.sh', 'faithful']
subprocess.run(cmd, capture_output=True)

In [None]:
# Load the density and clustering CSV files, indexed as [mixing][algorithm]
algos = ['Neal2', 'Neal3', 'Neal8']
mixs = ['DP', 'PY']
faithful_dens = {}
faithful_clus = {}
for mix in mixs:
    dens_by_algo = {}
    clus_by_algo = {}
    faithful_dens[mix] = dens_by_algo
    faithful_clus[mix] = clus_by_algo
    for algo in algos:
        dens_by_algo[algo] = np.genfromtxt(
            f"{CSVOUT}/faithful_dens_{algo}_{mix}.csv", delimiter=',')
        clus_by_algo[algo] = np.genfromtxt(
            f"{CSVOUT}/faithful_clus_{algo}_{mix}.csv", delimiter=',')

In [None]:
# Load the observations and the grid used for the density plots
data_path = f"{CSVIN}/faithful.csv"
grid_path = f"{CSVIN}/faithful_grid.csv"
faithful_data = np.genfromtxt(data_path)
faithful_grid = np.genfromtxt(grid_path)

In [None]:
# 3D scatter of the density over the grid: one subplot per
# (algorithm, mixing) pair, filled row by row in a 3x2 layout
fig = plt.figure(figsize=(15,20))
pairs = [(algo, mix) for algo in algos for mix in mixs]
for idx, (algo, mix) in enumerate(pairs, start=1):
    # 1-in-2 thinning: keep every other row, average the kept rows
    # column-wise, then exponentiate
    thinned = faithful_dens[mix][algo][0::2]
    dens = np.exp(thinned.mean(axis=0))
    ax = fig.add_subplot(3, 2, idx, projection='3d')
    grid_x = faithful_grid[:,0]
    grid_y = faithful_grid[:,1]
    ax.scatter(grid_x, grid_y, dens, marker='.')
    ax.set_title(f"faithful densities with {mix} and {algo}")

In [None]:
# Contour plot of the density: one subplot per (algorithm, mixing) pair,
# filled row by row in a 3x2 layout
fig = plt.figure(figsize=(15,20))
pairs = [(algo, mix) for algo in algos for mix in mixs]
for idx, (algo, mix) in enumerate(pairs, start=1):
    # 1-in-2 thinning: keep every other row, average the kept rows
    # column-wise, exponentiate, and shape the result as a single column
    thinned = faithful_dens[mix][algo][0::2]
    dens = np.exp(thinned.mean(axis=0)).reshape(-1,1)
    # Long-format table: grid coordinates next to the density value
    stacked = np.hstack([faithful_grid, dens])
    plot_data = pd.DataFrame(stacked, columns=["x", "y", "z"])
    # Pivot to a table of z by (x, y), transposed so rows follow y
    pivoted = plot_data.pivot_table(index='x', columns='y', values='z')
    Z = pivoted.T.values
    X_unique = np.sort(plot_data["x"].unique())
    Y_unique = np.sort(plot_data["y"].unique())
    X, Y = np.meshgrid(X_unique, Y_unique)
    ax = fig.add_subplot(3, 2, idx)
    # Skip the contour when every value of Z is zero
    if Z.any():
        ax.contour(X, Y, Z)
    ax.set_title(f"faithful densities with {mix} and {algo}")

In [None]:
# Scatter of the data coloured by the cluster values read from file: one
# subplot per (algorithm, mixing) pair, filled row by row in a 3x2 layout
fig = plt.figure(figsize=(14,14))
pairs = [(algo, mix) for algo in algos for mix in mixs]
for idx, (algo, mix) in enumerate(pairs, start=1):
    ax = fig.add_subplot(3, 2, idx)
    clus = faithful_clus[mix][algo]
    first_coord = faithful_data[:,0]
    second_coord = faithful_data[:,1]
    ax.scatter(first_coord, second_coord, c=clus)
    ax.set_title(f"faithful clusterings with {mix} and {algo}")

## ```dde``` dataset

In [None]:
# Launch the thesis script on the dde dataset, capturing its output
cmd = ['../bash/thesis.sh', 'dde']
subprocess.run(cmd, capture_output=True)

In [None]:
# Load the density and clustering CSV files, indexed as [mixing][algorithm]
algos = ['BlockedGibbs']
mixs = ['LogSB', 'TruncSB']
dde_dens = {}
dde_clus = {}
for mix in mixs:
    dens_by_algo = {}
    clus_by_algo = {}
    dde_dens[mix] = dens_by_algo
    dde_clus[mix] = clus_by_algo
    for algo in algos:
        dens_by_algo[algo] = np.genfromtxt(
            f"{CSVOUT}/dde_dens_{algo}_{mix}.csv", delimiter=',')
        clus_by_algo[algo] = np.genfromtxt(
            f"{CSVOUT}/dde_clus_{algo}_{mix}.csv", delimiter=',')

In [None]:
# Load the observations, the density grid and the covariate grid
data_path = f"{CSVIN}/dde.csv"
grid_path = f"{CSVIN}/dde_grid.csv"
covs_grid_path = f"{CSVIN}/dde_covs_grid.csv"
dde_data = np.genfromtxt(data_path)
dde_grid = np.genfromtxt(grid_path)
dde_covs_grid = np.genfromtxt(covs_grid_path)

In [None]:
# Plot densities: one curve per mixing on a single set of axes, over the
# histogram of the data
fig, ax = plt.subplots(figsize=(10,5))
for mix in mixs:
    # 1-in-2 thinning: keep every other row, average the kept rows
    # column-wise, then exponentiate
    dens = np.exp(np.mean(dde_dens[mix]['BlockedGibbs'][0::2], axis=0))
    ax.plot(dde_grid, dens)
# The histogram, title and legend do not depend on the mixing, so they are
# drawn once after the curves instead of once per loop iteration (which
# stacked identical histograms on top of each other)
ax.hist(dde_data, density=True, color='lightgray')
ax.set_title(f"dde densities with {'BlockedGibbs'} for x={dde_covs_grid}")
# Trailing semicolon keeps the notebook from echoing the Legend object
ax.legend(mixs);