# Estimates with the `bayesmix` package

In [None]:
# Import the library
from bayesmixpy import run_mcmc, build_bayesmix

import arviz as az
from contextlib import redirect_stdout
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.special import logsumexp

import os
# Path of the sampler executable, relative to the working directory.
# NOTE(review): presumably read by bayesmixpy when it launches a run — confirm.
bayesmix_exe = "../../build/run_mcmc"
os.environ["BAYESMIX_EXE"] = bayesmix_exe

In [None]:
# Build the library
# NOTE(review): the argument 4 is presumably the number of parallel build
# processes — confirm against the bayesmixpy `build_bayesmix` signature.
build_bayesmix(4)

In [None]:
# Create the output folders for logs, CSV files and PNG figures.
# exist_ok=True makes re-running this cell harmless.
log_fold, csv_fold, png_fold = 'log', 'csv', 'png'
for folder in (log_fold, csv_fold, png_fold):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Identifiers of the bayesmix samplers to run; each one is substituted into
# the settings template below.
algorithms = ["Neal2", "Neal3", "Neal8", "SplitMerge"]

# Sampler settings template: the "{}" after algo_id is a str.format
# placeholder that receives one entry of `algorithms` per run.
algo_settings = """
algo_id: "{}"
rng_seed: 20201124
iterations: 5000
burnin: 1000
init_num_clusters: 3
neal8_n_aux: 3
splitmerge_n_restr_gs_updates: 5
splitmerge_n_mh_updates: 1
splitmerge_n_full_gs_updates: 1
"""

# Mixing prior configuration with fixed strength and discount values; it is
# handed to run_mcmc alongside the "PY" mixing id.
py_prior = """
fixed_values {
  strength: 1.0
  discount: 0.1
}
"""

In [None]:
# Names of the datasets and of the BNPmix samplers used in the comparison
datasets = ['galaxy', 'faithful', 'highdim']
bnpmix_algos = ('MAR', 'ICS')

# Result stores, indexed as store[dataset][algorithm]; every dataset starts
# with its own empty inner dictionary.
bayesmix_densities = {name: {} for name in datasets}
bayesmix_num_clust = {name: {} for name in datasets}

## `galaxy` dataset

In [None]:
# Load the galaxy dataset from the shared resources folder (comma-separated)
data_folder = '../../resources/datasets/'
galaxy_csv = os.path.join(data_folder, 'galaxy.csv')
galaxy = np.loadtxt(galaxy_csv, delimiter=',')

In [None]:
# Write NGG hierarchy prior config file
# The string holds a single `ngg_prior` message and is handed to run_mcmc
# together with the "NNIG" hierarchy id when the galaxy samplers are launched.
# It is runtime data: keep it byte-for-byte.
# NOTE(review): this looks like protobuf text format; the meaning of each
# field (mean/var, shape/rate pairs) is defined by the bayesmix prior schema,
# not in this notebook — confirm there before changing values.
g0_galaxy = """
ngg_prior {
  mean_prior {
    mean: 25.0
    var: 4.0
  }
  var_scaling_prior {
    shape: 0.4
    rate: 0.2
  }
  shape: 4.0
  scale_prior {
    shape: 4.0
    rate: 2.0
  }
}
"""

In [None]:
# Run every sampler on the galaxy data.
# Whatever is printed during a run is redirected to a per-algorithm log file
# (read back later when execution times are parsed); the returned values are
# stored per algorithm.
for algo in algorithms:
    log_file = os.path.join(log_fold, 'bayesmix_galaxy_{}.log'.format(algo))
    settings = algo_settings.format(algo)  # fill in the algo_id placeholder
    with open(log_file, 'w') as log_handle, redirect_stdout(log_handle):
        # NOTE(review): `galaxy` is passed twice (3rd and 7th positional
        # arguments); presumably as data and as evaluation grid — confirm
        # against the run_mcmc signature.
        mcmc_out = run_mcmc("NNIG", "PY", galaxy, g0_galaxy, py_prior,
                            settings, galaxy,
                            return_num_clusters=True,
                            return_clusters=False,
                            return_best_clus=False)
    # Element 0 is kept as the density output, element 1 as the
    # number-of-clusters output (requested via return_num_clusters=True).
    bayesmix_densities['galaxy'][algo] = mcmc_out[0]
    bayesmix_num_clust['galaxy'][algo] = mcmc_out[1]

## `faithful` dataset

In [None]:
# Load the faithful dataset from the shared resources folder (comma-separated)
data_folder = '../../resources/datasets/'
faithful_csv = os.path.join(data_folder, 'faithful.csv')
faithful = np.loadtxt(faithful_csv, delimiter=',')

In [None]:
# Write NGIW hierarchy prior config file
# The string holds a single `ngiw_prior` message and is handed to run_mcmc
# together with the "NNW" hierarchy id when the faithful samplers are
# launched. It is runtime data: keep it byte-for-byte.
# As written: the mean vector has 2 entries (both 3.0); `var` is a 2x2 matrix
# with 0.25 on the diagonal and `scale` a 2x2 matrix with 4.0 on the
# diagonal, each given as a flat list of `data` entries.
# NOTE(review): field semantics come from the bayesmix prior schema — confirm
# there before changing values.
g0_faithful = """
ngiw_prior {
  mean_prior {
    mean {
      size: 2
      data: 3.0
      data: 3.0
    }
    var {
      rows: 2
      cols: 2
      data: 0.25
      data: 0.0
      data: 0.0
      data: 0.25
    }
  }
  var_scaling_prior {
    shape: 0.4
    rate: 0.2
  }
  deg_free: 4.0
  scale_prior {
    deg_free: 4.0
    scale {
      rows: 2
      cols: 2
      data: 4.0
      data: 0.0
      data: 0.0
      data: 4.0
    }
  }
}
"""

In [None]:
# Run every sampler on the faithful data.
# Whatever is printed during a run is redirected to a per-algorithm log file
# (read back later when execution times are parsed); the returned values are
# stored per algorithm.
for algo in algorithms:
    log_file = os.path.join(log_fold, 'bayesmix_faithful_{}.log'.format(algo))
    settings = algo_settings.format(algo)  # fill in the algo_id placeholder
    with open(log_file, 'w') as log_handle, redirect_stdout(log_handle):
        # NOTE(review): `faithful` is passed twice (3rd and 7th positional
        # arguments); presumably as data and as evaluation grid — confirm
        # against the run_mcmc signature.
        mcmc_out = run_mcmc("NNW", "PY", faithful, g0_faithful, py_prior,
                            settings, faithful,
                            return_num_clusters=True,
                            return_clusters=False,
                            return_best_clus=False)
    # Element 0 is kept as the density output, element 1 as the
    # number-of-clusters output (requested via return_num_clusters=True).
    bayesmix_densities['faithful'][algo] = mcmc_out[0]
    bayesmix_num_clust['faithful'][algo] = mcmc_out[1]

## Synthesized `highdim` dataset (dimension 4)
Note: this requires that the `generate_high_dim_data.ipynb` notebook has been run beforehand.

In [None]:
# Read data
# The path is built with os.path.join, like the other file accesses under
# csv_fold and log_fold in this notebook, instead of appending a hard-coded
# '/' separator to the folder name.
highdim = np.loadtxt(os.path.join(csv_fold, 'highdim.csv'), delimiter=',')

In [None]:
# Write NGIW hierarchy prior config file
# The string holds a single `ngiw_prior` message and is handed to run_mcmc
# together with the "NNW" hierarchy id when the highdim samplers are
# launched. It is runtime data: keep it byte-for-byte.
# As written: the mean vector has 4 entries (all 0.0); `var` and `scale` are
# both 4x4 matrices with 0.1 on the diagonal and 0.0 elsewhere, each given as
# a flat list of 16 `data` entries.
# NOTE(review): field semantics come from the bayesmix prior schema — confirm
# there before changing values.
g0_highdim = """
ngiw_prior {
  mean_prior {
    mean {
      size: 4
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.0
    }
    var {
      rows: 4
      cols: 4
      data: 0.1
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.1
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.1
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.1
    }
  }
  var_scaling_prior {
    shape: 0.2
    rate: 2.0
  }
  deg_free: 10.0
  scale_prior {
    deg_free: 10.0
    scale {
      rows: 4
      cols: 4
      data: 0.1
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.1
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.1
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.0
      data: 0.1
    }
  }
}
"""

In [None]:
# Run every sampler on the highdim data.
# Whatever is printed during a run is redirected to a per-algorithm log file
# (read back later when execution times are parsed); the returned values are
# stored per algorithm.
for algo in algorithms:
    log_file = os.path.join(log_fold, 'bayesmix_highdim_{}.log'.format(algo))
    settings = algo_settings.format(algo)  # fill in the algo_id placeholder
    with open(log_file, 'w') as log_handle, redirect_stdout(log_handle):
        # NOTE(review): `highdim` is passed twice (3rd and 7th positional
        # arguments); presumably as data and as evaluation grid — confirm
        # against the run_mcmc signature.
        mcmc_out = run_mcmc("NNW", "PY", highdim, g0_highdim, py_prior,
                            settings, highdim,
                            return_num_clusters=True,
                            return_clusters=False,
                            return_best_clus=False)
    # Element 0 is kept as the density output, element 1 as the
    # number-of-clusters output (requested via return_num_clusters=True).
    bayesmix_densities['highdim'][algo] = mcmc_out[0]
    bayesmix_num_clust['highdim'][algo] = mcmc_out[1]

# Comparison: BNPmix vs bayesmix
Note: this requires that the `run_bnpmix.ipynb` R notebook has been run beforehand.

## Effective Sample Sizes (ESS) comparison

In [None]:
# Initialize DataFrame to collect ESS
# One column per dataset name. Rows are added afterwards through label-based
# `ESS.at[row, column]` assignments, with row labels of the form
# 'bnpmix_<algo>' and 'bayesmix_<algo>'.
ESS = pd.DataFrame(columns=datasets)

In [None]:
# ESS of the BNPmix chains: each chain is read from
# '<csv_fold>/bnpmix_<dataset>_nclu_<algo>.csv' and its ESS is stored in the
# row 'bnpmix_<algo>' of the ESS table.
for data in datasets:
    for algo in bnpmix_algos:
        csv_name = 'bnpmix_{}_nclu_{}.csv'.format(data, algo)
        chain = np.genfromtxt(os.path.join(csv_fold, csv_name))
        ESS.at['bnpmix_' + algo, data] = az.ess(chain)

In [None]:
# ESS of the bayesmix chains stored in bayesmix_num_clust, written to the
# row 'bayesmix_<algo>' of the ESS table.
for data in datasets:
    chains = bayesmix_num_clust[data]
    for algo in algorithms:
        ESS.at['bayesmix_' + algo, data] = az.ess(chains[algo])

## Time comparisons

In [None]:
# Initialize DataFrame to collect times
# One column per dataset name. Rows are added afterwards through label-based
# `times.at[row, column]` assignments holding floats parsed from the log
# files, with the same 'bnpmix_<algo>' / 'bayesmix_<algo>' row labels used
# for the ESS table.
times = pd.DataFrame(columns=datasets)

In [None]:
# Execution times of BNPmix: take the first log line containing the marker
# text and parse its fourth whitespace-separated token as a float. A log
# without such a line leaves the table untouched.
marker = "Estimation done in "
for data in datasets:
    for algo in bnpmix_algos:
        log_name = 'bnpmix_{}_{}.log'.format(data, algo)
        with open(os.path.join(log_fold, log_name), 'r') as log:
            hit = next((row for row in log if marker in row), None)
            if hit is not None:
                times.at['bnpmix_' + algo, data] = float(hit.split()[3])

In [None]:
# Execution times of bayesmix: take the first log line containing both
# "100%" and "Done"; its third whitespace-separated token, stripped of
# trailing "s" characters, is parsed as a float. A log without such a line
# leaves the table untouched.
for data in datasets:
    for algo in algorithms:
        log_name = 'bayesmix_{}_{}.log'.format(data, algo)
        with open(os.path.join(log_fold, log_name), 'r') as log:
            hit = next(
                (row for row in log if "100%" in row and "Done" in row),
                None)
            if hit is not None:
                elapsed = hit.split()[2].rstrip("s")
                times.at['bayesmix_' + algo, data] = float(elapsed)

## ESS-time ratio

In [None]:
# ESS divided by execution time, column by column (one column per dataset);
# rows are matched on their sampler labels.
ratios = pd.DataFrame(
    {name: ESS[name] / times[name] for name in ESS.columns})

## Display full metrics table

In [None]:
# Print, for each dataset, one table with a row per sampler and the columns
# ESS / times / ratios, rounded to three decimals.
# The source tables are looked up through an explicit mapping rather than by
# string name in globals(), so the cell does not depend on the variables
# living in the module namespace under exactly these names.
metric_tables = {'ESS': ESS, 'times': times, 'ratios': ratios}
metric_names = list(metric_tables)
for data in datasets:
    # Rows follow the ESS table; columns assigned below are aligned to it.
    df_all = pd.DataFrame(index=ESS.index, columns=metric_names)
    for metric in metric_names:
        df = metric_tables[metric]
        # The tables hold object-dtype cells, hence the cast before rounding.
        df_all[metric] = np.round(df[data].astype(float), 3)
    print(data, ":\n", df_all, "\n", sep="")

## Autocorrelation of number of clusters

In [None]:
# Autocorrelation plot of each BNPmix chain, read from
# '<csv_fold>/bnpmix_<dataset>_nclu_<algo>.csv' and saved as
# '<png_fold>/bnpmix_<dataset>_<algo>.png'.
size = 20  # font size shared by axis labels and title
for data in datasets:
    for algo in bnpmix_algos:
        csv_name = 'bnpmix_{}_nclu_{}.csv'.format(data, algo)
        n_clust = np.genfromtxt(os.path.join(csv_fold, csv_name))

        ax = az.plot_autocorr(n_clust)
        ax.set_xlabel("lag", size=size)
        ax.set_ylabel("autocorrelation", size=size)
        ax.set_title("BNPmix {} {}".format(data, algo), size=size)

        png_name = 'bnpmix_{}_{}.png'.format(data, algo)
        plt.savefig(os.path.join(png_fold, png_name),
                    dpi=300, bbox_inches='tight')

In [None]:
# Autocorrelation plot of each bayesmix chain held in bayesmix_num_clust,
# saved as '<png_fold>/bayesmix_<dataset>_<algo>.png'.
size = 20  # font size shared by axis labels and title
for data in datasets:
    chains = bayesmix_num_clust[data]
    for algo in algorithms:
        ax = az.plot_autocorr(chains[algo])
        ax.set_xlabel("lag", size=size)
        ax.set_ylabel("autocorrelation", size=size)
        ax.set_title("bayesmix {} {}".format(data, algo), size=size)

        png_name = 'bayesmix_{}_{}.png'.format(data, algo)
        plt.savefig(os.path.join(png_fold, png_name),
                    dpi=300, bbox_inches='tight')