In [None]:
import numpy as np
import matplotlib.pyplot as plt

from pybmix.core.mixing import DirichletProcessMixing
from pybmix.core.hierarchy import UnivariateNormal
from pybmix.core.mixture_model import MixtureModel

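Generate the data: 100 draws from a normal centered at 3 and 100 draws from a normal centered at -3 (both with unit standard deviation).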

In [None]:
# Use a seeded local generator so the synthetic sample, and every figure
# derived from it, is the same on each "Restart & Run All".
rng = np.random.default_rng(42)

# Two well-separated normal components (unit standard deviation, the NumPy
# default), 100 draws each, concatenated into a single 1-D sample.
y = np.concatenate(
    [rng.normal(loc=3, size=100), rng.normal(loc=-3, size=100)])

# Quick visual check of the bimodal sample.
plt.hist(y)
plt.show()

Define the mixture model

In [None]:
# Mixing component of the model, constructed with total_mass=2.
mixing = DirichletProcessMixing(total_mass=2)
# Component hierarchy: univariate normal.
hierarchy = UnivariateNormal()
# Fill in the hierarchy's fixed parameters from the data `y`.
# NOTE(review): the meaning of the second argument (2) is not visible in this
# notebook -- presumably the expected number of clusters; confirm against the
# pybmix API before changing it.
hierarchy.make_default_fixed_params(y, 2)
# Bundle mixing and hierarchy into the model object used by all later cells.
mixture = MixtureModel(mixing, hierarchy)

Run the MCMC sampler

In [None]:
# Run the sampler on `y` with niter=2000 and nburn=1000. Going by the shape
# this notebook documents for `estimate_density` ([niter - nburn, len(grid)]),
# 2000 - 1000 = 1000 draws are available to the cells below
# (presumably the first `nburn` iterations are discarded as burn-in -- confirm).
mixture.run_mcmc(y, niter=2000, nburn=1000)

Get the density estimates: fix a grid on which to evaluate the densities; the method `estimate_density` returns a matrix of shape `[niter - nburn, len(grid)]`, i.e. one row per retained MCMC iteration and one column per grid point.

In [None]:
from pybmix.estimators.density_estimator import DensityEstimator

In [None]:
# Evaluation points: 500 equally spaced abscissae spanning [-6, 6].
grid = np.linspace(start=-6, stop=6, num=500)

# Wrap the fitted model in a density estimator and evaluate it on the grid.
dens_est = DensityEstimator(mixture)
densities = dens_est.estimate_density(grid)

Plot some of the densities and their mean

In [None]:
# Draw everything on the current axes: the normalized histogram of the data,
# the average of the per-iteration densities, and a few single iterations.
ax = plt.gca()
ax.hist(y, density=True)

mean_density = np.mean(densities, axis=0)
ax.plot(grid, mean_density, lw=3, label="predictive density")

# Rows of `densities` to overlay as dashed curves.
idxs = [5, 100, 300]
for idx in idxs:
    ax.plot(grid, densities[idx, :], "--", label="iteration: {0}".format(idx))

ax.legend()
plt.show()

Plot the chain of the number of clusters (traceplot) together with its posterior distribution

In [None]:
mcmc_chain = mixture.get_chain()

# Extract the cluster allocations: a matrix of shape [niter - nburn, ndata],
# one row per retained iteration.
cluster_alloc_chain = mcmc_chain.extract("cluster_allocs")

# The number of clusters at an iteration is the number of distinct labels in
# the corresponding row.
n_clust_chain = np.apply_along_axis(lambda x: len(np.unique(x)), 1,
                                    cluster_alloc_chain)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))

# Traceplot: one short vertical tick per iteration, centered on the cluster
# count, so that repeated integer values remain visible.
axes[0].vlines(np.arange(len(n_clust_chain)), n_clust_chain - 0.3, n_clust_chain + 0.3)
axes[0].set_title("Traceplot")
axes[0].set_xlabel("retained iteration")
axes[0].set_ylabel("number of clusters")

# Support of the bar chart: 1..9 as a minimum, extended whenever the chain
# visits more clusters. With a fixed 1..9 grid, iterations with 10 or more
# clusters would be dropped and the remaining bars renormalized to sum to 1,
# overstating their probabilities.
clusgrid = np.arange(1, max(9, int(np.max(n_clust_chain))) + 1)

# counts[j] = number of iterations with exactly clusgrid[j] clusters
# (broadcast comparison of the chain against the whole grid).
counts = (n_clust_chain[:, np.newaxis] == clusgrid).sum(axis=0)

# Relative frequencies; the grid covers every visited value, so they sum to 1
# over the full chain.
probas = counts / np.sum(counts)
axes[1].bar(clusgrid, probas)
axes[1].set_title("Posterior number of clusters")
axes[1].set_xlabel("number of clusters")
axes[1].set_ylabel("posterior probability")

plt.show()

Find a point estimate for the clustering

In [None]:
from pybmix.estimators.cluster_estimator import ClusterEstimator

# Build a clustering estimator on top of the fitted mixture model and ask it
# for a single representative clustering of the data.
# NOTE(review): the criterion used to pick the point estimate is not visible
# in this notebook -- check the pybmix ClusterEstimator documentation.
clus_est = ClusterEstimator(mixture)
best_clust = clus_est.get_point_estimate()

In [None]:
# One set of axes for everything: a faded histogram of the data, the mean of
# the per-iteration densities, and one strip of points per estimated cluster
# drawn just above the x-axis (each scatter call takes the next cycle color).
ax = plt.gca()
ax.hist(y, density=True, alpha=0.3)
ax.plot(grid, np.mean(densities, axis=0), lw=3, label="predictive density")

for cluster_idx in clus_est.group_by_cluster(best_clust):
    data = y[cluster_idx]
    # Constant height 5e-3 for every observation in this cluster.
    heights = np.full_like(data, 5e-3)
    ax.scatter(data, heights)

plt.show()

Note how the posterior mode of the number of clusters is 3, but the point estimate for the best clustering consists of 2 clusters. (The exact figures may vary slightly between runs.)