# Bayesian Nested Mixture Model

This notebook illustrates how to build and train a Bayesian Nested Mixture Model with the [beer framework](https://github.com/beer-asr/beer).

In [1]:
# Add "beer" to the PYTHONPATH
import sys
sys.path.insert(0, '../')

import copy

import beer
import numpy as np
import torch

# For plotting.
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, gridplot
from bokeh.models import LinearAxis, Range1d
output_notebook()

# Convenience functions for plotting.
import plotting

%load_ext autoreload
%autoreload 2

## Data

As an illustration, we generate a synthetic data set composed of two Normally distributed clusters. Both clusters are drawn with a dense (full) covariance matrix; their off-diagonal terms have opposite signs, so the two clusters are oriented differently.

In [2]:
# Fix the random seeds so that "Restart Kernel -> Run All" regenerates the
# exact same data set. NumPy drives the sampling/shuffling below; torch is
# seeded as well so that torch-based random draws made after this point
# start from a known state.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# First cluster.
mean = np.array([-5, 5])
cov = .5 * np.array([[.75, .5], [.5, 2.]])
data1 = np.random.multivariate_normal(mean, cov, size=200)

# Second cluster.
mean = np.array([5, 5])
cov = 2 * np.array([[2, -.5], [-.5, .75]])
data2 = np.random.multivariate_normal(mean, cov, size=200)

# Merge everything to get the final data set; rows are shuffled in place so
# that the two clusters are interleaved.
data = np.vstack([data1, data2])
np.random.shuffle(data)

# Global mean and per-dimension variance of the data (as float32 tensors),
# used to initialize the mixtures.
data_mean = torch.from_numpy(data.mean(axis=0)).float()
data_var = torch.from_numpy(np.var(data, axis=0)).float()

In [3]:
# Centre and spread of the data: the figure shows +/- 2 standard deviations
# (along the widest dimension) around the mean.
mean, var = data.mean(axis=0), data.var(axis=0)
std_dev = np.sqrt(var.max())
half_width = 2 * std_dev
x_range = (mean[0] - half_width, mean[0] + half_width)
y_range = (mean[1] - half_width, mean[1] + half_width)

# Use one square range for both axes so the clusters are not distorted.
lower = min(x_range[0], y_range[0])
upper = max(x_range[1], y_range[1])
global_range = (lower, upper)

fig = figure(
    title='Data',
    width=400,
    height=400,
    x_range=global_range,
    y_range=global_range,
)
fig.circle(data[:, 0], data[:, 1])

show(fig)

## Model Creation

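We create several types of mixture models. Their (Normal) components have either an isotropic, a diagonal or a full covariance matrix, and for each covariance type we build one model where every component has its own covariance matrix and one model where the covariance matrix is shared across components.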

In [4]:
# Global mean and per-dimension variance of the data, used to initialize
# the mixtures.
data_mean = torch.from_numpy(data.mean(axis=0)).float()
data_var = torch.from_numpy(np.var(data, axis=0)).float()


def _create_gmm(cov_type, shared_cov=False):
    """Build one nested mixture model initialized from the data statistics.

    A set of 10 Normal densities is created with ``beer.NormalSet.create``,
    wrapped with ``beer.MixtureSet.create(2, ...)`` and finally turned into
    a ``beer.Mixture``.
    (Presumably this yields 2 inner mixtures over the 10 Normal
    components -- TODO confirm against the beer API.)

    Args:
        cov_type (str): Covariance type of the Normal components, one of
            'isotropic', 'diagonal' or 'full'.
        shared_cov (bool): If True, request a covariance shared across the
            components (``shared_cov=True``).

    Returns:
        The object returned by ``beer.Mixture.create``.
    """
    if shared_cov:
        # NOTE(review): the shared-covariance models are initialized with
        # the variance expanded to a diagonal matrix (``data_var.diag()``)
        # whereas the non-shared ones receive the variance vector. This
        # mirrors the original cell exactly; confirm which form
        # ``NormalSet.create`` expects.
        modelset = beer.NormalSet.create(data_mean, data_var.diag(), size=10,
                                         prior_strength=1., noise_std=1.,
                                         cov_type=cov_type, shared_cov=True)
    else:
        modelset = beer.NormalSet.create(data_mean, data_var, size=10,
                                         prior_strength=1., noise_std=1.,
                                         cov_type=cov_type)
    mixtureset = beer.MixtureSet.create(2, modelset)
    return beer.Mixture.create(mixtureset)


# The models are created in a fixed order (non-shared first, then shared)
# so that any random initialization consumes the random stream in the same
# sequence on every run.
m_gmm_iso = _create_gmm('isotropic')
m_gmm_diag = _create_gmm('diagonal')
m_gmm_full = _create_gmm('full')
m_gmm_iso_shared = _create_gmm('isotropic', shared_cov=True)
m_gmm_diag_shared = _create_gmm('diagonal', shared_cov=True)
m_gmm_full_shared = _create_gmm('full', shared_cov=True)

# All the models, keyed by name; the training and plotting cells iterate
# over this dictionary.
models = {
    'm_gmm_iso': m_gmm_iso,
    'm_gmm_diag': m_gmm_diag,
    'm_gmm_full': m_gmm_full,
    'm_gmm_iso_shared': m_gmm_iso_shared,
    'm_gmm_diag_shared': m_gmm_diag_shared,
    'm_gmm_full_shared': m_gmm_full_shared,
}

## Variational Bayes Training 

In [12]:
# Training configuration.
epochs = 100
lrate = 1.
X = torch.from_numpy(data).float()

# One optimizer and one ELBO history per model, both keyed by model name.
optims = {}
elbos = {}
for model_name, model in models.items():
    optims[model_name] = beer.BayesianModelCoordinateAscentOptimizer(
        model.mean_field_groups, lrate)
    elbos[model_name] = []

# Every epoch performs one update of each model on the whole data set.
for epoch in range(epochs):
    for name in models:
        model, optim = models[name], optims[name]
        optim.zero_grad()
        elbo = beer.evidence_lower_bound(model, X, datasize=len(X))
        elbo.natural_backward()
        # Store the ELBO normalized by the number of data points, before
        # the optimizer step is applied.
        elbos[name].append(float(elbo) / len(X))
        optim.step()

In [15]:
# Line colour of each model in the ELBO figure, keyed by model name.
colors = dict(
    m_gmm_iso='green',
    m_gmm_diag='blue',
    m_gmm_full='red',
    m_gmm_iso_shared='grey',
    m_gmm_diag_shared='brown',
    m_gmm_full_shared='black',
)

# Plot the ELBO: one curve per model, one point per training step.
fig = figure(
    title='ELBO',
    width=400,
    height=400,
    x_axis_label='step',
    y_axis_label='ln p(X)',
)
for model_name in elbos:
    print(model_name)
    elbo = elbos[model_name]
    steps = range(len(elbo))
    fig.line(steps, elbo, legend=model_name, color=colors[model_name])
fig.legend.location = 'bottom_right'

show(fig)

m_gmm_iso
m_gmm_diag
m_gmm_full
m_gmm_iso_shared
m_gmm_diag_shared
m_gmm_full_shared


In [16]:
# Draw one figure per model and lay the figures out on a grid.
figs_per_row = 3
figs = []
for model_idx, model_name in enumerate(models):
    print(model_name)
    fig = figure(title=model_name, x_range=global_range, y_range=global_range,
                 width=250, height=250)
    model = models[model_name]
    weights = model.weights.expected_value()
    # The inner loop must not reuse the outer loop index: the original code
    # named both `i`, so the row test below looked at the index of the last
    # inner mixture instead of the index of the model.
    for gmm in model.modelset:
        # NOTE(review): the data is re-drawn for every inner mixture and the
        # full `weights` value is passed for each of them; both are kept
        # as in the original -- confirm this is what `plotting.plot_gmm`
        # expects.
        fig.circle(data[:, 0], data[:, 1], alpha=.1)
        plotting.plot_gmm(fig, gmm, alpha=.5 * weights)
    # Start a new row of the grid every `figs_per_row` models.
    if model_idx % figs_per_row == 0:
        figs.append([])
    figs[-1].append(fig)
grid = gridplot(figs)
show(grid)

m_gmm_iso
tensor([ 0.3080,  0.6899,  0.0007,  0.0007,  0.0007])
tensor([ 0.9927,  0.0018,  0.0018,  0.0018,  0.0018])
m_gmm_diag
tensor([ 0.0020,  0.0020,  0.0020,  0.9920,  0.0020])
tensor([ 0.0007,  0.0007,  0.3348,  0.0007,  0.6632])
m_gmm_full
tensor([ 0.0010,  0.0010,  0.0010,  0.0010,  0.9960])
tensor([ 0.0010,  0.0010,  0.9960,  0.0010,  0.0010])
m_gmm_iso_shared
tensor([ 0.3215,  0.0007,  0.6764,  0.0007,  0.0007])
tensor([ 0.0019,  0.0019,  0.0019,  0.0019,  0.9922])
m_gmm_diag_shared
tensor([ 0.0020,  0.0020,  0.9918,  0.0020,  0.0020])
tensor([ 0.0007,  0.0007,  0.0007,  0.6639,  0.3341])
m_gmm_full_shared
tensor([ 0.5022,  0.0005,  0.2471,  0.2498,  0.0005])
tensor([ 0.2000,  0.2000,  0.2000,  0.2000,  0.2000])
