# Bayesian Nested Mixture Model

This notebook illustrates how to build and train a Bayesian Nested Mixture Model with the [beer framework](https://github.com/beer-asr/beer).

In [1]:
# Add "beer" to the PYTHONPATH
import sys
sys.path.insert(0, '../')

import copy

import beer
import numpy as np
import torch

# For plotting.
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, gridplot
from bokeh.models import LinearAxis, Range1d
output_notebook()

# Convenience functions for plotting.
import plotting

%load_ext autoreload
%autoreload 2

## Data

As an illustration, we generate a synthetic data set composed of two normally distributed clusters. Both clusters have a dense (full) covariance matrix: the two dimensions are positively correlated in the first cluster and negatively correlated in the second.

In [2]:
# Fix the seed of NumPy's global random generator so that the synthetic
# data set (and every figure derived from it) is identical after a
# "Restart Kernel -> Run All".
SEED = 0
np.random.seed(SEED)

# First cluster: dense covariance, positively correlated dimensions.
mean = np.array([-5, 5])
cov = .5 * np.array([[.75, .5], [.5, 2.]])
data1 = np.random.multivariate_normal(mean, cov, size=200)

# Second cluster: dense covariance, negatively correlated dimensions.
mean = np.array([5, 5])
cov = 2 * np.array([[2, -.5], [-.5, .75]])
data2 = np.random.multivariate_normal(mean, cov, size=200)

# Merge everything to get the final data set, then shuffle the rows so
# that the samples of the two clusters are interleaved.
data = np.vstack([data1, data2])
np.random.shuffle(data)

# Global mean and per-dimension variance of the data, converted to float
# tensors; they are used to initialize the mixture.
data_mean = torch.from_numpy(data.mean(axis=0)).float()
data_var = torch.from_numpy(np.var(data, axis=0)).float()

In [3]:
# Center the figure on the data mean and extend it by two standard
# deviations (of the dimension with the largest variance) on each side.
mean = data.mean(axis=0)
var = data.var(axis=0)
std_dev = np.sqrt(max(var))
x_range, y_range = [
    (center - 2 * std_dev, center + 2 * std_dev) for center in mean
]

# Use the same (widest) range for both axes so that they share one scale.
lower_bounds, upper_bounds = zip(x_range, y_range)
global_range = (min(lower_bounds), max(upper_bounds))

fig = figure(
    title='Data',
    width=400,
    height=400,
    x_range=global_range,
    y_range=global_range,
)
fig.circle(data[:, 0], data[:, 1])

show(fig)

## Model Creation

We create several nested mixture models that differ in the covariance matrix of their (Normal) components: isotropic, diagonal or full, each of them either specific to every component or shared across components.

In [4]:
import yaml

# YAML template of a nested mixture: a "Mixture" whose components are a
# "MixtureSet" of `size` elements, each of them built from a "NormalSet" of
# 10 Normal densities. `cov_type` and `shared_cov` select the covariance
# structure of the Normal densities.
conf_template = '''
type: Mixture
prior_strength: 1.
components:
    type: MixtureSet
    size: {size}
    prior_strength: 1.
    components: 
        type: NormalSet
        size: 10
        covariance: {cov_type}
        shared_covariance: {shared_cov}
        prior_strength: 1.e-1
        noise_std: .5
'''

# One entry per model: (name, covariance type, shared covariance?).
# Keeping the name next to its settings guarantees that `names` and
# `confs` stay aligned.
model_settings = [
    ('m_gmm_iso', 'isotropic', False),
    ('m_gmm_diag', 'diagonal', False),
    ('m_gmm_full', 'full', False),
    ('m_gmm_iso_shared', 'isotropic', True),
    ('m_gmm_diag_shared', 'diagonal', True),
    ('m_gmm_full_shared', 'full', True),
]

confs = [
    conf_template.format(size=2, cov_type=cov_type, shared_cov=shared_cov)
    for _, cov_type, shared_cov in model_settings
]

names = [name for name, _, _ in model_settings]

# `yaml.load` without an explicit Loader is deprecated since PyYAML 5.1 and
# raises a TypeError from PyYAML 6.0 on. `yaml.safe_load` parses this plain
# configuration (mappings and scalars only) to the same dictionary.
# Each model is created from the data mean and a tenth of the data variance.
models = {
    name: beer.create_model(yaml.safe_load(conf), data_mean, .1 * data_var)
    for name, conf in zip(names, confs)
}

## Variational Bayes Training 

In [5]:
# Training setup: number of passes over the data, learning rate and the
# data as a float tensor.
epochs = 100
lrate = 1.
X = torch.from_numpy(data).float()

# One coordinate-ascent optimizer and one (initially empty) ELBO history
# per model, both indexed by the name of the model.
optims = {}
elbos = {}
for model_name, model in models.items():
    optims[model_name] = beer.BayesianModelCoordinateAscentOptimizer(
        model.mean_field_groups, lrate)
    elbos[model_name] = []

# At each epoch, every model is updated once using the whole data set.
for epoch in range(epochs):
    for name in models:
        model, optim = models[name], optims[name]
        optim.zero_grad()
        elbo = beer.evidence_lower_bound(model, X, datasize=len(X))
        elbo.natural_backward()
        # Keep track of the ELBO divided by the number of data points.
        elbos[name].append(float(elbo) / len(X))
        optim.step()

In [6]:
# Line color of each model in the ELBO figure.
colors = dict(
    m_gmm_iso='green',
    m_gmm_diag='blue',
    m_gmm_full='red',
    m_gmm_iso_shared='grey',
    m_gmm_diag_shared='brown',
    m_gmm_full_shared='black',
)

# One curve per model: value of the recorded ELBO at each training step.
fig = figure(
    title='ELBO',
    width=400,
    height=400,
    x_axis_label='step',
    y_axis_label='ln p(X)',
)
for model_name in elbos:
    elbo = elbos[model_name]
    steps = range(len(elbo))
    fig.line(steps, elbo, legend=model_name, color=colors[model_name])
fig.legend.location = 'bottom_right'

show(fig)

In [7]:
# Draw one figure per trained model (the data with the elements of the
# model's `modelset` on top) and arrange the figures in a grid with
# `n_cols` figures per row.
n_cols = 3
figs = []
for i, model_name in enumerate(models):
    fig = figure(title=model_name, x_range=global_range, y_range=global_range,
                 width=300, height=300)
    # The data is drawn once per figure: every call to `fig.circle` adds a
    # new renderer, so drawing it for each sub-mixture would only stack
    # identical copies of the same points.
    fig.circle(data[:, 0], data[:, 1], alpha=.1)
    model = models[model_name]
    for gmm in model.modelset:
        plotting.plot_gmm(fig, gmm, alpha=.5)
    # Start a new row of the grid every `n_cols` figures.
    if i % n_cols == 0:
        figs.append([])
    figs[-1].append(fig)
grid = gridplot(figs)
show(grid)

In [10]:
# Wrap the inner model set of the isotropic model into a SharedModelSet.
# NOTE(review): the second argument (3) is presumably the number of
# duplicates of the model set -- confirm against beer.SharedModelSet.
base_modelset = models['m_gmm_iso'].modelset.modelset
shared_mset = beer.SharedModelSet(base_modelset, 3)
shared_mset

<beer.models.mixtureset.SharedModelSet at 0x1c1e1566a0>

In [14]:
# Sufficient statistics of the data for the shared model set.
stats = shared_mset.sufficient_statistics(X)

# Result of calling the shared model set on the statistics (presumably the
# per-component expected log-likelihood, as the name suggests -- confirm).
pc_exp_llh = shared_mset(stats)

# End the cell with the shape so that it is displayed when the cell is run.
pc_exp_llh.shape


torch.Size([400, 60])

In [23]:
# Random, strictly positive weights: one plus squared standard Gaussian
# noise, for 400 rows and 60 columns.
noise = torch.randn(400, 60)
weights = 1 + noise ** 2

# Normalize every row so that it sums to one.
weights = weights / weights.sum(dim=-1, keepdim=True)
weights

tensor([[ 0.0094,  0.0092,  0.0122,  ...,  0.0111,  0.0108,  0.0169],
        [ 0.0192,  0.0081,  0.0320,  ...,  0.0142,  0.0120,  0.0279],
        [ 0.0102,  0.0075,  0.0092,  ...,  0.0076,  0.0078,  0.0254],
        ...,
        [ 0.0202,  0.0210,  0.0150,  ...,  0.0204,  0.0262,  0.0084],
        [ 0.0279,  0.0451,  0.0090,  ...,  0.0148,  0.0199,  0.0248],
        [ 0.0282,  0.0403,  0.0112,  ...,  0.0102,  0.0114,  0.0117]])

In [None]:
# Accumulate the statistics, passing the random weights as the parent
# message of the shared model set.
shared_mset.accumulate(stats, parent_msg=weights)

> /Users/lucasondel/Projects/beer/beer/models/mixtureset.py(138)accumulate()
-> new_weights = new_weights.sum(dim=1)
(Pdb) ll
130  	    def accumulate(self, s_stats, parent_msg=None):
131  	        s_stats = s_stats
132  	        if parent_msg is None:
133  	            raise ValueError('"parent_msg" should not be None')
134  	        weights = parent_msg
135  	        new_weights = weights.reshape(-1, self.n_duplicate, len(self._modelset))
136  	        import pdb
137  	        pdb.set_trace()
138  ->	        new_weights = new_weights.sum(dim=1)
139  	        return self._modelset.accumulate(s_stats, parent_msg=new_weights)
(Pdb) new_weights.shape
torch.Size([400, 3, 20])
(Pdb) new_weights.sum(dim=1)
tensor([[ 0.0277,  0.0293,  0.0362,  ...,  0.0627,  0.1095,  0.0353],
        [ 0.0839,  0.0512,  0.0784,  ...,  0.0469,  0.0355,  0.0480],
        [ 0.0756,  0.0432,  0.0318,  ...,  0.0667,  0.0694,  0.0676],
        ...,
        [ 0.0566,  0.0601,  0.0372,  ...,  0.0871,  0.0451,  0.029

(Pdb) ll
130  	    def accumulate(self, s_stats, parent_msg=None):
131  	        s_stats = s_stats
132  	        if parent_msg is None:
133  	            raise ValueError('"parent_msg" should not be None')
134  	        weights = parent_msg
135  	        new_weights = weights.reshape(-1, self.n_duplicate, len(self._modelset))
136  	        import pdb
137  	        pdb.set_trace()
138  ->	        new_weights = new_weights.sum(dim=1)
139  	        return self._modelset.accumulate(s_stats, parent_msg=new_weights)
(Pdb) new_weights = weights.reshape(-1, len(self._modelset), self.n_duplicate)
(Pdb) new_weights.shape
torch.Size([400, 20, 3])
(Pdb) new_weights
tensor([[[ 0.0094,  0.0092,  0.0122],
         [ 0.0090,  0.0324,  0.0264],
         [ 0.0091,  0.0128,  0.0093],
         ...,
         [ 0.0239,  0.0110,  0.0095],
         [ 0.0098,  0.0378,  0.0092],
         [ 0.0111,  0.0108,  0.0169]],

        [[ 0.0192,  0.0081,  0.0320],
         [ 0.0335,  0.0111,  0.0117],
         [ 0.0245,