# Bayesian Mixture Model

This notebook illustrate how to build and train a Bayesian Mixture Model with the [beer framework](https://github.com/beer-asr/beer).

In [1]:
# Add "beer" to the PYTHONPATH
from collections import defaultdict
import random
import sys
sys.path.insert(0, '../')

import copy

import beer
import numpy as np
import torch

# For plotting.
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, gridplot
from bokeh.models import LinearAxis, Range1d
output_notebook()

# Convenience functions for plotting.
import plotting

%load_ext autoreload
%autoreload 2

## Data

As an illustration, we generate a synthetic data set composed of two Normal distributed cluster. One has a diagonal covariance matrix whereas the other has a dense covariance matrix.

In [2]:
N = 100

# First cluster.
mean = np.array([-5, 5]) 
cov = .5 *np.array([[.75, 0.], [0, 5.]])
data1 = np.random.multivariate_normal(mean, cov, size=N)

# Second cluster.
mean = np.array([5, 5]) 
cov = 2 * np.array([[2, -.5], [-.5, .75]])
data2 = np.random.multivariate_normal(mean, cov, size=N)

# Merge everything to get the finale data set.
data = np.vstack([data1, data2])
np.random.shuffle(data)

# First cluster.
mean = np.array([-5, 5]) 
cov = .5 *np.array([[.75, 0.], [0, 5.]])
data1 = np.random.multivariate_normal(mean, cov, size=N)

# Second cluster.
mean = np.array([5, 5]) 
cov = 2 * np.array([[2, -.5], [-.5, .75]])
data2 = np.random.multivariate_normal(mean, cov, size=N)

# Merge everything to get the finale data set.
test_data = np.vstack([data1, data2])
np.random.shuffle(test_data)

print(data.shape)

(200, 2)


In [3]:
# Mean, variance of the data to scale the figure.
mean = data.mean(axis=0)
var = data.var(axis=0)
std_dev = np.sqrt(max(var))
x_range = (mean[0] - 2 * std_dev, mean[0] + 2 * std_dev)
y_range = (mean[1] - 2 * std_dev, mean[1] + 2 * std_dev)
global_range = (min(x_range[0], y_range[0]), max(x_range[1], y_range[1]))

fig = figure(width=300, height=300,
             x_range=global_range, y_range=global_range)
fig.circle(data[:, 0], data[:, 1])

show(fig)

## Model Creation

We create two types of mixture model: one whose (Normal) components have full covariance matrix and the other whose (Normal) components have diagonal covariance matrix.

In [56]:
# We use the global mean/cov. matrix of the data to initialize the mixture.
data_mean = torch.from_numpy(data.mean(axis=0)).double()
data_var = torch.from_numpy(np.var(data, axis=0)).double()
print(data_mean), print(data_var)

noise_std = 1
shared_cov = False
cov_type = 'full'
ncomps = 20

modelset = beer.NormalSet.create(data_mean, data_var, size=ncomps,
                                prior_strength=1., noise_std=noise_std, 
                                cov_type=cov_type, shared_cov=False)
model = beer.Mixture.create(modelset, prior_strength=1).double()

tensor([-0.0951,  4.9145], dtype=torch.float64)
tensor([25.4187,  2.1149], dtype=torch.float64)


## Variational Bayes Training 

In [57]:
nbatches = 1
X = torch.from_numpy(data).double()
test_X = torch.from_numpy(test_data).double()
batches = X.view(nbatches, -1, 2)
batches.shape

torch.Size([1, 200, 2])

In [58]:
epochs = 500
lrate = 1.

optim = beer.VariationalBayesOptimizer(model.mean_field_factorization(), lrate)
    
elbos = []
for epoch in range(epochs):
    batch_ids = list(range(len(batches)))
    random.shuffle(batch_ids)
    for batch_id in batch_ids:
        batch = batches[batch_id]
        optim.init_step()
        elbo = beer.evidence_lower_bound(model, batch, datasize=len(X))
        elbo.backward()
        optim.step()
        elbos.append(float(elbo) / len(X))

In [59]:
fig = figure(width=400, height=400, x_axis_label='epoch',
              y_axis_label='ELBO')
fig.line(range(len(elbos)), elbos)
show(fig)

In [60]:
weights = model.weights.expected_value().numpy()
fig = figure()
fig.circle(data[:, 0], data[:, 1])
for weight, normal in zip(weights, model.modelset):
    mean = normal.expected_mean.numpy()
    cov = normal.expected_cov.numpy()
    plotting.plot_normal(fig, mean, cov, alpha=.5 * weight)
show(fig)

In [19]:
model.modelset.means_precision.posterior.means[0], model.modelset.means_precision.posterior.means[0] 

(tensor([-4.9587,  5.0462], dtype=torch.float64),
 tensor([-4.9587,  5.0462], dtype=torch.float64))

In [11]:
model.modelset.means_precision.posterior

JointNormalGamma(
  (params): JointNormalGammaStdParams(means=tensor([[-0.0848,  4.9653],
          [-0.0848,  4.9653],
          [-0.0848,  4.9653],
          [-0.0848,  4.9653],
          [-0.0848,  4.9653],
          [-0.0848,  4.9653],
          [-0.0848,  4.9653],
          [-0.0848,  4.9653],
          [-0.0848,  4.9653],
          [-0.0848,  4.9653]], dtype=torch.float64), scales=tensor([  1.0000,   1.0000, 200.9999,   1.0000,   1.0000,   1.0000,   1.0000,
            1.0000,   1.0000,   1.0000], dtype=torch.float64), shape=tensor(101.0000, dtype=torch.float64), rates=tensor([2575.6676,  187.8899], dtype=torch.float64))
)

In [9]:
# We use the global mean/cov. matrix of the data to initialize the mixture.
data_mean = torch.from_numpy(data.mean(axis=0)).double()
data_var = torch.from_numpy(np.var(data, axis=0)).double()
#data_mean = torch.zeros(2).double()
#data_var = torch.ones(2).double()
noise_std = .1

# Isotropic covariance.
modelset = beer.NormalSet.create(data_mean, data_var, size=10,
                                prior_strength=1., noise_std=noise_std, 
                                cov_type='isotropic')
gmm_iso = beer.Mixture.create(modelset, prior_strength=1.)

# Diagonal covariance.
modelset = beer.NormalSet.create(data_mean, data_var, size=10,
                                prior_strength=1., noise_std=noise_std, 
                                cov_type='diagonal')
gmm_diag = beer.Mixture.create(modelset, prior_strength=1.)

# Full covariance.
modelset = beer.NormalSet.create(data_mean, data_var, size=10,
                                prior_strength=1., noise_std=noise_std, 
                                cov_type='full')
gmm_full = beer.Mixture.create(modelset, prior_strength=1.)

# Shared isotropic covariance.
modelset = beer.NormalSet.create(data_mean, data_var.diag(), size=10,
                                 prior_strength=1., noise_std=noise_std,
                                 cov_type='isotropic', shared_cov=True)
gmm_iso_shared = beer.Mixture.create(modelset, prior_strength=1.)

# Shared diagonal covariance.
modelset = beer.NormalSet.create(data_mean, data_var.diag(), size=10,
                                 prior_strength=1., noise_std=noise_std,
                                 cov_type='diagonal', shared_cov=True)
gmm_diag_shared = beer.Mixture.create(modelset, prior_strength=1.)

# Shared full covariance.
modelset = beer.NormalSet.create(data_mean, data_var.diag(), size=10,
                                 prior_strength=1., noise_std=noise_std,
                                 cov_type='full', shared_cov=True)
gmm_full_shared = beer.Mixture.create(modelset, prior_strength=1.)

models = {
    'gmm_iso': gmm_iso,
    'gmm_diag': gmm_diag,
    'gmm_full': gmm_full,
    #'gmm_iso_shared': gmm_iso_shared,
    #'gmm_diag_shared': gmm_diag_shared,
    #'gmm_full_shared': gmm_full_shared,
}

In [10]:
nbatches = len(X)
X = torch.from_numpy(data).double()
test_X = torch.from_numpy(test_data).double()
batches = X.view(nbatches, -1, 2)
batches.shape

torch.Size([600, 1, 2])

In [11]:
iters = 100

batch_stats = {model_name: defaultdict(lambda: defaultdict(lambda: None))
               for model_name in models}
optims = {model_name: beer.CVBOptimizer(model.bayesian_parameters())
          for model_name, model in models.items()}
elbos = {model_name: [] for model_name in models}  

for epoch in range(iters):
    batch_ids = list(range(len(batches)))
    random.shuffle(batch_ids)
    for name, model in models.items():
        for batch_id in batch_ids:
            optim = optims[name]
            optim.init_step(batch_stats[name][batch_id])
            elbo = beer.collapsed_evidence_lower_bound(model, batches[batch_id])
            batch_stats[name][batch_id] = elbo.backward()
            optim.step()
        log_pred = beer.evidence_lower_bound(model, X)
        elbos[name].append(float(log_pred) / len(X))

KeyboardInterrupt: 

In [None]:
colors = {
    'gmm_iso': 'green',
    'gmm_diag': 'blue',
    'gmm_full': 'red',
    'gmm_iso_shared': 'grey',
    'gmm_diag_shared': 'brown',
    'gmm_full_shared': 'black'
    
}
# Plot the ELBO.
elbo_fig = figure(width=500, height=500, x_axis_label='epoch',
                  y_axis_label='ELBO (VBI)')
for model_name, elbo in elbos.items():
    elbo_fig.line(range(len(elbo)), elbo, line_dash='dashed', legend=model_name + ' (CVB)',
                  color=colors[model_name])
for model_name, elbo in svb_elbos.items():
    elbo_fig.line(range(len(elbo)), elbo, legend=model_name + ' (VBI)',
                  color=colors[model_name])
elbo_fig.legend.location = 'bottom_right'

show(elbo_fig)

In [None]:
2 

In [None]:
gmm_full = models['gmm_full']
gmm_full.modelset.means_precisions[2].posterior.natural_parameters

In [None]:
figs = []
for i, model_name in enumerate(models):
    model = models[model_name]
    fig = figure(x_range=global_range, y_range=global_range,
              width=250, height=250)
    fig.circle(data[:, 0], data[:, 1], alpha=.1)
    plotting.plot_gmm(fig, model, alpha=.5, color=colors[model_name])
    show(fig)

In [None]:
models['gmm_full'].weights.expected_value()

In [None]:
joint_nparams = gmm_full_shared.modelset.means_precision.posterior.natural_parameters
np1, np2 = gmm_full_shared.modelset._split_natural_parameters(joint_nparams)
np1 = torch.ones(len(np2), 1, dtype=np1.dtype) * np1.view(1, -1)
np1.shape, np2.shape
nparams1 = torch.cat([
    np1[:, :-1], 
    np2,
    np1[:, -1].view(-1, 1)
], dim=1)
nparams1 = nparams1[None]

In [None]:
stats = gmm_full_shared.modelset.sufficient_statistics(X)
nparams2 = stats[:, None, :] + nparams1
nparams2.shape

In [None]:
post = gmm_full_shared.modelset.means_precision.posterior
m_llh = post.joint_log_norm(nparams2) - post.joint_log_norm(nparams1)
m_llh

In [None]:
m_llh[:10]

In [None]:
gmm_full_shared.modelset.marginal_log_likelihood(stats)