# Bayesian Mixture Model

This notebook illustrates how to build and train a Bayesian Mixture Model with the [beer framework](https://github.com/beer-asr/beer).

In [1]:
# Add "beer" to the PYTHONPATH: the parent directory is prepended to the
# module search path so that `import beer` below can be resolved from there.
import sys
sys.path.insert(0, '../')

# NOTE(review): `copy` (and `LinearAxis` / `Range1d` below) are not used by
# any of the cells visible in this notebook.
import copy

import beer
import numpy as np
import torch

# For plotting.
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, gridplot
from bokeh.models import LinearAxis, Range1d
# Configure bokeh so that `show(...)` renders the figures in the notebook.
output_notebook()

# Convenience functions for plotting (provides `plot_normal`, used below).
import plotting

# Automatically reload the imported modules before executing a cell, so
# that edits to their source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Data

As an illustration, we generate a synthetic data set composed of two normally distributed clusters. One has a diagonal covariance matrix whereas the other has a dense covariance matrix.

In [2]:
N = 200     # number of points drawn from each cluster
SEED = 0    # fixed seed: re-running the notebook regenerates the same data

# Dedicated, seeded random generator. Drawing from it (rather than from the
# unseeded global numpy state) makes the data set reproducible across runs.
rng = np.random.RandomState(SEED)

# First cluster (diagonal covariance matrix).
mean = np.array([-5, 5])
cov = .5 * np.array([[.75, 0.], [0, 5.]])
data1 = rng.multivariate_normal(mean, cov, size=N)

# Second cluster (dense covariance matrix).
mean = np.array([5, 5])
cov = 2 * np.array([[2, -.5], [-.5, .75]])
data2 = rng.multivariate_normal(mean, cov, size=N)

# Merge everything to get the final data set and shuffle the rows so that
# the points of the two clusters are interleaved.
data = np.vstack([data1, data2])
rng.shuffle(data)

# Prepare the data for pytorch (double precision tensor).
X = torch.from_numpy(data).double()

# Scatter plot of the generated data.
fig = figure(width=400, height=400,
             x_range=(-10, 10), y_range=(-5, 15))
fig.circle(data[:, 0], data[:, 1])

show(fig)

## Model Creation

In [3]:
# Per-dimension mean and variance of the data, as double precision tensors.
# They are used to set the mean/variance of the prior.
data_mean = torch.from_numpy(np.mean(data, axis=0)).double()
data_var = torch.from_numpy(data.var(axis=0)).double()

print(data_var.shape)

# Settings of the set of Normal components.
normalset_conf = dict(
    size=10,             # total number of components in the mixture
    prior_strength=1.,   # how much the prior affects the training ("pseudo-counts")
    noise_std=1.,        # standard deviation of the noise to initialize the mean of the posterior
    cov_type='full',     # type of the covariance matrix ('full', 'diagonal' or 'isotropic')
)
modelset = beer.NormalSet.create(data_mean, data_var, **normalset_conf)

# Mixture over the components; `prior_strength` is how much the prior over
# the weights will affect the training ("pseudo-counts"). `.double()` sets
# all the parameters in double precision.
#
# To run on a GPU, additionally call `model = model.cuda()`; if you do so,
# you'll have to move the data as well.
model = beer.Mixture.create(modelset, prior_strength=1.).double()

print(model)

torch.Size([2])
Mixture(
  (modelset): NormalSet(
    (means_precisions): ConjugateBayesianParameter(prior=NormalWishart, posterior=NormalWishart)
  )
  (categorical): Categorical(
    (weights): ConjugateBayesianParameter(prior=Dirichlet, posterior=Dirichlet)
  )
)


In [4]:
# `model.categorical.mean` as a numpy array; used below as the mixing weights
# of the (not yet trained) model.
weights = model.categorical.mean.numpy()

# Left panel: the data with every component drawn on top of it. The opacity
# passed to `plot_normal` scales with the weight of the component.
fig = figure(
    title='Components',
    width=400,
    height=400,
    x_range=(-10, 10),
    y_range=(-5, 15),
)
fig.circle(data[:, 0], data[:, 1], alpha=.5)
for weight, normal in zip(weights, model.modelset):
    plotting.plot_normal(
        fig,
        normal.mean.numpy(),
        normal.cov.numpy(),
        alpha=.5 * weight,
        color='green',
    )

# Right panel: bar chart of the mixing weights, one bar per component.
component_ids = range(len(weights))
fig2 = figure(title='Mixing weights', width=400, height=400, y_range=(-0.1, 1.1))
fig2.vbar(component_ids, width=.5, top=weights)
fig2.xaxis.ticker = list(component_ids)
fig2.xgrid.visible = False

show(gridplot([[fig, fig2]]))

## Variational Bayes Training 

In [5]:
# Training settings.
epochs = 100   # number of passes over the whole data set
lrate = 1.     # learning rate given to the optimizer

# Optimizer working on the mean-field factorization of the model.
optim = beer.VBConjugateOptimizer(model.mean_field_factorization(), lrate)

# One update per epoch; the ELBO of each epoch is stored after being
# normalized by the number of data points.
elbos = []
for epoch in range(epochs):
    optim.init_step()
    elbo = beer.evidence_lower_bound(model, X)
    elbo.backward()
    optim.step()
    elbos.append(float(elbo) / len(X))

# Plot the evolution of the ELBO (the value of the very first epoch is
# left out of the curve).
elbos_shown = elbos[1:]
fig = figure(
    width=400,
    height=400,
    x_axis_label='epoch',
    y_axis_label='ELBO',
)
fig.line(range(len(elbos_shown)), elbos_shown)
show(fig)

In [6]:
# `model.categorical.mean` as a numpy array; used below as the mixing weights
# of the model after training.
weights = model.categorical.mean.numpy()

# Left panel: the data with every component drawn on top of it. The opacity
# passed to `plot_normal` scales with the weight of the component.
fig = figure(
    title='Components',
    width=400,
    height=400,
    x_range=(-10, 10),
    y_range=(-5, 15),
)
fig.circle(data[:, 0], data[:, 1], alpha=.5)
for weight, normal in zip(weights, model.modelset):
    plotting.plot_normal(
        fig,
        normal.mean.numpy(),
        normal.cov.numpy(),
        alpha=.5 * weight,
        color='green',
    )

# Right panel: bar chart of the mixing weights, one bar per component.
component_ids = range(len(weights))
fig2 = figure(title='Mixing weights', width=400, height=400, y_range=(-0.1, 1.1))
fig2.vbar(component_ids, width=.5, top=weights)
fig2.xaxis.ticker = list(component_ids)
fig2.xgrid.visible = False

show(gridplot([[fig, fig2]]))

## Dirichlet Process Mixture Model

In [7]:
# Per-dimension mean and variance of the data, as double precision tensors.
# They are used to set the mean/variance of the prior.
data_mean = torch.from_numpy(np.mean(data, axis=0)).double()
data_var = torch.from_numpy(data.var(axis=0)).double()

# Settings of the set of Normal components.
normalset_conf = dict(
    size=20,             # total number of components in the mixture
    prior_strength=1.,   # how much the prior affects the training ("pseudo-counts")
    noise_std=1,         # standard deviation of the noise to initialize the mean of the posterior
    cov_type='full',     # type of the covariance matrix ('full', 'diagonal' or 'isotropic')
)
modelset = beer.NormalSet.create(data_mean, data_var, **normalset_conf)

# Stick-breaking categorical, truncated at the number of components.
stickbreaking = beer.SBCategorical.create(
    truncation=len(modelset),
    prior_strength=10,
)

# Mixture using the stick-breaking categorical for its weights;
# `prior_strength` is how much the prior over the weights will affect the
# training ("pseudo-counts"). `.double()` sets all the parameters in double
# precision.
#
# To run on a GPU, additionally call `model = model.cuda()`; if you do so,
# you'll have to move the data as well.
model = beer.Mixture.create(
    modelset,
    categorical=stickbreaking,
    prior_strength=1.,
).double()

print(model)

Mixture(
  (modelset): NormalSet(
    (means_precisions): ConjugateBayesianParameter(prior=NormalWishart, posterior=NormalWishart)
  )
  (categorical): SBCategorical(
    (stickbreaking): ConjugateBayesianParameter(prior=Dirichlet, posterior=Dirichlet)
  )
)


In [8]:
# `model.categorical.mean` as a numpy array; used below as the mixing weights
# of the (not yet trained) stick-breaking mixture.
weights = model.categorical.mean.numpy()

# Left panel: the data with every component drawn on top of it. The opacity
# passed to `plot_normal` scales with the weight of the component.
fig = figure(
    title='Components',
    width=400,
    height=400,
    x_range=(-10, 10),
    y_range=(-5, 15),
)
fig.circle(data[:, 0], data[:, 1], alpha=.5)
for weight, normal in zip(weights, model.modelset):
    plotting.plot_normal(
        fig,
        normal.mean.numpy(),
        normal.cov.numpy(),
        alpha=.5 * weight,
        color='green',
    )

# Right panel: bar chart of the mixing weights, one bar per component.
component_ids = range(len(weights))
fig2 = figure(title='Mixing weights', width=400, height=400, y_range=(-0.1, 1.1))
fig2.vbar(component_ids, width=.5, top=weights)
fig2.xaxis.ticker = list(component_ids)
fig2.xgrid.visible = False

show(gridplot([[fig, fig2]]))

In [9]:
# Training settings.
epochs = 100   # number of passes over the whole data set
lrate = 1.     # learning rate given to the optimizer

# Optimizer working on the mean-field factorization of the model.
optim = beer.VBConjugateOptimizer(model.mean_field_factorization(), lrate)

# One update per epoch; the ELBO of each epoch is stored after being
# normalized by the number of data points.
elbos = []
for epoch in range(epochs):
    optim.init_step()
    elbo = beer.evidence_lower_bound(model, X)
    elbo.backward()
    optim.step()
    elbos.append(float(elbo) / len(X))

# Plot the evolution of the ELBO (the value of the very first epoch is
# left out of the curve).
elbos_shown = elbos[1:]
fig = figure(
    width=400,
    height=400,
    x_axis_label='epoch',
    y_axis_label='ELBO',
)
fig.line(range(len(elbos_shown)), elbos_shown)
show(fig)

In [10]:
# `model.categorical.mean` as a numpy array; used below as the mixing weights
# of the stick-breaking mixture after training.
weights = model.categorical.mean.numpy()

# Left panel: the data with every component drawn on top of it. The opacity
# passed to `plot_normal` scales with the weight of the component.
fig = figure(
    title='Components',
    width=400,
    height=400,
    x_range=(-10, 10),
    y_range=(-5, 15),
)
fig.circle(data[:, 0], data[:, 1], alpha=.5)
for weight, normal in zip(weights, model.modelset):
    plotting.plot_normal(
        fig,
        normal.mean.numpy(),
        normal.cov.numpy(),
        alpha=.5 * weight,
        color='green',
    )

# Right panel: bar chart of the mixing weights, one bar per component.
component_ids = range(len(weights))
fig2 = figure(title='Mixing weights', width=400, height=400, y_range=(-0.1, 1.1))
fig2.vbar(component_ids, width=.5, top=weights)
fig2.xaxis.ticker = list(component_ids)
fig2.xgrid.visible = False

show(gridplot([[fig, fig2]]))

## Dirichlet Process Mixture Model with hyper-prior

In [40]:
# Per-dimension mean and variance of the data, as double precision tensors.
# They are used to set the mean/variance of the prior.
data_mean = torch.from_numpy(np.mean(data, axis=0)).double()
data_var = torch.from_numpy(data.var(axis=0)).double()

# Settings of the set of Normal components.
normalset_conf = dict(
    size=20,             # total number of components in the mixture
    prior_strength=1.,   # how much the prior affects the training ("pseudo-counts")
    noise_std=1,         # standard deviation of the noise to initialize the mean of the posterior
    cov_type='full',     # type of the covariance matrix ('full', 'diagonal' or 'isotropic')
)
modelset = beer.NormalSet.create(data_mean, data_var, **normalset_conf)

# Stick-breaking categorical with a hyper-prior, truncated at the number of
# components.
stickbreaking = beer.SBCategoricalHyperPrior.create(
    truncation=len(modelset),
    prior_strength=10.,
    hyper_prior_strength=1,
)

# Mixture using the stick-breaking categorical for its weights;
# `prior_strength` is how much the prior over the weights will affect the
# training ("pseudo-counts"). `.double()` sets all the parameters in double
# precision.
#
# To run on a GPU, additionally call `model = model.cuda()`; if you do so,
# you'll have to move the data as well.
model = beer.Mixture.create(
    modelset,
    categorical=stickbreaking,
    prior_strength=1.,
).double()

print(model)

Mixture(
  (modelset): NormalSet(
    (means_precisions): ConjugateBayesianParameter(prior=NormalWishart, posterior=NormalWishart)
  )
  (categorical): SBCategoricalHyperPrior(
    (stickbreaking): ConjugateBayesianParameter(prior=Dirichlet, posterior=Dirichlet)
    (concentration): ConjugateBayesianParameter(prior=Gamma, posterior=Gamma)
  )
)


In [41]:
def gamma_lh(x, pdf):
    """Evaluate the density of a Gamma distribution.

    Args:
        x: Point(s) at which the density is evaluated. It goes through
            ``np.log``, so a scalar or a numpy array is accepted.
        pdf: Object exposing the parameters of the distribution as
            ``pdf.params.shape`` and ``pdf.params.rate``; both are
            converted with ``float()``.

    Returns:
        The Gamma density at ``x``, i.e. the exponential of
        ``shape * log(rate) - gammaln(shape) + (shape - 1) * log(x) - rate * x``.
    """
    from scipy.special import gammaln

    shape = float(pdf.params.shape)
    rate = float(pdf.params.rate)

    # Accumulate the log-density one term at a time.
    log_lh = -rate * x
    log_lh = log_lh + (shape - 1) * np.log(x)
    log_lh = log_lh - gammaln(shape)
    log_lh = log_lh + shape * np.log(rate)
    return np.exp(log_lh)

# `model.categorical.mean` as a numpy array; used below as the mixing weights
# of the (not yet trained) model.
weights = model.categorical.mean.numpy()

# First panel: the data with every component drawn on top of it. The opacity
# passed to `plot_normal` scales with the weight of the component.
fig = figure(
    title='Components',
    width=400,
    height=400,
    x_range=(-10, 10),
    y_range=(-5, 15),
)
fig.circle(data[:, 0], data[:, 1], alpha=.5)
for weight, normal in zip(weights, model.modelset):
    plotting.plot_normal(
        fig,
        normal.mean.numpy(),
        normal.cov.numpy(),
        alpha=.5 * weight,
        color='green',
    )

# Second panel: bar chart of the mixing weights, one bar per component.
component_ids = range(len(weights))
fig2 = figure(title='Mixing weights', width=400, height=400, y_range=(-0.1, 1.1))
fig2.vbar(component_ids, width=.5, top=weights)
fig2.xaxis.ticker = list(component_ids)
fig2.xgrid.visible = False

# Third panel: density of the prior (blue) and of the posterior (green)
# over the concentration, evaluated on a regular grid and drawn with a
# logarithmic x-axis.
fig3 = figure(
    title='Prior/posterior over the concentration',
    width=400,
    height=400,
    x_axis_type='log',
)
c = np.linspace(0.01, 10, 1000)
concentration = model.categorical.concentration
prior_lh = gamma_lh(c, concentration.prior)
posterior_lh = gamma_lh(c, concentration.posterior)
for density, color in ((prior_lh, 'blue'), (posterior_lh, 'green')):
    fig3.line(c, density, color=color)

show(gridplot([[fig, fig2, fig3]]))

In [42]:
# Training settings.
epochs = 100   # number of passes over the whole data set
lrate = 1.     # learning rate given to the optimizer

# Optimizer working on the mean-field factorization of the model.
optim = beer.VBConjugateOptimizer(model.mean_field_factorization(), lrate)

# One update per epoch; the ELBO of each epoch is stored after being
# normalized by the number of data points.
elbos = []
for epoch in range(epochs):
    optim.init_step()
    elbo = beer.evidence_lower_bound(model, X)
    elbo.backward()
    optim.step()
    elbos.append(float(elbo) / len(X))

# Plot the evolution of the ELBO (the value of the very first epoch is
# left out of the curve).
elbos_shown = elbos[1:]
fig = figure(
    width=400,
    height=400,
    x_axis_label='epoch',
    y_axis_label='ELBO',
)
fig.line(range(len(elbos_shown)), elbos_shown)
show(fig)

In [43]:
# `model.categorical.mean` as a numpy array; used below as the mixing weights
# of the model after training.
weights = model.categorical.mean.numpy()

# First panel: the data with every component drawn on top of it. The opacity
# passed to `plot_normal` scales with the weight of the component.
fig = figure(
    title='Components',
    width=400,
    height=400,
    x_range=(-10, 10),
    y_range=(-5, 15),
)
fig.circle(data[:, 0], data[:, 1], alpha=.5)
for weight, normal in zip(weights, model.modelset):
    plotting.plot_normal(
        fig,
        normal.mean.numpy(),
        normal.cov.numpy(),
        alpha=.5 * weight,
        color='green',
    )

# Second panel: bar chart of the mixing weights, one bar per component.
component_ids = range(len(weights))
fig2 = figure(title='Mixing weights', width=400, height=400, y_range=(-0.1, 1.1))
fig2.vbar(component_ids, width=.5, top=weights)
fig2.xaxis.ticker = list(component_ids)
fig2.xgrid.visible = False

# Third panel: density of the prior (blue) and of the posterior (green)
# over the concentration, evaluated on a regular grid and drawn with a
# logarithmic x-axis.
fig3 = figure(
    title='Prior/posterior over the concentration',
    width=400,
    height=400,
    x_axis_type='log',
)
c = np.linspace(0.01, 5, 1000)
concentration = model.categorical.concentration
prior_lh = gamma_lh(c, concentration.prior)
posterior_lh = gamma_lh(c, concentration.posterior)
for density, color in ((prior_lh, 'blue'), (posterior_lh, 'green')):
    fig3.line(c, density, color=color)

show(gridplot([[fig, fig2, fig3]]))

In [44]:
# Display the natural parameters of the prior of the stick-breaking parameter.
stickbreaking_prior = model.categorical.stickbreaking.prior
stickbreaking_prior.natural_parameters()

tensor([[ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786],
        [ 0.0000, -0.6786]], dtype=torch.float64)