# Bayesian Nested Mixture Model

This notebook illustrate how to build and train a Bayesian Nested Mixture Model with the [beer framework](https://github.com/beer-asr/beer).

In [1]:
# Add "beer" to the PYTHONPATH
import sys
sys.path.insert(0, '../')

import copy

import beer
import numpy as np
import torch

# For plotting.
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, gridplot
from bokeh.models import LinearAxis, Range1d
output_notebook()

# Convenience functions for plotting.
import plotting

%load_ext autoreload
%autoreload 2

## Data

As an illustration, we generate a synthetic data set composed of two Normal distributed cluster. One has a diagonal covariance matrix whereas the other has a dense covariance matrix.

In [2]:
# First cluster.
mean = np.array([-5, 5]) 
cov = .5 *np.array([[.75, .5], [.5, 2.]])
data1 = np.random.multivariate_normal(mean, cov, size=200)

# Second cluster.
mean = np.array([5, 5]) 
cov = 2 * np.array([[2, -.5], [-.5, .75]])
data2 = np.random.multivariate_normal(mean, cov, size=200)

# Merge everything to get the finale data set.
data = np.vstack([data1, data2])
np.random.shuffle(data)

# We use the global mean/cov. matrix of the data to initialize the mixture.
data_mean = torch.from_numpy(data.mean(axis=0)).float()
data_var = torch.from_numpy(np.var(data, axis=0)).float()

In [3]:
# Mean, variance of the data to scale the figure.
mean = data.mean(axis=0)
var = data.var(axis=0)
std_dev = np.sqrt(max(var))
x_range = (mean[0] - 2 * std_dev, mean[0] + 2 * std_dev)
y_range = (mean[1] - 2 * std_dev, mean[1] + 2 * std_dev)
global_range = (min(x_range[0], y_range[0]), max(x_range[1], y_range[1]))

fig = figure(title='Data', width=400, height=400,
             x_range=global_range, y_range=global_range)
fig.circle(data[:, 0], data[:, 1])

show(fig)

## Model Creation

We create two types of mixture model: one whose (Normal) components have full covariance matrix and the other whose (Normal) components have diagonal covariance matrix.

In [4]:
nmixtures = 4
ncomp_per_mixture = 3
total_components = nmixtures * ncomp_per_mixture

# We use the global mean/cov. matrix of the data to initialize the mixture.
data_mean = torch.from_numpy(data.mean(axis=0)).float()
data_var = torch.from_numpy(np.var(data, axis=0)).float()

# Isotropic covariance.
modelset = beer.NormalSet.create(
    data_mean, data_var, 
    size=total_components,
    prior_strength=1., 
    noise_std=1., 
    cov_type='isotropic'
)
mixtureset = beer.MixtureSet.create(nmixtures, modelset)
m_gmm_iso = beer.Mixture.create(mixtureset)

# Diagonal covariance.
modelset = beer.NormalSet.create(
    data_mean, data_var, 
    size=total_components,
    prior_strength=1., 
    noise_std=1., 
    cov_type='diagonal'
)
mixtureset = beer.MixtureSet.create(nmixtures, modelset)
m_gmm_diag = beer.Mixture.create(mixtureset)

# Full covariance.
modelset = beer.NormalSet.create(
    data_mean, data_var,
    size=total_components,
    prior_strength=1.,
    noise_std=1., 
    cov_type='full'
)
mixtureset = beer.MixtureSet.create(nmixtures, modelset)
m_gmm_full = beer.Mixture.create(mixtureset)


models = {
    'm_gmm_iso': m_gmm_iso,
    'm_gmm_diag': m_gmm_diag,
    'm_gmm_full': m_gmm_full
}

In [5]:
print(m_gmm_iso)

Mixture(
  (modelset): MixtureSet(
    (categoricalset): CategoricalSet(
      (weights): ConjugateBayesianParameter(prior=Dirichlet, posterior=Dirichlet)
    )
    (modelset): NormalSet(
      (means_precisions): ConjugateBayesianParameter(prior=IsotropicNormalGamma, posterior=IsotropicNormalGamma)
    )
  )
  (categorical): Categorical(
    (weights): ConjugateBayesianParameter(prior=Dirichlet, posterior=Dirichlet)
  )
)


## Variational Bayes Training 

In [6]:
epochs = 200
lrate = 1.
X = torch.from_numpy(data).float()

optims = {
    model_name: beer.VBConjugateOptimizer(
        model.mean_field_factorization(), 
        lrate
    )
    for model_name, model in models.items()
}

elbos = {
    model_name: [] 
    for model_name in models
}  
    
for epoch in range(epochs):
    for name, model in models.items():
        optim = optims[name]
        optim.init_step()
        elbo = beer.evidence_lower_bound(model, X, datasize=len(X))
        elbo.backward()
        elbos[name].append(float(elbo) / len(X))
        optim.step()

In [7]:
colors = {
    'm_gmm_iso': 'green',
    'm_gmm_diag': 'blue',
    'm_gmm_full': 'red',
    'm_gmm_iso_shared': 'grey',
    'm_gmm_diag_shared': 'brown',
    'm_gmm_full_shared': 'black'
}
# Plot the ELBO.
fig = figure(title='ELBO', width=400, height=400, x_axis_label='step',
              y_axis_label='ln p(X)')
for model_name, elbo in elbos.items():
    fig.line(range(len(elbo)), elbo, legend=model_name, color=colors[model_name])
fig.legend.location = 'bottom_right'

show(fig)

In [8]:
figs = []
for i, model_name in enumerate(models):
    fig = figure(title=model_name, x_range=global_range, y_range=global_range,
                  width=250, height=250)
    model = models[model_name]
    weights = model.categorical.mean
    for j, gmm in enumerate(model.modelset):
        fig.circle(data[:, 0], data[:, 1], alpha=.1)
        plotting.plot_gmm(fig, gmm, alpha=weights[j].numpy())
    if i % 3 == 0:
        figs.append([])
    figs[-1].append(fig)
grid = gridplot(figs)
show(grid)

## Hierarchical Dirichlet Process Mixture Model



In [227]:
import pickle 
with open('/home/lucas/Desktop/test.mdl', 'rb') as f:
    ploop = pickle.load(f)
    
sb_categoricalset = beer.SBCategoricalSet.create(len(ploop.start_pdf), ploop.categorical, prior_strength=1)
sb_categoricalset.mean_field_factorization()
sb_categoricalset.stickbreaking

ConjugateBayesianParameter(prior=Dirichlet, posterior=Dirichlet)

In [228]:
ploop.categorical.mean

tensor([8.4794e-02, 6.8245e-02, 5.9572e-02, 5.3810e-02, 4.9531e-02, 4.5954e-02,
        4.2675e-02, 3.9621e-02, 3.6804e-02, 3.4056e-02, 3.1768e-02, 2.9965e-02,
        2.8356e-02, 2.7020e-02, 2.5932e-02, 2.4905e-02, 2.4000e-02, 2.3210e-02,
        2.2438e-02, 2.1641e-02, 2.0904e-02, 2.0179e-02, 1.9428e-02, 1.8530e-02,
        1.7518e-02, 1.6477e-02, 1.5243e-02, 1.3951e-02, 1.2607e-02, 1.1147e-02,
        9.8267e-03, 8.5810e-03, 7.4084e-03, 6.3655e-03, 5.4483e-03, 4.6224e-03,
        3.8511e-03, 3.1464e-03, 2.5248e-03, 1.9762e-03, 1.5367e-03, 1.1640e-03,
        8.9237e-04, 6.6523e-04, 4.6442e-04, 3.0076e-04, 2.0410e-04, 1.2422e-04,
        8.2237e-05, 4.6845e-05, 2.8373e-05, 1.8312e-05, 9.2043e-06, 5.7885e-06,
        4.8481e-06, 3.9383e-06, 3.3966e-06, 2.9706e-06, 2.8055e-06, 2.7001e-06,
        2.6145e-06, 2.5215e-06, 2.4651e-06, 2.4145e-06, 2.3644e-06, 2.3170e-06,
        2.2709e-06, 2.2260e-06, 2.1822e-06, 2.1393e-06, 2.0973e-06, 2.0562e-06,
        2.0159e-06, 1.9763e-06, 1.9376e-

In [218]:
mean[1], cmean[1]

(array([8.61596548e-006, 4.81593179e-007, 5.63082438e-008, 9.25023441e-009,
        1.84539957e-009, 3.81240394e-010, 7.12392108e-011, 1.16507117e-011,
        1.67977571e-012, 1.86819975e-013, 2.24481638e-014, 3.37067856e-015,
        5.06011142e-016, 8.82834017e-017, 1.86643520e-017, 3.79700808e-018,
        8.34534920e-019, 2.01634388e-019, 4.57658196e-020, 8.85935859e-021,
        1.73451477e-021, 3.10244227e-022, 4.57219121e-023, 3.76939999e-024,
        1.66482282e-025, 4.51133237e-027, 3.30566788e-029, 7.57021960e-032,
        3.63127010e-035, 1.11137993e-039, 6.47749331e-045, 2.48101080e-051,
        2.41776757e-059, 6.00272969e-069, 1.96000117e-080, 1.12169959e-094,
        1.70291767e-113, 9.42835740e-139, 9.74571803e-173, 1.72182973e-220,
        2.47772272e-283, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00

In [229]:
from scipy.special import gamma


def beta(x, a, b):
    norm = gamma(a + b) / (gamma(a) * gamma(b))
    return (x ** (a - 1) * (1 - x)**(b-1)) * norm

x = np.linspace(1e-3, 0.999, 1000)
mean = ploop.categorical.mean.numpy()
cmean = mean.cumsum()
concentration = 1

fig = figure(y_range=(0, 1))

for i in range(10):
    p_x = beta(x, concentration * mean[i], concentration * (1 - cmean[i]))
    if i == 0:
        fig.line(x, p_x, color='red')
    else:
        fig.line(x, p_x)
show(fig)


In [None]:
def hdp_sb(mean, concentration):
    cmean = np.cumsum(mean)
    v = np.array([np.random.beta(concentration * mean[i], concentration * (1 - cmean[i]))
                  for i in range(101)])
    residual = np.cumprod(1 - v)
    pi = v
    pi[1:] *= residual[:-1]
    return pi

samples = np.c_[[hdp_sb(mean, concentration=.1) for i in range(50)]]

fig = figure()


for sample in samples:
    fig.line(range(101), sample, alpha=.3)
fig.line(range(101), mean, color='red')
fig.line(range(101), samples.mean(axis=0), color='green')

show(fig)

In [224]:
sb_categoricalset = beer.SBCategoricalSet.create(len(ploop.start_pdf), ploop.categorical, prior_strength=1)
vbinit = sb_categoricalset.mean.numpy()

fig = figure()
fig.line(range(101), mean, color='red')
fig.line(range(101), samples.mean(axis=0), color='green')
fig.line(range(101), vbinit.mean(axis=0), color='blue')

show(fig)

RuntimeError: Columns need to be 1D (y is not)

In [225]:
bploop = beer.BigramPhoneLoop.create(ploop.graph, ploop.start_pdf, ploop.end_pdf,
                                     ploop.modelset, sb_categoricalset).double()

X = torch.from_numpy(np.load('/home/lucas/Desktop/mzmb0_sx176.npy'))
model = bploop
epochs = 10
optim =  beer.VBConjugateOptimizer(model.mean_field_factorization(), lrate=1)
elbos = []
    
for epoch in range(epochs):
    optim.init_step()
    elbo = beer.evidence_lower_bound(model, X)
    elbo.backward()
    elbos.append(float(elbo) / len(X))
    optim.step()
    print(elbos[-1])

-323.0485620390909
-62.44699905573244
-62.434973057300446
-62.43495835227181
-62.434914806861045
-62.434653179197646
-62.43129272985406
-62.4209929334459
-62.42076006240268
-62.42076006170522


In [226]:
mean = bploop.categoricalset.mean.numpy()
print(mean.sum())

fig = figure(x_range=(0, 101), y_range=(0, 101))
fig.image(image=[mean], x=0, y=0, dh=101, dw=101)
show(fig)

14.472890411808521


In [213]:
mean[0]

array([3.16963652e-006, 1.77168232e-007, 2.07146452e-008, 3.40297126e-009,
       6.78884574e-010, 1.40250512e-010, 2.62074401e-011, 4.28605689e-012,
       6.17954930e-013, 6.87272240e-014, 8.25821828e-015, 1.24000331e-015,
       3.85082206e-001, 3.24776460e-017, 6.86623085e-018, 1.39684110e-018,
       3.07008216e-019, 7.41771402e-020, 1.68363028e-020, 3.25917564e-021,
       6.38092274e-022, 1.14132464e-022, 1.68201502e-023, 1.38668466e-024,
       6.12454042e-026, 1.65962631e-027, 1.21608716e-029, 2.78492794e-032,
       1.33586951e-035, 4.08853797e-040, 2.38293644e-045, 9.12712795e-052,
       8.89446913e-060, 2.20828067e-069, 7.21044078e-081, 4.12650185e-095,
       6.26468351e-114, 3.46849858e-139, 3.58524903e-173, 6.33425712e-221,
       9.11503179e-284, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0

In [123]:
mean = ploop.categorical.mean.repeat(101, 1).numpy()
print(mean.sum())

fig = figure(x_range=(0, 101), y_range=(0, 101))
fig.image(image=[mean], x=0, y=0, dh=101, dw=101)
show(fig)

100.96655


100.96655

In [73]:
x = torch.tensor([1, 2, 3])
x.repeat(4, 2)

tensor([[1, 2, 3, 1, 2, 3],
        [1, 2, 3, 1, 2, 3],
        [1, 2, 3, 1, 2, 3],
        [1, 2, 3, 1, 2, 3]])