# VAE - Gaussian Linear Classifier

This notebook illustrates how to combine a Variational AutoEncoder (VAE) and a Gaussian Linear Classifier (GLC) with the [beer framework](https://github.com/beer-asr/beer).

In [1]:
%load_ext autoreload
%autoreload 2

# Add the path of the beer source code to the PYTHONPATH.
from collections import defaultdict
import random
import sys
sys.path.insert(0, '../')

import math
import yaml
import numpy as np
import torch
import torch.optim
from torch import nn



# For plotting.
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, gridplot
from bokeh.models import LinearAxis, Range1d

# Beer framework
import beer

# Convenience functions for plotting.
import plotting

output_notebook(verbose=False)

## Data 

As a simple example we consider the following synthetic data: 

In [2]:
def generate_data(npoints=100):
    """Draw a shuffled 2-D data set made of two Gaussian clusters.

    The first cluster is centred on (0, -2) with a positive correlation
    (0.75) between the two dimensions, the second one is centred on
    (0, 2) with a negative correlation (-0.75). When ``npoints`` is odd
    the extra point goes to the second cluster.

    Args:
        npoints: Total number of points to generate.

    Returns:
        A ``(npoints, 2)`` numpy array whose rows are in random order.
    """
    n_first = npoints // 2
    n_second = npoints - n_first

    # The draws are made in a fixed order (first cluster, second cluster,
    # permutation) so that a given numpy seed always yields the same data.
    lower = np.random.multivariate_normal(
        np.array([0, -2]), np.array([[1, .75], [.75, 1]]), size=n_first)
    upper = np.random.multivariate_normal(
        np.array([0, 2]), np.array([[1, -.75], [-.75, 1]]), size=n_second)

    order = np.arange(0, npoints)
    np.random.shuffle(order)
    return np.vstack([lower, upper])[order]

# Two independent draws: one training set and one test set.
data, test_data = generate_data(), generate_data()

# Convert the numpy arrays to float torch tensors.
X, test_X = (torch.from_numpy(points).float() for points in (data, test_data))

# Per-dimension mean/variance of the training set, used to initialize
# the models.
global_mean = X.mean(dim=0)
global_var = X.var(dim=0)

In [3]:
# Axis limits shared by the scatter plots below (and reused by the
# density plots of the later cells).
x_range = (-5, 5)
y_range = (-5, 5)

# Left panel: training points.
fig1 = figure(title='Training data', width=400, height=400,
              x_range=x_range, y_range=y_range)
fig1.circle(data[:, 0], data[:, 1])

# Right panel: test points.
fig2 = figure(title='Test data', width=400, height=400,
              x_range=x_range, y_range=y_range)
fig2.circle(test_data[:, 0], test_data[:, 1])

show(gridplot([[fig1, fig2]]))

## Model Creation

We first create the VAE-GLC.

#### NOTE:
To obtain a Gaussian Quadratic Classifier, use a GMM model with individual (diagonal) covariance matrices.

In [4]:
def create_vae(mean, var, nnet_width=50, nflow_width=20, nflow_depth=0, 
               nflow_block_depth=2, nflow_params_dim=10, latent_space_dim=2, 
               p_strength=1.):
    """Assemble a VAE whose observation model is N(μ + f(z), σ²).

    Args:
        mean: Initial mean of the observation-space normal. Its length
            defines the dimension of the observation space.
        var: Initial variance given to ``beer.Normal.create`` along with
            ``mean``.
        nnet_width: Number of units of every hidden layer of the encoder
            and decoder networks.
        nflow_width: ``width`` of each autoregressive network of the flow.
        nflow_depth: Number of autoregressive networks chained in the
            flow. The default (0) builds no flow step.
        nflow_block_depth: ``depth`` of each autoregressive network.
        nflow_params_dim: ``flow_params_dim`` given to the autoregressive
            networks and to the inverse autoregressive flow.
        latent_space_dim: Dimension of the latent space.
        p_strength: Value forwarded to ``beer.Normal.create`` when
            building the observation-space normal (see the note in the
            body).

    Returns:
        The ``beer.VAEGlobalMeanVariance`` model built from the encoder,
        the flow layer, the decoder, the observation-space normal and
        the latent prior.
    """
    obs_space_dim = len(mean)

    # Normal prior over the latent space.
    prior = beer.Normal.create(mean=torch.zeros(latent_space_dim), 
                               cov=torch.ones(latent_space_dim),
                               cov_type='full')

    # Encoder network.
    encoder = torch.nn.Sequential(
        torch.nn.Linear(obs_space_dim, nnet_width),
        torch.nn.ELU(),
        torch.nn.Linear(nnet_width, nnet_width),
        torch.nn.ELU()
    )

    # Normalizing flow (1): initial distribution.
    normal_layer = beer.nnet.NormalIsotropicCovarianceLayer(nnet_width, 
                                                            latent_space_dim)

    # Normalizing flow (2): sequence of autoregressive networks. The number
    # of networks is driven by `nflow_depth`; it used to be hard-coded to 0,
    # which made the parameter a no-op.
    nnet_flow = [
        beer.nnet.AutoRegressiveNetwork(
            dim_in=latent_space_dim,
            flow_params_dim=nflow_params_dim,
            depth=nflow_block_depth,
            width=nflow_width,
            activation=torch.nn.ELU()
        )
        for _ in range(nflow_depth)
    ]

    # Normalizing flow (3): assemble the initial distribution and the
    #                       autoregressive networks.
    encoder_problayer = beer.nnet.InverseAutoRegressiveFlow(
        dim_in=nnet_width,
        flow_params_dim=nflow_params_dim,
        normal_layer=normal_layer,
        nnet_flow=nnet_flow
    )

    # Decoder network.
    decoder = torch.nn.Sequential(
        torch.nn.Linear(latent_space_dim, nnet_width),
        torch.nn.Tanh(),
        torch.nn.Linear(nnet_width, nnet_width),
        torch.nn.Tanh(),
        torch.nn.Linear(nnet_width, obs_space_dim)
    )

    # Normal distribution embedding the auto-encoder:
    #   N(μ + f(z), σ²)
    # Note that the variance does not depend on the latent space as it
    # usually does with a standard variational auto-encoder.
    # NOTE(review): `p_strength` replaces a literal 1. in the third
    # positional slot, which is presumably the prior strength — confirm
    # against `beer.Normal.create`. With the default value the call is
    # unchanged.
    normal_iso = beer.Normal.create(mean, var, p_strength, cov_type='isotropic')

    # Construct the VAE from all the parts.
    model = beer.VAEGlobalMeanVariance(encoder, encoder_problayer, decoder, 
                                       normal_iso, prior)
    return model

    
def create_gaussian_classifier(ntargets, mean, var, p_strength=1., 
                               shared_cov=True):
    """Create a mixture model over a set of ``ntargets`` normal densities.

    Args:
        ntargets: Number of normal components in the set.
        mean: Mean passed to ``beer.NormalSet.create``.
        var: Variance passed to ``beer.NormalSet.create``.
        p_strength: Forwarded as ``prior_strength``.
        shared_cov: Forwarded as ``shared_cov``.

    Returns:
        The object returned by ``beer.Mixture.create`` for the normal set.
    """
    components = beer.NormalSet.create(
        mean,
        var,
        size=ntargets,
        cov_type='full',
        shared_cov=shared_cov,
        prior_strength=p_strength,
        noise_std=0,
    )
    return beer.Mixture.create(components)

### 1. Pre-training

In [5]:
def train_cvb(model, X, epochs=1, nbatches=1, lrate_nnet=1e-3,
              update_prior=True, update_nnet=True, kl_weight=1., state=None,
              callback=None):
    """Train ``model`` by maximizing the collapsed evidence lower bound.

    Args:
        model: Model to train. It must provide ``bayesian_parameters()``,
            ``modules_parameters()`` and a ``normal`` attribute.
        X: Training tensor; it is split into ``nbatches`` equally sized
            batches along its first dimension.
        epochs: Number of passes over all the batches.
        nbatches: Number of batches. The number of rows of ``X`` must be
            divisible by it.
        lrate_nnet: Learning rate of the Adam optimizer.
        update_prior: If True optimize all the Bayesian parameters of the
            model, otherwise only those of ``model.normal``.
        update_nnet: If True hand ``model.modules_parameters()`` to Adam.
        kl_weight: Passed as ``kl_weight`` to the objective function.
        state: Value returned by a previous call, used to resume with the
            same optimizer and per-batch statistics.
        callback: Optional callable invoked (without argument) at the end
            of every epoch.

    Returns:
        A tuple ``(optimizer, batch_stats)`` that can be passed back as
        ``state``.
    """
    # Use the actual feature dimension rather than a hard-coded 2 so that
    # the function is not restricted to 2-dimensional data.
    batches = X.view(nbatches, -1, X.shape[-1])

    prior_parameters = model.bayesian_parameters() if update_prior else model.normal.bayesian_parameters()
    # NOTE(review): with update_nnet=False an empty parameter list is
    # handed to Adam, which torch rejects with a ValueError — confirm how
    # this case is meant to be handled.
    nnet_parameters = model.modules_parameters() if update_nnet else range(0)

    if state is None:
        std_optimizer = torch.optim.Adam(nnet_parameters, lr=lrate_nnet, 
                                         weight_decay=1e-2)
        optimizer = beer.CVBOptimizer(prior_parameters, std_optim=std_optimizer)
        # Statistics of each batch, None until the batch has been seen once.
        batch_stats = defaultdict(lambda: defaultdict(lambda: None))
    else:
        optimizer, batch_stats = state

    # The objective options do not change during training.
    kwargs = {'kl_weight': kl_weight}

    for epoch in range(epochs):
        # Randomize the order of the batches.
        batch_ids = list(range(len(batches)))
        random.shuffle(batch_ids)

        for batch_id in batch_ids:
            # The optimizer is given what `backward` returned the last
            # time this batch was processed.
            optimizer.init_step(batch_stats[batch_id])
            elbo = beer.collapsed_evidence_lower_bound(model, batches[batch_id], 
                                                       **kwargs)
            batch_stats[batch_id] = elbo.backward()
            optimizer.step()

        if callback is not None:
            callback()

    return (optimizer, batch_stats)


def train_svb(model, X, epochs=1, nbatches=1, lrate_nnet=1e-3,
              lrate_prior=1e-1, update_prior=True, update_nnet=True, 
              kl_weight=1., state=None, callback=None):
    """Train ``model`` by maximizing the evidence lower bound per batch.

    Args:
        model: Model to train. It must provide ``mean_field_groups``,
            ``modules_parameters()`` and a ``normal`` attribute.
        X: Training tensor; it is split into ``nbatches`` equally sized
            batches along its first dimension.
        epochs: Number of passes over all the batches.
        nbatches: Number of batches. The number of rows of ``X`` must be
            divisible by it.
        lrate_nnet: Learning rate of the Adam optimizer.
        lrate_prior: ``lrate`` given to ``beer.BayesianModelOptimizer``.
        update_prior: If True use the mean-field groups of the whole
            model, otherwise only those of ``model.normal``.
        update_nnet: If True hand ``model.modules_parameters()`` to Adam.
        kl_weight: Passed as ``kl_weight`` to the objective function.
        state: Optimizer returned by a previous call, used to resume
            training.
        callback: Optional callable invoked (without argument) at the end
            of every epoch.

    Returns:
        The optimizer, which can be passed back as ``state``.
    """
    # Use the actual feature dimension rather than a hard-coded 2 so that
    # the function is not restricted to 2-dimensional data.
    batches = X.view(nbatches, -1, X.shape[-1])

    mf_groups = model.mean_field_groups if update_prior else model.normal.mean_field_groups
    # NOTE(review): with update_nnet=False an empty parameter list is
    # handed to Adam, which torch rejects with a ValueError — confirm how
    # this case is meant to be handled.
    nnet_parameters = model.modules_parameters() if update_nnet else range(0)

    if state is None:
        std_optimizer = torch.optim.Adam(nnet_parameters, lr=lrate_nnet, 
                                         weight_decay=1e-2)
        optimizer = beer.BayesianModelOptimizer(mf_groups, lrate=lrate_prior, 
                                                std_optim=std_optimizer)
    else:
        optimizer = state

    # The objective options do not change during training; `datasize` is
    # the size of the full training set, not of a single batch.
    kwargs = {'kl_weight': kl_weight, 'datasize': len(X)}

    for epoch in range(epochs):
        # Randomize the order of the batches.
        batch_ids = list(range(len(batches)))
        random.shuffle(batch_ids)
        for batch_id in batch_ids:
            optimizer.init_step()
            elbo = beer.evidence_lower_bound(model, batches[batch_id], 
                                             **kwargs)
            elbo.backward()
            optimizer.step()

        if callback is not None:
            callback()

    return optimizer

In [6]:
def plot_latent_space(fig, model, X, use_mean=True):
    """Scatter-plot the latent representation of ``X`` on ``fig``.

    The data is passed through ``model.encoder`` and
    ``model.encoder_problayer``; the points drawn are the samples returned
    by ``model.encoder_problayer.samples_and_llh``.

    Args:
        fig: Figure exposing a ``circle(x, y)`` method.
        model: Model providing ``encoder`` and ``encoder_problayer``.
        X: Input tensor to encode.
        use_mean: Forwarded to ``samples_and_llh``.
    """
    # Use the `model` argument: the function previously read the global
    # `vae` and silently ignored the model it was given.
    enc_states = model.encoder(X)
    post_params = model.encoder_problayer(enc_states)
    samples, _ = model.encoder_problayer.samples_and_llh(post_params,
                                                         use_mean=use_mean)
    samples = samples.data.numpy()
    # Only the first two latent dimensions are displayed.
    fig.circle(samples[:, 0], samples[:, 1])
    
def plot_density(fig, model, x_range, y_range, nsamples=10, resolution=100):
    """Draw the density given by ``model`` over a rectangular grid.

    ``model.marginal_log_likelihood`` is evaluated ``nsamples`` times on
    every point of the grid; the log-likelihoods are averaged over the
    evaluations and exponentiated before being displayed as an image.

    Args:
        fig: Figure exposing an ``image(image, x, y, dw, dh)`` method.
        model: Model providing ``marginal_log_likelihood(xy, use_mean)``.
        x_range: ``(min, max)`` of the grid along the first dimension.
        y_range: ``(min, max)`` of the grid along the second dimension.
        nsamples: Number of evaluations of the marginal log-likelihood.
        resolution: Number of grid points along each dimension.
    """
    steps = complex(0, resolution)
    grid = np.mgrid[x_range[0]:x_range[1]:steps, y_range[0]:y_range[1]:steps]
    xy = torch.from_numpy(grid.reshape(2, -1).T).float()

    mllhs = [model.marginal_log_likelihood(xy, use_mean=False).view(-1, 1)
             for _ in range(nsamples)]
    mllhs = torch.cat(mllhs, dim=-1).mean(dim=-1)
    mlhs = np.exp(mllhs.detach().numpy().reshape(resolution, resolution))

    # The image covers exactly the requested ranges. The previous formula
    # (`2 * (max - min / 100)`) relied on operator precedence and was only
    # approximately right for ranges symmetric around 0.
    width = x_range[1] - x_range[0]
    height = y_range[1] - y_range[0]
    # The first axis of `mlhs` follows x; it is transposed so that the
    # rows of the image follow y.
    fig.image(image=[mlhs.T], x=x_range[0], y=y_range[0], dw=width, dh=height)

In [7]:
vae = create_vae(global_mean, global_var)

# Values monitored after each epoch of the SVB training, all of them
# normalized by the number of points of the data set they were computed on.
svb_elbos = []       # ELBO, training set
svb_elbos2 = []      # collapsed ELBO, training set
svb_elbos_test = []  # collapsed ELBO, test set


def log_pred():
    """Append the current per-point objectives of ``vae`` to the SVB logs."""
    train_elbo = beer.evidence_lower_bound(vae, X)
    svb_elbos.append(float(train_elbo) / len(X))

    train_col_elbo = beer.collapsed_evidence_lower_bound(vae, X)
    svb_elbos2.append(float(train_col_elbo) / len(X))

    test_col_elbo = beer.collapsed_evidence_lower_bound(vae, test_X)
    svb_elbos_test.append(float(test_col_elbo) / len(test_X))


# Train the VAE with the SVB procedure; `log_pred` runs after every epoch.
state = train_svb(vae, X, epochs=2000, nbatches=10, callback=log_pred,
                  update_prior=True)

# Panel 1: evolution of the ELBO during the training.
fig1 = figure(width=300, height=300)
fig1.line(range(len(svb_elbos)), svb_elbos, legend='ELBO')
fig1.legend.location = 'bottom_right'

# Panel 2: latent model and encoded training data.
fig2 = figure(width=300, height=300, x_range=(-7, 7), y_range=(-7, 7))
mean = vae.latent_model.mean
cov = vae.latent_model.cov
plotting.plot_normal(fig2, mean.numpy(), cov.numpy(), alpha=.1)
plot_latent_space(fig2, vae, X)

# Panel 3: density of the model in the observation space.
fig3 = figure(width=300, height=300, x_range=x_range, y_range=y_range)
plot_density(fig3, vae, x_range, y_range, nsamples=100)

show(gridplot([[fig1, fig2, fig3]]))

In [8]:
vae = create_vae(global_mean, global_var)

# Values monitored after each epoch of the CVB training, all of them
# normalized by the number of points of the data set they were computed on.
cvb_elbos_test = []  # collapsed ELBO, test set
cvb_elbos = []       # ELBO, training set
cvb_elbos2 = []      # collapsed ELBO, training set
def log_pred():
    """Append the current per-point objectives of ``vae`` to the CVB logs."""
    # Objectives computed on the training set are normalized by the size
    # of the training set (they were previously divided by len(test_X)).
    elbo = beer.evidence_lower_bound(vae, X)
    cvb_elbos.append(float(elbo) / len(X))
    elbo = beer.collapsed_evidence_lower_bound(vae, X)
    cvb_elbos2.append(float(elbo) / len(X))
    elbo = beer.collapsed_evidence_lower_bound(vae, test_X)
    cvb_elbos_test.append(float(elbo) / len(test_X))

# Train the VAE with the CVB procedure; `log_pred` runs after every epoch.
state = train_cvb(vae, X, epochs=2000, nbatches=10, callback=log_pred, update_prior=True)

# Panel 1: evolution of the ELBO during the training.
fig1 = figure(width=300, height=300)
fig1.line(range(len(cvb_elbos)), cvb_elbos, legend='ELBO')
fig1.legend.location = 'bottom_right'

# Panel 2: latent model and encoded training data.
fig2 = figure(width=300, height=300, x_range=(-7, 7), y_range=(-7, 7))
mean, cov = vae.latent_model.mean, vae.latent_model.cov
plotting.plot_normal(fig2, mean.numpy(), cov.numpy(),alpha=.1)
plot_latent_space(fig2, vae, X, use_mean=True)

# Panel 3: density of the model in the observation space.
fig3 = figure(width=300, height=300, x_range=x_range, y_range=y_range)
plot_density(fig3, vae, x_range, y_range, nsamples=100)

show(gridplot([[fig1, fig2, fig3]]))

In [9]:
# Side-by-side comparison of the SVB and CVB training runs.

# Each curve is drawn against the indices of its own list (the ELBO curves
# previously borrowed the length of the collapsed-ELBO lists).
fig1 = figure(title='ELBO (train set)', width=300, height=300, y_range=(-5, 1))
fig1.line(range(len(svb_elbos)), svb_elbos, color='blue', legend='SVB')
fig1.line(range(len(cvb_elbos)), cvb_elbos, color='green', legend='CVB')
fig1.legend.location = 'bottom_right'

fig2 = figure(title='Col. ELBO (train set)', width=300, height=300, y_range=(-5, 1))
fig2.line(range(len(svb_elbos2)), svb_elbos2, color='blue', legend='SVB')
fig2.line(range(len(cvb_elbos2)), cvb_elbos2, color='green', legend='CVB')
fig2.legend.location = 'bottom_right'

fig3 = figure(title='log pred. (test set)', width=300, height=300, y_range=(-5, 1))
fig3.line(range(len(svb_elbos_test)), svb_elbos_test, color='blue', legend='SVB')
fig3.line(range(len(cvb_elbos_test)), cvb_elbos_test, color='green', legend='CVB')
fig3.legend.location = 'bottom_right'

show(gridplot([[fig1, fig2, fig3]]))