# VAE - Gaussian Linear Classifier

This notebook illustrates how to combine a Variational AutoEncoder (VAE) and a Gaussian Linear Classifier (GLC) with the [beer framework](https://github.com/beer-asr/beer).

In [22]:
# Reload edited modules automatically so changes to the sources are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Make the beer source code importable: prepend the parent directory to the
# module search path (sys.path).
import sys
sys.path.insert(0, '../')

import math
import yaml
import numpy as np
import torch
import torch.optim
from torch import nn



# For plotting.
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, gridplot
from bokeh.models import LinearAxis, Range1d

# Beer framework
import beer

# Convenience functions for plotting.
import plotting

# Send Bokeh figures to the notebook output.
output_notebook(verbose=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data 

As a simple example we consider the following synthetic data: 

In [23]:
# Synthetic 2-D data set: `ntargets` curved clusters of `N` points each.
# Seed NumPy's global generator so that "Restart & Run All" regenerates the
# exact same points.
np.random.seed(0)

ntargets = 5
N = 100
Xs = []
labels = []
for i in range(ntargets):
    # Clusters share the covariance and are stacked along the second axis.
    mean = np.array([0, 2. - (i * 1.5)])
    cov = np.array([[.75, 0.], [0., .075]])
    Z1 = np.random.multivariate_normal(mean, cov, size=N)

    # Bend the Gaussian cloud: the second coordinate is shifted by the square
    # of the centred first coordinate.
    X1 = np.zeros_like(Z1)
    X1[:, 0] = Z1[:, 0]
    X1[:, 1] = Z1[:, 1] + (Z1[:, 0] - mean[0]) ** 2

    # The first half of each cluster gets its class index, the remainder gets
    # -1 (presumably "unlabelled" for beer -- confirm). The second count is
    # derived from the first so that there is exactly one label per point
    # even when N is odd.
    n_labelled = len(X1) // 2
    labels.append(np.ones(n_labelled) * i)
    labels.append(np.ones(len(X1) - n_labelled) * -1)
    Xs.append(X1)

# Points are kept in class order (identity index, no shuffling).
idxs = np.arange(0, ntargets * N)
data = np.vstack(Xs)[idxs]
labels = np.hstack(labels)[idxs]
assert len(labels) == len(data), 'expected one label per data point'

# One colour per cluster.
fig = figure(title='Synthetic data', width=400, height=400)
colors = ['salmon', 'blue', 'green', 'orange', 'black', 'red', 'cyan', 'purple', 'brown', 'pink']
for sX, color in zip(Xs, colors):
    fig.circle(sX[:, 0], sX[:, 1], color=color)
show(fig)

## Model Creation

We first create the VAE-GLC.

#### NOTE:
To obtain a Gaussian Quadratic Classifier, use a GMM in which each component has its own (diagonal) covariance matrix.

In [24]:
# YAML description of the model. The `<feadim>` placeholders are replaced by
# the data dimension before the text is parsed and handed to
# `beer.create_model`.
# The `latent_model` section previously started with a stray, duplicated
# `latent_model:` line, which added a meaningless null-valued key to the
# mapping; it has been removed.
conf_str = '''
type: NonLinearSubspaceModel
normal_model:
  type: Normal
  covariance: isotropic
  prior_strength: 1.
  noise_std: 0.
normalizing_flow:
  type: InverseAutoRegressive
  depth: 5
  iaf_block:
    activation: Softplus
    context_dim: 10
    data_dim: 2
    depth: 2
    width: 20
encoder:
  nnet_structure:
  - block_structure:
    - Linear:in_features=<feadim>;out_features=20
    - Softplus
    - Linear:in_features=20;out_features=20
    - Softplus
  prob_layer:
    type: NormalizingFlowLayer
    covariance: isotropic
    flow_params_dim: 10
    dim_in: 20
    dim_out: 2
decoder:
  nnet_structure:
  - block_structure:
    - Linear:in_features=2;out_features=20
    - Softplus
    - Linear:in_features=20;out_features=20
    - Softplus
  prob_layer:
    type: NormalLayer
    covariance: isotropic
    dim_in: 20
    dim_out: <feadim>
latent_model:
  type: Mixture
  prior_strength: 1.
  components:
    type: PLDASet
    size: 5
    dim_noise_subspace: 1
    dim_class_subspace: 1
    prior_strength: 1.
    noise_std: 1.
'''

# Alternative `latent_model` section: a mixture whose components are Normals
# with individual (non-shared) diagonal covariances. It is not referenced by
# the cells visible in this notebook.
tmp = '''latent_model:
  type: Mixture
  prior_strength: 1.
  components:
    type: NormalSet
    size: 5
    covariance: diagonal
    shared_covariance: false
    prior_strength: 1.
    noise_std: 1.
'''

In [25]:
# Per-dimension mean and variance of the data, forwarded to the model builder.
data_mean = torch.from_numpy(data.mean(axis=0)).float()
data_var = torch.from_numpy(np.var(data, axis=0)).float()

# The feature dimension is only known once the data exists: substitute the
# `<feadim>` placeholder before parsing the configuration.
conf_data = conf_str.replace('<feadim>', str(len(data_mean)))

# `yaml.load` without an explicit Loader is deprecated since PyYAML 5.1 and
# raises a TypeError from PyYAML 6 on. `safe_load` parses this plain
# configuration (mappings, lists and scalars only) to the same result.
conf = yaml.safe_load(conf_data)
model = beer.create_model(conf, data_mean, data_var)

## Variational Bayes Training

### 1. Pre-training

In [26]:
# Settings of the pre-training stage.
epochs = 1_000
lrate_bayesmodel = 1.
lrate_encoder = 1e-3
npoints = N * ntargets

# Inputs as float tensors, labels as integer tensors.
X = torch.from_numpy(data[:npoints]).float()
targets = torch.from_numpy(labels[:npoints]).long()

# Encoder, decoder and flow weights are optimised with Adam; that optimiser
# is passed to the beer optimizer as `std_optim`.
nnet_parameters = [
    *model.encoder.parameters(),
    *model.decoder.parameters(),
    *model.nflow.parameters(),
]
std_optimizer = torch.optim.Adam(
    nnet_parameters,
    lr=lrate_encoder,
    weight_decay=1e-2,
)
optimizer = beer.BayesianModelCoordinateAscentOptimizer(
    model.mean_field_groups,
    lrate=lrate_bayesmodel,
    std_optim=std_optimizer,
)

elbos = []
for epoch in range(epochs):
    optimizer.zero_grad()
    # The KL term is switched off (kl_weight=0.) during pre-training.
    elbo = beer.evidence_lower_bound(
        model, X, datasize=len(X), labels=targets, kl_weight=0.
    )
    elbo.backward()
    elbo.natural_backward()
    optimizer.step()

    # The value of the very first step is left out of the curve.
    if epoch != 0:
        elbos.append(float(elbo) / len(X))

# Plot the per-point ELBO recorded above.
fig = figure(
    title='ELBO',
    width=400,
    height=400,
    x_axis_label='step',
    y_axis_label='ln p(X)',
)
fig.line(np.arange(len(elbos)), elbos, color='blue')

show(fig)

### 2. Training

In [None]:
# Settings of the main training stage: smaller learning rate for the Bayesian
# parameters and the full KL term (kl_weight=1.).
epochs = 2_000
lrate_bayesmodel = .1
lrate_encoder = 1e-3
npoints = N * ntargets

# Inputs as float tensors, labels as integer tensors.
X = torch.from_numpy(data[:npoints]).float()
targets = torch.from_numpy(labels[:npoints]).long()

# Fresh optimisers for this stage: Adam over the encoder, decoder and flow
# weights, handed to the beer optimizer as `std_optim`.
nnet_parameters = [
    *model.encoder.parameters(),
    *model.decoder.parameters(),
    *model.nflow.parameters(),
]
std_optimizer = torch.optim.Adam(
    nnet_parameters,
    lr=lrate_encoder,
    weight_decay=1e-2,
)
optimizer = beer.BayesianModelCoordinateAscentOptimizer(
    model.mean_field_groups,
    lrate=lrate_bayesmodel,
    std_optim=std_optimizer,
)

elbos = []
for epoch in range(epochs):
    optimizer.zero_grad()
    elbo = beer.evidence_lower_bound(
        model, X, labels=targets, datasize=len(X), kl_weight=1.
    )
    elbo.backward()
    elbo.natural_backward()
    optimizer.step()

    # The value of the very first step is left out of the curve.
    if epoch != 0:
        elbos.append(float(elbo) / len(X))

# The ELBO curve collected in `elbos` is drawn in the following cell.

In [None]:
# Draw the per-point ELBO values stored in `elbos`, one per recorded step.
fig = figure(
    title='ELBO',
    width=400,
    height=400,
    x_axis_label='step',
    y_axis_label='ln p(X)',
)
fig.line(np.arange(len(elbos)), elbos, color='blue')

show(fig)

In [36]:
# Visual check of the trained model.
# fig1: each class of the data (circles) overlaid with its reconstruction
#       (crosses).
# fig2: the corresponding points produced by the normalizing flow.
fig1 = figure(width=400, height=400)
fig2 = figure(width=400, height=400)
for class_X, color in zip(Xs, colors):
    class_X = torch.from_numpy(class_X).float()
    # The encoder outputs feed the flow. `use_mean=True` presumably makes the
    # flow start from the mean instead of a random draw -- TODO confirm
    # against beer.
    mean, variance, flow_params = model.encoder(class_X)
    _, samples = model.nflow(mean, variance, flow_params, use_mean=True)
    # Only the first decoder output is used; `model.normal.mean` is then
    # added to it in place.
    r_class_X, _ = model.decoder(samples)
    r_class_X += model.normal.mean
    # Convert everything to NumPy arrays for Bokeh.
    samples = samples.data.numpy()
    class_X, r_class_X = class_X.detach().numpy(), r_class_X.detach().numpy()
    fig1.circle(class_X[:, 0], class_X[:, 1], alpha=.5, color=color)
    fig1.cross(r_class_X[:, 0], r_class_X[:, 1], color=color)
    fig2.circle(samples[:, 0], samples[:, 1], color=color)

# Helpers from the local `plotting` module: presumably they overlay the
# model's normal distribution on fig1 and the latent mixture on fig2 --
# verify against plotting.py.
plotting.plot_normal(fig1, model.normal.mean.numpy(), 
                     model.normal.cov.numpy(), alpha=.5)
plotting.plot_gmm(fig2, model.latent_model, colors=colors, alpha=.5, color='blue')
# Show both figures side by side.
#show(fig1)
show(gridplot([[fig1, fig2]]))