# VAE - Gaussian Linear Classifier

This notebook illustrates how to combine a Variational AutoEncoder (VAE) and a Gaussian Linear Classifier (GLC) with the [beer framework](https://github.com/beer-asr/beer).

In [None]:
%load_ext autoreload
%autoreload 2

# Add the path of the beer source code to the PYTHONPATH.
import sys
sys.path.insert(0, '../')

import math
import yaml
import numpy as np
import torch
import torch.optim
from torch import nn



# For plotting.
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, gridplot
from bokeh.models import LinearAxis, Range1d

# Beer framework
import beer

# Convenience functions for plotting.
import plotting

# Route Bokeh output to the notebook so that the figures passed to `show(...)`
# are rendered inline, below the cell that created them.
output_notebook(verbose=False)

## Data 

As a simple example we consider the following synthetic data: 

In [None]:
# Synthetic data: `ntargets` classes of `N` two-dimensional points each.
# Every class is drawn from a Gaussian with a diagonal covariance matrix and
# is then bent into a parabola-shaped cloud.
ntargets = 5
N = 100

# Dedicated, seeded generator: "Restart & Run All" always rebuilds the same
# data set, and NumPy's global random state is left untouched.
DATA_SEED = 0
data_rng = np.random.RandomState(DATA_SEED)

# The covariance matrix is the same for all the classes.
cov = np.array([[.75, 0.], [0., .075]])

Xs = []
labels = []
for i in range(ntargets):
    # The class means only differ in their second coordinate (1.5 apart).
    mean = np.array([0, 2. - (i * 1.5)])
    Z1 = data_rng.multivariate_normal(mean, cov, size=N)

    # Keep the first coordinate as is and add the square of the (centred)
    # first coordinate to the second one.
    X1 = np.zeros_like(Z1)
    X1[:, 0] = Z1[:, 0]
    X1[:, 1] = Z1[:, 1] + (Z1[:, 0] - mean[0]) ** 2

    # One (floating point) class index per point.
    labels.append(np.ones(len(X1)) * i)
    Xs.append(X1)

# Identity ordering: the points stay grouped by class (no shuffling).
idxs = np.arange(0, ntargets * N)
data = np.vstack(Xs)[idxs]
labels = np.hstack(labels)[idxs]


#test_data = data[-100:]
#test_labels = labels[-100:]
#data = data[:-100]
#labels = labels[:-100]


# Scatter plot of the synthetic data, one colour per class. The palette is
# longer than the number of classes; `zip` stops at the shorter sequence.
colors = [
    'salmon', 'blue', 'green', 'orange', 'black',
    'red', 'cyan', 'purple', 'brown', 'pink',
]
fig = figure(title='Synthetic data', width=400, height=400)
for color, class_points in zip(colors, Xs):
    first_coord = class_points[:, 0]
    second_coord = class_points[:, 1]
    fig.circle(first_coord, second_coord, color=color)
show(fig)

## Model Creation

We first create the VAE-GLC.

#### NOTE:
To obtain a Gaussian Quadratic Classifier, use a GMM model with individual (diagonal) covariance matrices.

In [None]:
# Sizes shared by the different parts of the model. They are named once here
# so that the layers which have to agree on a size cannot drift apart.
latent_space_dim = 2        # dimension of the latent space
data_dim = data.shape[1]    # dimension of the observations (2 for the synthetic data)
hidden_dim = 50             # width of the encoder/decoder hidden layers
flow_params_dim = 10        # given to the flow and to each of its networks
nflow_steps = 5             # number of networks in the flow

# Latent model: mixture over a set of `ntargets` Normal distributions with a
# shared full covariance matrix.
modelset = beer.NormalSet.create(
    torch.zeros(latent_space_dim),
    torch.ones(latent_space_dim),
    size=ntargets,
    prior_strength=1.,
    noise_std=0.,
    cov_type='full',
    shared_cov=True
)
prior_model = beer.Mixture.create(modelset)
# Alternative latent model (single Normal distribution):
#prior_model = beer.Normal.create(
#    torch.zeros(latent_space_dim),
#    torch.ones(latent_space_dim),
#    cov_type='full',
#)

# Encoder: maps an observation to a `hidden_dim`-dimensional state.
encoder = torch.nn.Sequential(
    torch.nn.Linear(data_dim, hidden_dim),
    torch.nn.Tanh(),
    torch.nn.Linear(hidden_dim, hidden_dim),
    torch.nn.Tanh(),
)

# Probabilistic layer on top of the encoder: a Normal layer followed by
# `nflow_steps` auto-regressive networks.
nnet_flow = [
    beer.nnet.AutoRegressiveNetwork(
        dim_in=latent_space_dim,
        flow_params_dim=flow_params_dim,
        depth=2,
        width=20,
        activation=torch.nn.Softplus
    )
    for _ in range(nflow_steps)
]
normal_layer = beer.nnet.NormalIsotropicCovarianceLayer(hidden_dim, latent_space_dim)
encoder_problayer = beer.nnet.InverseAutoRegressiveFlow(
    dim_in=hidden_dim,
    flow_params_dim=flow_params_dim,
    normal_layer=normal_layer,
    nnet_flow=nnet_flow
)

# Decoder: maps a latent point back to the observation space.
decoder = torch.nn.Sequential(
    torch.nn.Linear(latent_space_dim, hidden_dim),
    torch.nn.Tanh(),
    torch.nn.Linear(hidden_dim, hidden_dim),
    torch.nn.Tanh(),
    torch.nn.Linear(hidden_dim, data_dim)
)

# Isotropic Normal model created from the empirical mean and variance of the
# data; it is handed to the VAE together with the decoder.
data_mean = torch.from_numpy(data.mean(axis=0)).float()
data_var = torch.from_numpy(data.var(axis=0)).float()
normal_iso = beer.Normal.create(data_mean, data_var, 1., cov_type='isotropic')

model = beer.VAEGlobalMeanVariance(
    encoder, encoder_problayer,
    decoder, normal_iso,
    prior_model
)

### 1. Training

In [None]:
# Training configuration.
npoints = N * ntargets
epochs = 5_000
lrate_bayesmodel = 1e-2
lrate_encoder = 1e-3

# Training points and their class labels as torch tensors.
X = torch.from_numpy(data[:npoints]).float()
targets = torch.from_numpy(labels[:npoints]).long()

# The neural network parameters (encoder, probabilistic layer, decoder) are
# handled by Adam; the optimizer of the Bayesian model wraps it.
nnet_parameters = [
    param
    for module in (model.encoder, model.encoder_problayer, model.decoder)
    for param in module.parameters()
]
std_optimizer = torch.optim.Adam(
    nnet_parameters,
    lr=lrate_encoder,
    weight_decay=1e-2
)
optimizer = beer.BayesianModelCoordinateAscentOptimizer(
    model.mean_field_groups,
    lrate=lrate_bayesmodel,
    std_optim=std_optimizer
)

num_points = len(X)
elbos = []
for epoch in range(epochs):
    optimizer.zero_grad()
    elbo = beer.evidence_lower_bound(model, X, labels=targets, datasize=num_points)
    # NOTE(review): presumably `backward` yields the gradients of the neural
    # network parameters and `natural_backward` those of the Bayesian
    # parameters -- confirm against the beer documentation.
    elbo.backward()
    elbo.natural_backward()
    optimizer.step()

    # The value of the very first epoch is not recorded.
    if epoch == 0:
        continue
    elbos.append(float(elbo) / num_points)

# Plot the ELBO.
#fig = figure(title='ELBO', width=400, height=400, x_axis_label='step',
#              y_axis_label='ln p(X)')
#fig.line(np.arange(len(elbos)), elbos, color='blue')

#show(fig)

In [None]:
# Training curve: the values stored in `elbos` against their position in
# the list.
fig = figure(
    title='ELBO',
    width=400,
    height=400,
    x_axis_label='step',
    y_axis_label='ln p(X)',
)
steps = np.arange(len(elbos))
fig.line(steps, elbos, color='blue')

show(fig)

In [None]:
# Pass every class through the encoder and plot the resulting latent points
# (one colour per class) together with the latent model.
fig = figure(title='Latent space', width=400, height=400)
for class_X, color in zip(Xs, colors):
    # Separate name for the tensor: `class_X` keeps referring to the NumPy array.
    class_tensor = torch.from_numpy(class_X).float()
    enc_states = model.encoder(class_tensor)
    post_params = model.encoder_problayer(enc_states)
    # NOTE(review): `use_mean=True` presumably returns the mean of the
    # posterior rather than a random draw -- confirm against beer.
    samples, _ = model.encoder_problayer.samples_and_llh(post_params, use_mean=True)
    # `detach()` is the supported way to leave the autograd graph before
    # converting to NumPy (the `.data` attribute is a legacy interface).
    samples = samples.detach().numpy()
    fig.circle(samples[:, 0], samples[:, 1], color=color)
plotting.plot_gmm(fig, model.latent_model, colors=colors, alpha=.5, color='blue')
show(fig)