# Variational AutoEncoder

This notebook illustrates how to build and train a Variational AutoEncoder with the [beer framework](https://github.com/beer-asr/beer).

In [1]:
# Add "beer" to the PYTHONPATH
import sys
sys.path.insert(0, '../')

import copy
import math

import beer
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# For plotting.
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, gridplot
from bokeh.models import LinearAxis, Range1d
output_notebook()

# Convenience functions for plotting.
import plotting

%load_ext autoreload
%autoreload 2

## Data
As an illustration, we generate a synthetic data set composed of two normally distributed clusters.
Both clusters have a dense (non-diagonal) covariance matrix: the first has a negative correlation between its two dimensions whereas the second has a positive one.
Those two clusters overlap so it is reasonable to map all the data to a single Gaussian in the latent space.

In [2]:
# Fix the NumPy seed so that the synthetic data set is identical every time
# the notebook is run top to bottom from a fresh kernel.
SEED = 0
np.random.seed(SEED)

# First cluster.
mean = np.array([-3, 3])
cov = np.array([[1, -1], [-1, 2.]])
data1 = np.random.multivariate_normal(mean, cov, size=100)

# Second cluster.
mean = np.array([3, 2.5])
cov = np.array([[2, 1], [1, .75]])
data2 = np.random.multivariate_normal(mean, cov, size=100)

# Stack both clusters into a single (200, 2) array ...
data = np.vstack([data1, data2])

# ... and shuffle its rows in place so the clusters are interleaved.
np.random.shuffle(data)

In [3]:
# Per-dimension mean and variance of the data; they define a common square
# viewport that is reused by the figures below.
mean = data.mean(axis=0)
var = data.var(axis=0)

# Use the largest per-dimension spread so both axes get the same extent:
# three standard deviations on each side of the mean.
std_dev = np.sqrt(max(var))
half_width = 3 * std_dev
x_range = (mean[0] - half_width, mean[0] + half_width)
y_range = (mean[1] - half_width, mean[1] + half_width)

# Single interval covering both the x and the y range.
lower = min(x_range[0], y_range[0])
upper = max(x_range[1], y_range[1])
global_range = (lower, upper)

# Scatter plot of the training data.
fig = figure(
    title='Data',
    width=400,
    height=400,
    x_range=global_range,
    y_range=global_range,
)
fig.circle(data[:, 0], data[:, 1])

show(fig)

In [4]:
# Held-out data: 100 new samples from each of the two Gaussians, using the
# same (mean, covariance) pairs as the training set.
cluster_params = [
    (np.array([-3, 3]), np.array([[1, -1], [-1, 2.]])),    # first cluster
    (np.array([3, 2.5]), np.array([[2, 1], [1, .75]])),    # second cluster
]

samples = []
for mean, cov in cluster_params:
    samples.append(np.random.multivariate_normal(mean, cov, size=100))
test_data1, test_data2 = samples

# Unlike the training set, the test set is left unshuffled.
test_data = np.vstack(samples)

# Scatter plot of the test data on the same viewport as the training data.
fig = figure(
    title='Data',
    width=400,
    height=400,
    x_range=global_range,
    y_range=global_range,
)
fig.circle(test_data[:, 0], test_data[:, 1])

show(fig)

In [5]:
# Convert the NumPy arrays to single-precision torch tensors.
X = torch.from_numpy(data).to(torch.float32)
test_X = torch.from_numpy(test_data).to(torch.float32)

In [6]:
# Dimension of the observed data and of the latent space (both 2 here).
observed_dim, latent_dim = 2, 2

In [7]:
# Number of units of the single hidden layer of the networks below.
hidden_dim = 50

class GaussianMLP(nn.Module):
    """One-hidden-layer MLP producing a mean and a variance per output dimension.

    Args:
        in_dim: Size of the input features.
        out_dim: Size of each of the two outputs (mean and variance).
        hidden_dim: Number of units of the tanh hidden layer.
    """

    def __init__(self, in_dim, out_dim, hidden_dim):
        super().__init__()
        self.i2h = nn.Linear(in_dim, hidden_dim)
        self.h2mean = nn.Linear(hidden_dim, out_dim)
        self.h2logvar = nn.Linear(hidden_dim, out_dim)

    def forward(self, X):
        """Return the tuple ``(mean, variance)`` computed from ``X``.

        The network predicts the log-variance and exponentiates it, so the
        returned variance is always strictly positive.
        """
        # torch.tanh replaces the deprecated F.tanh alias (same result).
        h = torch.tanh(self.i2h(X))
        mean = self.h2mean(h)
        logvar = self.h2logvar(h)
        return mean, logvar.exp()
    
class GaussianMLP2(nn.Module):
    """One-hidden-layer MLP producing only a mean (no variance output).

    Args:
        in_dim: Size of the input features.
        out_dim: Size of the output mean.
        hidden_dim: Number of units of the tanh hidden layer.
    """

    def __init__(self, in_dim, out_dim, hidden_dim):
        super().__init__()
        self.i2h = nn.Linear(in_dim, hidden_dim)
        self.h2mean = nn.Linear(hidden_dim, out_dim)

    def forward(self, X):
        """Return the mean computed from ``X``."""
        # torch.tanh replaces the deprecated F.tanh alias (same result).
        h = torch.tanh(self.i2h(X))
        mean = self.h2mean(h)
        return mean

In [1]:
def _make_latent_normal(dim):
    """Create a beer diagonal-covariance Normal from a zero and a ones vector.

    NOTE(review): presumably the two arguments are the mean and the diagonal
    of the covariance, i.e. a standard Normal -- confirm against beer.
    """
    return beer.NormalDiagonalCovariance.create(torch.zeros(dim), torch.ones(dim))


# --- First model: both networks output a mean and a variance. ---
latent_normal1 = _make_latent_normal(latent_dim)
# Presumably (encoder, decoder) in that order; the training cell accesses
# them as `.encoder` / `.decoder`.
nnet_obs2latent1 = GaussianMLP(observed_dim, latent_dim, hidden_dim)
nnet_latent2obs1 = GaussianMLP(latent_dim, observed_dim, hidden_dim)
vae1 = beer.VAE(nnet_obs2latent1, nnet_latent2obs1, latent_normal1)

# --- Second model: the latent-to-observed network outputs a mean only; the
# model is additionally given the per-dimension mean and variance of the
# training data. ---
latent_normal2 = _make_latent_normal(latent_dim)
data_mean = torch.from_numpy(data.mean(axis=0)).float()
data_var = torch.from_numpy(data.var(axis=0)).float()
nnet_obs2latent2 = GaussianMLP(observed_dim, latent_dim, hidden_dim)
nnet_latent2obs2 = GaussianMLP2(latent_dim, observed_dim, hidden_dim)
vae2 = beer.VAEGlobalMeanVar.create(
    data_mean,
    data_var,
    nnet_obs2latent2,
    nnet_latent2obs2,
    latent_normal2,
    pseudo_counts=1
)

# Both models are trained side by side in the next cell.
models = [vae1, vae2]

NameError: name 'beer' is not defined

In [9]:
# Training configuration.
epochs = 20_000
lrate = 1.          # learning rate passed to the beer optimizer
lrate_nnet = 1e-3   # learning rate of the Adam optimizer (network weights)

# Number of samples to estimate the expectation of the log-likelihood.
nsamples = 5

# A single Adam optimizer updates the network weights of both VAEs.
nnet_parameters = list(vae1.encoder.parameters()) + list(vae1.decoder.parameters())
nnet_parameters += list(vae2.encoder.parameters()) + list(vae2.decoder.parameters())
# Use the configured rate instead of a second hard-coded 1e-3.
nnet_optim = torch.optim.Adam(nnet_parameters, lr=lrate_nnet)

# The beer optimizer is given the parameters of both models and wraps the
# Adam optimizer above.
params = vae2.parameters + vae1.parameters
optim = beer.BayesianModelOptimizer(params, lrate=lrate, std_optim=nnet_optim)

# elbos[k] collects the per-data-point objective of models[k] at every epoch.
elbos = [[] for _ in models]
for epoch in range(epochs):
    # Distinct index names: the inner loop used to reuse the epoch index `i`.
    for model_idx, model in enumerate(models):
        optim.zero_grad()
        # NOTE(review): kl_weight=0. presumably switches off the KL term of
        # the objective -- confirm this is intended.
        elbo = beer.evidence_lower_bound(model, X, datasize=len(X),
                                         nsamples=nsamples, kl_weight=0.)
        elbo.backward()
        elbo.natural_backward()
        optim.step()
        elbos[model_idx].append(float(elbo) / len(X))


# Training curves of both models.
fig = figure(title='ELBO', width=400, height=400, x_axis_label='step',
              y_axis_label='ln p(X)')
fig.line(np.arange(len(elbos[0])), elbos[0], legend='ELBO (VAE 1)', color='blue')
fig.line(np.arange(len(elbos[1])), elbos[1], legend='ELBO (VAE 2)', color='red')
fig.legend.location = 'bottom_right'

show(fig)

In [10]:
# Evaluate vae1 on a regular square grid spanning the plotting range.
resolution = 0.1
xy = np.mgrid[global_range[0]:global_range[1]:resolution,
              global_range[0]:global_range[1]:resolution].reshape(2, -1).T
xy = torch.from_numpy(xy).float()
# Both axes use the same range and step, so the grid is square and its side
# is the square root of the number of points. (`math` is imported with the
# other dependencies at the top of the notebook.)
single_dim_nb_points = int(math.sqrt(xy.shape[0]))

# For a smooth plot increase the number of samples.
elbos = vae1(xy, nsamples=200) - vae1.local_kl_div_posterior_prior() - \
    vae1.kl_div_posterior_prior()

# Rows of `xy` enumerate the first coordinate slowest, so after `view` the
# first coordinate runs along rows; the transpose puts the second coordinate
# along rows instead (presumably the layout fig.image expects).
p_x_mtx = elbos.view(single_dim_nb_points, single_dim_nb_points).t().exp()
# detach() instead of the legacy `.data` attribute before converting to NumPy.
p_x_mtx = p_x_mtx.detach().numpy()
fig = figure(title='p(X)', width=400, height=400,
             x_range=global_range, y_range=global_range)

plane_size = global_range[1] - global_range[0]
fig.image(image=[p_x_mtx], x=global_range[0], y=global_range[0], dw=plane_size, dh=plane_size)

show(fig)

In [11]:
# Evaluate vae2 on a regular square grid spanning the plotting range.
resolution = 0.1
xy = np.mgrid[global_range[0]:global_range[1]:resolution,
              global_range[0]:global_range[1]:resolution].reshape(2, -1).T
xy = torch.from_numpy(xy).float()
# Both axes use the same range and step, so the grid is square and its side
# is the square root of the number of points. (`math` is imported with the
# other dependencies at the top of the notebook.)
single_dim_nb_points = int(math.sqrt(xy.shape[0]))

# For a smooth plot increase the number of samples.
elbos = vae2(xy, nsamples=200) - vae2.local_kl_div_posterior_prior() - \
    vae2.kl_div_posterior_prior()

# Rows of `xy` enumerate the first coordinate slowest, so after `view` the
# first coordinate runs along rows; the transpose puts the second coordinate
# along rows instead (presumably the layout fig.image expects).
p_x_mtx = elbos.view(single_dim_nb_points, single_dim_nb_points).t().exp()
# detach() instead of the legacy `.data` attribute before converting to NumPy.
p_x_mtx = p_x_mtx.detach().numpy()
fig = figure(title='p(X)', width=400, height=400,
             x_range=global_range, y_range=global_range)
plane_size = global_range[1] - global_range[0]
fig.image(image=[p_x_mtx], x=global_range[0], y=global_range[0], dw=plane_size, dh=plane_size)


show(fig)