# Discriminative Variational Model

This notebook illustrate how to build and train a Discriminative Variational Model (SVAE) with the [beer framework](https://github.com/beer-asr/beer).

In [1]:
%load_ext autoreload
%autoreload 2

# Add the path of the beer source code ot the PYTHONPATH.
import sys
sys.path.insert(0, '../')

import numpy as np
import torch
import torch.optim
from torch import nn
from torch.autograd import Variable


# For plotting.
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, gridplot
from bokeh.models import LinearAxis, Range1d

# Beer framework
import beer

# Convenience functions for plotting.
import plotting

output_notebook(verbose=False)

## Data 

As a simple example we consider the following synthetic data: 

$$ 
\begin{split}
    z &\sim \mathcal{N}(m, \Sigma) \\
    x &= 
        \begin{pmatrix}
        z_1 \\
        z_2 + (z_1 - m_1)^2
        \end{pmatrix} 
\end{split}
$$

In [2]:
ntargets = 5
N = 100
Xs = []
labels = []
for i in range(ntargets):
    mean = np.array([3., 2. - (i * 1.5)])
    cov = np.array([[.75, 0.], [0., .075]])
    Z1 = np.random.multivariate_normal(mean, cov, size=N)
    X1 = np.zeros_like(Z1)
    X1[:, 0] = Z1[:, 0]
    X1[:, 1] = Z1[:, 1] + (Z1[:, 0]-mean[0])** 2
    labels.append(np.ones(len(X1)) * i)
    Xs.append(X1)

idxs = np.arange(0, ntargets * N)
np.random.shuffle(idxs)
data = np.vstack(Xs)[idxs]
labels = np.hstack(labels)[idxs]

test_data = data[-100:]
test_labels = labels[-100:]
data = data[:-100]
labels = labels[:-100]


fig = figure(title='Synthetic data', width=400, height=400)
colors = ['salmon', 'blue', 'green', 'yellow', 'black', 'red', 'cyan', 'purple', 'brown', 'pink']
for sX, color in zip(Xs, colors):
    fig.circle(sX[:, 0], sX[:, 1], color=color)
show(fig)

In [3]:
labels.shape, data.shape

((400,), (400, 2))

## Model Creation

We first create the DVM.

In [6]:
# Dimension of the observed space.
obs_dim = data.shape[1]

# Dimension of the latent space. It can be bigger or smaller
# than the dimension of the observed space.
latent_dim = 2

# Number of units per hidden-layer.
n_units = 10

# Neural network structure of the encoder/decoder of the model.
enc_struct = nn.Sequential(
    nn.Linear(obs_dim, n_units),
    nn.Tanh(),
)
encoder = beer.MLPNormalDiag(enc_struct, latent_dim)
dec_struct = nn.Sequential(
    nn.Linear(obs_dim, n_units),
    nn.Tanh(),
)
decoder = beer.MLPNormalDiag(dec_struct, obs_dim)

# Model of the latent space.
# We use Mixture of normal with diagonal cov. It can be changed
# to other model.
# ----------------------------------------------------------------------

# GMM (diag cov).
prior_weights = beer.DirichletPrior(torch.ones(ntargets))
posterior_weights = beer.DirichletPrior(torch.ones(ntargets))
prior = beer.NormalGammaPrior(torch.zeros(latent_dim), torch.ones(latent_dim), 1.)
posts = [beer.NormalGammaPrior(torch.zeros(latent_dim) + torch.randn(latent_dim), torch.ones(latent_dim), 1.)
         for _ in range(ntargets)]
normalset = beer.NormalDiagonalCovarianceSet(prior, posts)
gmm_diag = beer.Mixture(prior_weights, posterior_weights, normalset)

# ----------------------------------------------------------------------

# Putting everything together to build the SVAE.
model = beer.VAE(encoder, decoder, gmm_diag, nsamples=15)

## Variational Bayes Training

In [7]:
epochs = 10000
lrate_bayesmodel = 1e-1
lrate_encoder = 0
X = torch.from_numpy(data).float()
targets = torch.from_numpy(labels).long()
loss_fn = beer.StochasticVariationalBayesLoss(len(X))

nnet_parameters = list(model.encoder.parameters()) + list(model.decoder.parameters())
std_optimizer = torch.optim.Adam(nnet_parameters, lr=lrate_encoder)
params = model.latent_model.parameters
optimizer = beer.BayesianModelOptimizer(params, lrate_bayesmodel, 
    std_optim=std_optimizer)
    
elbos = []
for epoch in range(epochs):
    optimizer.zero_grad()
    loss = loss_fn(model, Variable(X), Variable(targets))
    loss.scale(1./len(X))
    loss.backward_natural_grad()
    loss.backward()
    optimizer.step()
    
    if epoch > 0:
        elbos.append(float(loss) / len(X))

# Plot the ELBO.
fig = figure(title='ELBO', width=400, height=400, x_axis_label='step',
              y_axis_label='ln p(X)')
fig.line(np.arange(len(elbos)), elbos, color='blue')

show(fig)

In [8]:
d = 200
mean, var = model.evaluate(X[:d])
mean, var = mean.data.numpy(), var.data.numpy()
    
fig1 = figure(width=400, height=400)    
#for l, m in zip(labels[:d], mean):
#    fig1.circle(m[0], m[1], color=colors[int(l)])
for l, m, v in zip(labels[:d], mean, var):
    fig1.ellipse(x=m[0], y=m[1], 
                 width=2 * np.sqrt(v[0]), 
                 height=2 * np.sqrt(v[1]), 
                fill_alpha=0, color=colors[int(l)]) 

#for color, comp in zip(colors, model.latent_model.components):
#    plotting.plot_latent_model(fig1, comp, alpha=.5, color=color)

fig2 = figure(title='Data', width=400, height=400)
colors = ['salmon', 'blue', 'green', 'yellow', 'black', 'red', 'cyan', 'purple', 'brown', 'pink']
for sX, color in zip(Xs, colors):
    fig2.circle(sX[:, 0], sX[:, 1], color=color)

noise = torch.randn(10, 2) + torch.FloatTensor([3, 10.])
mean, var = model.evaluate(noise)
mean, var = mean.data.numpy(), var.data.numpy()
#for l, m, v in zip(labels[:d], mean, var):
#    fig1.ellipse(x=m[0], y=m[1], 
#                 width=2 * np.sqrt(v[0]), 
#                 height=2 * np.sqrt(v[1]), 
#                fill_alpha=0, color='grey', line_dash='dashed') 
#fig2.triangle(noise.numpy()[:, 0], noise.numpy()[:, 1], color='grey')

noise = torch.randn(10, 2) + torch.FloatTensor([3, -10.])
mean, var = model.evaluate(noise)
mean, var = mean.data.numpy(), var.data.numpy()
for l, m, v in zip(labels[:d], mean, var):
    fig1.ellipse(x=m[0], y=m[1], 
                 width=2 * np.sqrt(v[0]), 
                 height=2 * np.sqrt(v[1]), 
                fill_alpha=0, color='grey', line_dash='dashed') 
fig2.triangle(noise.numpy()[:, 0], noise.numpy()[:, 1], color='grey')

grid = gridplot([[fig1, fig2]])
show(grid)

  elif np.issubdtype(type(obj), np.float):


In [9]:
np.pi/2

1.5707963267948966

Let's see what the VAE has learnt.

In [10]:
from torch.autograd import Variable
import torch.utils.data
import torch.optim

def create_simple_dvm(indim, latent_dim, nunits, ntargets):
    enc_struct = nn.Sequential(
        nn.Linear(indim, latent_dim),
        nn.Tanh(),
        nn.Linear(indim, latent_dim)
    )
    encoder = beer.MLPNormalDiag(enc_struct, latent_dim)

    args = {
        'prior_mean': torch.zeros(latent_dim), 
        'prior_cov': torch.eye(latent_dim), 
        'prior_count': 1, 'random_init': True
    }
    latent_model = beer.Mixture.create(torch.ones(ntargets), beer.NormalDiagonalCovariance.create, args)
    #latent_model = beer.Mixture.create(torch.ones(ntargets), beer.NormalFullCovariance.create, args)
    
    return beer.DiscriminativeVariationalModel(encoder, latent_model)

def create_simple_mlp(indim, latent_dim, n_units, ntargets):
    return nn.Sequential(
        nn.Linear(indim, n_units),
        nn.Tanh(),
        nn.Linear(n_units, ntargets),
    )

def mlp_error_rate(model, features, labels):
    loss_fn = nn.CrossEntropyLoss()
    outputs = model(features)
    _, predicted = torch.max(outputs, dim=1)
    hits = (labels == predicted).float().sum()
    return (1 - hits / labels.size(0)).data

def dvm_error_rate(model, features, labels):
    loss_fn = nn.CrossEntropyLoss()
    T = model.sufficient_statistics(features)
    outputs = model.bayesian_model.log_predictions(T[1])
    _, predicted = torch.max(outputs, dim=1)
    hits = (labels == predicted).float().sum()
    return (1 - hits / labels.size(0)).data

def train_mlp(mlp, X, Z, epochs=1, lrate=1e-3, callback=None):
    dataset = torch.utils.data.TensorDataset(X, Z)
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=len(X), shuffle=True)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=lrate)
    for epoch in range(epochs):
        for i, data in enumerate(trainloader):
            inputs, labels = Variable(data[0]), Variable(data[1])
            optimizer.zero_grad()
            outputs = mlp(inputs)
            loss = loss_fn(outputs, labels)

            loss.backward()
            optimizer.step()
            if callback is not None:
                callback(float(loss / len(outputs)))

In [11]:
# Callback to monitor the training progress.
xents = []
def callback(xent):
    xents.append(xent)

mlp = create_simple_mlp(2, n_units, latent_dim, ntargets)
train_mlp(
    mlp, X, targets,
    epochs=15000, callback=callback)


# Plot the ELBO.
fig1 = figure(title='ELBO', width=400, height=400, x_axis_label='step',
              y_axis_label='ln p(X)')
fig1.line(np.arange(len(xents)), xents)

show(fig1)

In [12]:
d = 200
outputs = mlp[:-1](X)
print(outputs.size(), targets.size())
fig1 = figure(width=400, height=400)    
for l, o in zip(targets.data.numpy(), outputs.data.numpy()):
    fig1.circle(o[0], o[1], color=colors[int(l)], alpha=.1)

fig2 = figure(title='Data', width=400, height=400)
colors = ['salmon', 'blue', 'green', 'yellow', 'black', 'red', 'cyan', 'purple', 'brown', 'pink']
for sX, color in zip(Xs, colors):
    fig2.circle(sX[:, 0], sX[:, 1], color=color)

noise = torch.randn(10, 2) + torch.FloatTensor([3, 10.])
fig2.triangle(noise.numpy()[:, 0], noise.numpy()[:, 1], color='grey')
noise = mlp[:-1](noise).data
fig1.triangle(noise.numpy()[:, 0], noise.numpy()[:, 1], color='grey')

noise = torch.randn(10, 2) + torch.FloatTensor([3, -10.])
fig2.triangle(noise.numpy()[:, 0], noise.numpy()[:, 1], color='grey')
noise = mlp[:-1](noise).data
fig1.triangle(noise.numpy()[:, 0], noise.numpy()[:, 1], color='grey')

grid = gridplot([[fig1, fig2]])
show(grid)

torch.Size([400, 2]) torch.Size([400])


  elif np.issubdtype(type(obj), np.float):


In [13]:
test_X, test_targets = torch.from_numpy(test_data).float(), torch.from_numpy(test_labels).long()
test_er = mlp_error_rate(mlp, test_X, test_targets).data.numpy(), \
    dvm_error_rate(model, test_X, test_targets).data.numpy()
train_er = mlp_error_rate(mlp, X, targets).data.numpy(), \
    dvm_error_rate(model, X, targets).data.numpy()

print(train_er)
print(test_er)

AttributeError: 'VAE' object has no attribute 'bayesian_model'

In [None]:
mlp_ers = []
dvm_ers = []
ndatapoints = [1, 2, 5, 10]

for i in ndatapoints:
    print(i)
    sX, sZ = torch.from_numpy(X[:i]).float(), \
    torch.from_numpy(labels[:i]).long()
    test_sX, test_sZ = torch.from_numpy(test_X).float(), \
    torch.from_numpy(test_labels).long()
    
    print(sZ)
    
    mlp = create_simple_mlp(2, 2, 10, 10)
    train_mlp(mlp, sX, sZ, epochs=10000)

    dvm = create_simple_dvm(2, 2, 10, 10)
    beer.train_dvm(dvm, sX, sZ, max_epochs=10000, lrate=1e-3, latent_model_lrate=1e-1, kl_weight=1)
    
    mlp_ers.append(mlp_error_rate(mlp, test_sX, test_sZ).data.numpy())
    dvm_ers.append(dvm_error_rate(dvm, test_sX, test_sZ).data.numpy())

In [None]:
fig = figure(width=400, height=400)
fig.line(ndatapoints, mlp_ers, color='blue', legend='MLP')
fig.line(ndatapoints, dvm_ers, color='green', legend='DVM')
fig.legend.location = 'top_right'
show(fig)

In [None]:
dvm.latent_model.components[1].count

In [None]:
sZ