In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

import copy
import numpy as np
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from torch import optim, nn
from torch.autograd import Variable
import torch

from plotting import plot_model_outputs

import beer

output_notebook()

In [2]:
mean = np.array([3., 2.])
cov = np.array([[2., 0.], [0., .2]])
data = np.random.multivariate_normal(mean, cov, size=100)
data[:, 1] = data[:, 1] + (data[:, 0]-mean[0])**2

mean = data.mean(axis=0)
var = data.var(axis=0)

data = (data - mean) / np.sqrt(var)

fig = figure(
    title='Non-Linear subspace',
    width=400,
    height=400,
)
fig.circle(data[:, 0], data[:, 1])
#x = np.linspace(-1, 7, 1000)
#fig.line(x, (x-3)**2+2, color='red')
show(fig)

In [272]:
obs_dim = 2
latent_dim = 2
nb_samples = 10
nonlinearity = nn.ReLU

enc_struct = nn.Sequential(
    nn.Linear(obs_dim, 10),
    nonlinearity(),
    nn.Linear(10, 10),
    nonlinearity()
)
encoder = beer.models.MLPNormalDiag(enc_struct, 10, obs_dim)

dec_struct = nn.Sequential(
    nn.Linear(latent_dim, 10),
    nonlinearity(),
    nn.Linear(10, 10),
    nonlinearity()
)
decoder = beer.models.MLPNormalIso(dec_struct, 10, obs_dim)

#latent_model = beer.models.NormalDiagonalCovariance.create(latent_dim)
latent_model = beer.models.NaturalIsotropicGaussian(2)

model = beer.models.VAE(encoder, decoder, latent_model, nb_samples)
#model.encoder.hid_to_logvar.bias = nn.Parameter(torch.ones(2) * -5)
#model.decoder.hid_to_logvar.bias = nn.Parameter(torch.ones(1) * -5)

In [270]:
#model.encoder.hid_to_logvar.weight.requires_grad = False
#model.encoder.hid_to_logvar.weight.requires_grad = False

optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
history = beer.inference.History(report_interval=250)

In [271]:
model.sample = True
beer.inference.run_training(data, model, optimizer, 100, history, batch_size=20, lrate_latent_model=0.0, kl_weight=0.0)

In [273]:
plot_model_outputs(model, data)

In [274]:
history.plot()

## Minimum Divergence 

The objective function of the Variational Bayes (VB) training is:

$$
\ln p(x) \ge \big\langle \ln P(x | z) \big\rangle_{q(z|x)} - D \big( q(z|x) || p(z) \big) = \mathcal{F}
$$

Standard VB aims to find the distribution $q(z)$ that maximizes $\mathcal{F}$. Alternately, we could hold $q(z)$ fixed and maximize $\mathcal{F}$ w.r.t. $p(z)$. As $p(z)$ occurs only in the digevergence in the RHS term, this would lead to minimize the divergence (hence *minimum-divergence*) between the posterior and the prior. For the VAE model, the prior $p(z)$ is defined as:

$$
p(z) = \mathcal{N}(z | m, \Sigma)
$$

where $m$ and $\Sigma$ are usually set to be  $0$ and the identity matrix respectively. A naive application of the *minimum-divergence* training would be to alternate between a standard VB steps (minimizing $\mathcal{F}$ w.r.t. $q(z|x)$) and the minimum-divergence step (minimizing $\mathcal{F}$ w.r.t. $p(z)$) that is, to learn the "ideal" $m$ and $\Sigma$. This solution is however not optimal because if $\Sigma$ is a full covariance matrix instead of a diagional one, the latent space will be correlated. This shortocoming can be avoided by noting that the VAE model is *overparameterized* (see [here](https://sites.google.com/site/nikobrummer/EMandMINDIV.pdf?attredirects=0)) for details), i.e. if $(\theta, m, \Sigma)$ are the parameters of the VAE, then we can find $(\theta', m', \Sigma')$ such that:

$$
\int_{\mathcal{X}} p_{\theta}(x|z)p_{m, \Sigma}(z) dz = p(x) = \int_{\mathcal{X}} p_{\theta'}(x|z)p_{m', \Sigma'}(z) dz
$$

If $p(x|z)$ has a feed-forward neural network structure, then beginning of the generative process goes as follows:

$$
\begin{align}
    z &\sim \mathcal{N}(m, \Sigma) \\
    h^{(0)} &= f(W^{(0)}z + b^{(0)}) \\
    \dots
\end{align}
$$

where $h^{(0)}$ is the activation of the first layer of the "decoder". If we set $\Sigma = LL^T$, we can define an equivalent generative process but assuming a standard normal as prior:

$$
\begin{align}
    z' &\sim \mathcal{N}(0, I) \\
    h^{(0)\prime} &= f( \underbrace{W^{(0)}L^{-1}}_{W^{(0)\prime}}z' + \underbrace{W^{(0)}m +  b^{(0)}}_{b^{(0)\prime}}) \\
    \dots
\end{align}
$$

### Minimum-Divergence step

$\DeclareMathOperator*{\argmax}{arg\,max}$

The minimum-divergence update of the VAE goes as follows:
  1. Solve $m^*, \Sigma^* = \argmax_{m, \Sigma} D\big(q(z|x) || p(z) \big)$ 
  2. Using $m^*$ and $\Sigma^*$, "normalize" the VAE neural network structue so that the marginal distribution $\int_{\mathcal{X}} p(z|x)p(x)dx = p(z)$ has zero mean and identity covariance matrix.
  
#### Finding $m^*$ and $\Sigma^*$

Assuming that $p(z)$ and $q(z|x_n)$ are from the exponential family and have the same parametric form then the problem $m^*, \Sigma^* = \argmax_{m, \Sigma} \sum_{n} D\big(q(z|x_n) || p(z) \big)$ reduces to find the parameters $m^*$ and $
Sigma^*$ such that $\langle T(z) \rangle_{p(z)} = \sum_n \langle T(z) \rangle_{q(z|x_n)}$. This is easily seen when considering the canonical parameterization of the prior and posterior:

$$
\begin{align}
    \mathcal{L} = \sum_n^N D\big(q(z|x_n) || p(z) \big) &= \sum_n^N (\eta_{q_n} - \eta_p) \big\langle T(z) \big\rangle_{q(z|x_n)} + A(\eta_p) - A(\eta_{q_n}) \\
    \nabla_{\eta_p} \mathcal{L} &= N\nabla_{\eta_p} A(\eta_p) - \sum_n^N \big\langle T(z) \big\rangle_{q(z|x_n)} \\
    \nabla_{\eta_p} \mathcal{L} = 0 &\implies \big\langle T(z) \big\rangle_{p(z)} = \frac{1}{N}\sum_n^N \big\langle T(z) \big\rangle_{q(z|x_n)}
\end{align}
$$

Assuming we have a standard VAE (i.e. neural-network):

$$
\begin{align}
    m^* &= \frac{1}{N} \sum_n^N \mu_{q_n} \\
    \Sigma^* &= \frac{1}{N} \sum_n^N \Sigma_{q_n} + \frac{1}{N} \sum_n (m_{q_n} - m^*)(m_{q_n} - m^*)^T
\end{align}
$$

In [311]:
#model = copy.deepcopy(model)
model = copy.deepcopy(new_model)

In [312]:
# Forward the data through the networks.
state = model(Variable(torch.from_numpy(data)).float())
means = state['encoder_state']['means'].data.numpy()
variances = np.exp(state['encoder_state']['logvars'].data.numpy())

# Estimate m^* and Sigma^*
m = np.mean(means, axis=0) 
m_cov = np.cov(means.T, bias=True)
S = np.diag(np.mean(variances, axis=0)) + m_cov

print(m)
print(m_cov)
print(S)

[-0.00099547  0.04510246]
[[ 1.01786978 -0.02843759]
 [-0.02843759  0.00166731]]
[[ 1.03720571 -0.02843759]
 [-0.02843759  1.00019364]]


In [313]:
print(m_cov.shape)
evals, evecs = np.linalg.eigh(m_cov)
L = evecs * evals[None, :]
inv_L = np.linalg.inv(L)
L @ inv_L

(2, 2)


array([[  1.00000000e+00,  -3.46944695e-18],
       [  3.46944695e-18,   1.00000000e+00]])

#### Parameter normalization of the VAE

Once we found the optimal prior $p(z)$, using the fact that the VAE is an *overparameterized* model, we can transform the parameters of the model such that the prior $p(z)$ is again the standard normal distribution. We shall call this step the "normalization step". 

From the previous step we have estimated the optimal mean $m^*$ and the optimal covariance matrix $\Sigma^*$ from the variational posterior distribution $q(z|x)$ (the encoder). The first step of the normalization is to change the encoder so that the marginal distribution of $q(z|x)$ will be the standard normal distribution. This can be achieve as follows:

$$
\begin{split}
    m_{q_n} &= UA h_n + U(a - m^*) \\
    \Sigma_{q_n} &= B h_n + b
\end{split}
$$

In [319]:
new_model = copy.deepcopy(model)
#new_model = model

balance = 1

L = np.linalg.cholesky(m_cov)
inv_L = np.linalg.inv(L)
L_S = np.linalg.cholesky(S)
inv_L_S = np.linalg.inv(L_S)


#evals, evecs = np.linalg.eigh(m_cov)
#inv_L = evecs.T

W, b = model.encoder.hid_to_mu.weight.data.numpy(), \
    model.encoder.hid_to_mu.bias.data.numpy()
new_W = torch.from_numpy(inv_L_S @ W).float()
new_b = torch.from_numpy(inv_L_S @ (b - m)).float()
new_model.encoder.hid_to_mu.weight = nn.Parameter(new_W)
new_model.encoder.hid_to_mu.bias = nn.Parameter(new_b)

#b = model.encoder.hid_to_logvar.bias.data.numpy()
new_b = torch.from_numpy(b - np.log(balance * variances.mean(axis=0)))
new_model.encoder.hid_to_logvar.bias = nn.Parameter(new_b)

# Update the decoder
W, b = model.decoder.structure[0].weight.data.numpy(), \
    model.decoder.structure[0].bias.data.numpy()
new_W = torch.from_numpy( W @ L_S).float()
new_b = torch.from_numpy( W @ m + b ).float()
new_model.decoder.structure[0].weight = nn.Parameter(new_W)
new_model.decoder.structure[0].bias = nn.Parameter(new_b)

In [320]:
model.sample = True
new_model.sample = True
state = model(Variable(torch.from_numpy(data)).float())
loss, llh, kl = model.loss(Variable(torch.from_numpy(data).float()), state, 1.0)
print('elbo:', -loss.data.numpy(), 'llh:', llh.data.numpy(), 'kl:', kl.data.numpy())

state = new_model(Variable(torch.from_numpy(data)).float())
loss, llh, kl = new_model.loss(Variable(torch.from_numpy(data).float()), state, 1.0)
print('elbo:', -loss.data.numpy(), 'llh:', llh.data.numpy(), 'kl:', kl.data.numpy())

model_means = state['encoder_state']['means'].data.numpy()
model_variances = np.exp(state['encoder_state']['logvars'].data.numpy())
model_mean = np.mean(model_means, axis=0)
model_cov = np.cov(model_means.T, bias=True)
S = np.diag(np.mean(model_variances, axis=0)) + model_cov
print(model_mean)
print(model_cov)
print(model_variances.mean(axis=0))
print('S:')
print(S)

model.sample = True
new_model.sample = True

elbo: [-1.26946032] llh: [ 136.0098114] kl: [ 262.95584106]
elbo: [-21.58164597] llh: [-2018.44177246] kl: [ 139.72268677]
[ -1.41859061e-07   7.34254684e-08]
[[ 0.94906799 -0.01110913]
 [-0.01110913  0.00099488]]
[ 0.27275997  1.00867772]
S:
[[ 1.22182797 -0.01110913]
 [-0.01110913  1.0096726 ]]


In [321]:
plot_model_outputs(new_model, data)

In [284]:
optimizer = optim.Adam(new_model.parameters(), lr=1e-3, weight_decay=1e-6)
history = beer.inference.History(report_interval=250)

In [308]:
new_model.sample = True
beer.inference.run_training(data, new_model, optimizer, 1000, history, batch_size=20, lrate_latent_model=0.0, kl_weight=1.0)

Epoch: 12250 	elbo: -6.779588 llh: 106.090651 kld: 241.682407
Epoch: 12500 	elbo: -6.410572 llh: 138.262352 kld: 266.473797
Epoch: 12750 	elbo: -6.184027 llh: 140.978592 kld: 264.659138
Epoch: 13000 	elbo: -6.288585 llh: 136.510427 kld: 262.282135


In [309]:
plot_model_outputs(new_model, data)

In [310]:
history.plot()

In [143]:
exp = model.latent_model.posterior.grad_lognorm()
print(1/ (exp[0] * -2))

AttributeError: 'NaturalIsotropicGaussian' object has no attribute 'posterior'