In [1]:
import pyro
import torch
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

def model():
    """Placeholder model for the SVI skeleton; it declares no sample sites."""
    return None

def guide():
    """Placeholder guide for the SVI skeleton; it declares no sample sites."""
    return None

# Adam hyperparameters: learning rate and the two moment-decay coefficients.
adam_params = dict(lr=0.005, betas=(0.95, 0.999))
optim = Adam(adam_params)

# Tie the model, guide, optimizer and ELBO loss together in one SVI object.
svi = SVI(model=model, guide=guide, optim=optim, loss=Trace_ELBO())

The block above is a skeleton showing the basic Pyro SVI setup. What follows is a simple example of how to use Pyro to infer the fairness of a coin from observed flips.

# SVI part I: define the model and guide

In [2]:
import pyro.distributions as dist

def model(data):
    """Beta-Bernoulli model of a coin's fairness.

    Samples the latent fairness ``f`` from a symmetric Beta(10, 10) prior and
    conditions one Bernoulli(f) sample site per element of ``data``.

    Args:
        data: indexable sequence of observed flips; the complete example later
            in this notebook encodes heads as 1.0 and tails as 0.0.
    """
    # Hyperparameters of the Beta prior. Both are 10.0 so that the prior is
    # symmetric around 0.5, matching the complete example in this notebook.
    alpha0 = torch.tensor(10.0)
    beta0 = torch.tensor(10.0)
    # sample f from the Beta prior
    f = pyro.sample("latent_fairness", dist.Beta(alpha0, beta0))
    # condition on every datapoint through the Bernoulli(f) likelihood;
    # each observation needs its own uniquely named sample site
    for i in range(len(data)):
        pyro.sample("obs_{}".format(i), dist.Bernoulli(f), obs=data[i])

Next we need a corresponding guide, i.e. an appropriate variational distribution $q(f)$ for the latent random variable $f$.  
The only real requirement is that $q(f)$ should be a probability distribution over the range $[0.0, 1.0]$, since $f$ doesn't make sense outside of that range.  
A simple choice is to use another Beta distribution, with parameters $\alpha_q$ and $\beta_q$. In fact, in this particular case this is the 'right' choice: because the Beta distribution is the conjugate prior of the Bernoulli likelihood, the exact posterior is itself a Beta distribution.

In [3]:
import torch.distributions.constraints as constraints
# the constraint ensures that alpha_q > 0, and that beta_q > 0, so that the posterior is well-defined.

def guide(data):
    """Variational distribution q(f) = Beta(alpha_q, beta_q) for the coin model.

    Registers two positive-constrained variational parameters with Pyro, both
    initialised to 15.0, and samples the latent fairness from the Beta they
    define.

    Args:
        data: unused here; accepted so the guide has the same signature as
            the model.
    """
    # Register the two variational parameters with Pyro. The names match the
    # alpha_q / beta_q notation used in the text and in the complete example,
    # so `pyro.param("alpha_q")` / `pyro.param("beta_q")` find them later.
    alpha_q = pyro.param("alpha_q", torch.tensor(15.0),
                         constraint=constraints.positive)
    beta_q = pyro.param("beta_q", torch.tensor(15.0),
                        constraint=constraints.positive)
    # the site name must match the latent site in the model
    pyro.sample("latent_fairness", dist.Beta(alpha_q, beta_q))

Some things to note:

1. The names of the random variables line up exactly between the model and the guide.  
2. `model(data)` and `guide(data)` take the same arguments.  
3. The variational parameters are `torch.tensor`s. The `requires_grad` flag is automatically set to `True` when we call `pyro.param()`.
4. `constraint=constraints.positive` ensures that the variational parameters stay positive during optimization.

In [4]:
# Start from an empty parameter store so that re-running this cell restarts
# the optimisation instead of continuing from previously learned values.
pyro.clear_param_store()

# Observed coin flips: 6 heads (1.0) and 9 tails (0.0). They are defined in
# this cell so that it runs on a fresh kernel; `svi.step(data)` below needs
# `data` to exist.
data = [torch.tensor(1.0) for _ in range(6)] + [torch.tensor(0.0) for _ in range(9)]

# set up the optimizer
adam_params = {"lr": 0.0005, "betas": (0.90, 0.999)}
optimizer = Adam(adam_params)

# set up the inference algorithm
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

n_steps = 5000

# do gradient steps; each call takes one optimisation step on the ELBO
for step in range(n_steps):
    svi.step(data)

In [6]:
import math
import os
import torch
import torch.distributions.constraints as constraints
import pyro
from pyro.optim import Adam
from pyro.infer import SVI, Trace_ELBO
import pyro.distributions as dist

# When the CI environment variable is set, run only a couple of steps.
smoke_test = 'CI' in os.environ
n_steps = 2000 if not smoke_test else 2

# Start from an empty param store in case we're in a REPL.
pyro.clear_param_store()

# Observations: 6 heads (1.0) followed by 9 tails (0.0).
data = []
data.extend(torch.tensor(1.0) for _ in range(6))
data.extend(torch.tensor(0.) for _ in range(9))

def model(data):
    """Beta-Bernoulli coin model.

    Draws the latent fairness from a Beta(10, 10) prior and conditions one
    Bernoulli sample site per observed flip.

    Args:
        data: sequence of observed flips (1.0 for heads, 0.0 for tails).
    """
    # Beta prior over the fairness, controlled by two hyperparameters
    prior = dist.Beta(torch.tensor(10.0), torch.tensor(10.0))
    f = pyro.sample("latent_fairness", prior)
    # one uniquely named observed site per datapoint
    for i, flip in enumerate(data):
        pyro.sample("obs_{}".format(i), dist.Bernoulli(f), obs=flip)

def guide(data):
    """Variational family q(f) = Beta(alpha_q, beta_q).

    Both variational parameters are registered with Pyro under a positivity
    constraint and initialised to 15.0.

    Args:
        data: unused; present so the signature matches the model's.
    """
    positive = constraints.positive
    alpha_q = pyro.param("alpha_q", torch.tensor(15.0), constraint=positive)
    beta_q = pyro.param("beta_q", torch.tensor(15.0), constraint=positive)
    # the latent site, named exactly as in the model
    variational_dist = dist.Beta(alpha_q, beta_q)
    pyro.sample("latent_fairness", variational_dist)

# Optimizer: Adam with a small learning rate.
adam_params = dict(lr=0.0005, betas=(0.90, 0.999))
optimizer = Adam(adam_params)

# Inference algorithm: SVI with the Trace_ELBO loss.
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

# Take n_steps gradient steps, printing a dot every 100 steps as progress.
for step in range(n_steps):
    svi.step(data)
    if not step % 100:
        print('.', end='')

# Read the learned variational parameters back as Python floats.
alpha_q = pyro.param("alpha_q").item()
beta_q = pyro.param("beta_q").item()

# Mean of Beta(alpha_q, beta_q).
inferred_mean = alpha_q / (alpha_q + beta_q)
# Standard deviation of Beta(alpha_q, beta_q), written as mean * sqrt(factor).
factor = beta_q / (alpha_q * (1.0 + alpha_q + beta_q))
inferred_std = inferred_mean * math.sqrt(factor)

print(
    f"\n\n\nBased on the data and our prior belief, the fairness of the coin is {inferred_mean:.3f} "
    f"with a standard deviation of {inferred_std:.3f}"
)

....................
 Based on the data and our prior belief, the fairness of the coin is 0.461 with a standard deviation of 0.089


# SVI II: conditional independence, subsampling, and amortization

The Goal: Scaling SVI to large Datasets

The log-likelihood term in the ELBO can be approximated with
$$ \sum_{i=1}^N \log p(x_i|z) \approx \frac{N}{M} \sum_{i \in I_M} \log p(x_i|z) $$
where $I_M$ is a minibatch of $M$ indices drawn uniformly at random from $\{1, \ldots, N\}$.

## Marking Conditional Independence in Pyro

Pyro provides two language primitives for marking conditional independence: `plate` and `markov`.

### Sequential `plate`

In [None]:
def model(data):
    """Beta-Bernoulli coin model using a sequential ``pyro.plate``.

    Behaves like the ``range``-based model, except that the loop over the
    data is declared with ``pyro.plate``.

    Args:
        data: indexable sequence of observed flips (1.0 heads, 0.0 tails).
    """
    # The prior hyperparameters are defined locally; without these lines
    # alpha0/beta0 would be undefined names here and the model would raise
    # NameError.
    alpha0 = torch.tensor(10.0)
    beta0 = torch.tensor(10.0)
    f = pyro.sample("latent_fairness", dist.Beta(alpha0, beta0))
    # the plate needs a unique name; its second argument is the loop length
    for i in pyro.plate("data_loop", len(data)):
        pyro.sample("obs_{}".format(i), dist.Bernoulli(f), obs=data[i])

`pyro.plate` is very similar to `range`, with one main difference: each invocation of `plate` requires 
the user to provide a unique name. The second argument is an integer, just as for `range`.  

Because each observed `sample` statement occurs within a different execution of the body of the `plate` loop, 
Pyro marks each observation as conditionally independent of the others given the latent variable `f`.

### Vectorized `plate`

In [None]:
# Ten observations as a single tensor: six ones followed by four zeros.
data = torch.cat([torch.ones(6), torch.zeros(4)])

# Vectorized plate: the whole data tensor is observed in one sample statement.
# NOTE(review): `f` is not defined in this cell; presumably it is the latent
# fairness sampled earlier in a model body. Confirm before running.
with pyro.plate("observe_data"):
    likelihood = dist.Bernoulli(f)
    pyro.sample("obs", likelihood, obs=data)

## Subsampling
Depending on the structure of the model and guide, Pyro supports several ways of doing subsampling.
### Automatic subsampling with `plate`

In [None]:
# Sequential plate with subsampling: the loop runs over a subsample of
# 5 indices out of len(data).
for i in pyro.plate("data_loop", size=len(data), subsample_size=5):
    pyro.sample(
        "obs_{}".format(i),
        dist.Bernoulli(f),
        obs=data[i],
    )

In [None]:
# Vectorized plate with subsampling: `ind` holds the indices of the current
# subsample (5 of them here), which are used to pick the matching rows of data.
# The plate size is taken from the data itself so that every index in `ind`
# is valid; a hard-coded size larger than len(data) would let index_select
# receive out-of-range indices.
with pyro.plate("observe_data", size=len(data), subsample_size=5) as ind:
    pyro.sample("obs", dist.Bernoulli(f), obs=data.index_select(0, ind))

`plate` now returns a tensor of indices `ind`.  

Because this subsampling is stateless, it can lead to a problem: for a sufficiently large dataset, even after a large number of iterations there is a non-negligible probability that some of the datapoints will never have been selected.

### Custom subsampling strategies with `plate`

#### Subsampling when there are only local random variables
Consider a model whose joint probability density is given by $$ p(x,z) = \prod_{i=1}^N p(x_i|z_i)p(z_i) $$
For a model with this dependency structure the scale factor introduced by subsampling scales all the terms in the ELBO by the same amount. This is the case, for example, for a vanilla VAE. This explains why for the VAE it’s permissible for the user to take complete control over subsampling and pass mini-batches directly to the model and guide

#### Subsampling when there are both global and local random variables
Consider the model specified by the following joint distribution: $$ p(x,z,\beta) = p(\beta) \prod_{i=1}^N p(x_i|z_i)p(z_i|\beta) $$
There are N observations, and N local latent random variables. There is also a global latent random variable $\beta$.  
Our guide is a factorized distribution: $$ q(z,\beta) = q(\beta) \prod_{i=1}^N q(z_i|\beta,\lambda_i) $$

In [3]:
# Protein-RNA multi-omics integration analysis: how can the two modalities be integrated, both separately and jointly?
# How should the convolution step be applied, separately and jointly? The goal is to predict protein-RNA interactions.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

2