From 054c520974cbf6d642f8cd78ab65c7b0823d15e4 Mon Sep 17 00:00:00 2001 From: Dustin Tran Date: Sun, 28 Jan 2018 15:10:06 -0800 Subject: [PATCH 1/3] update docs --- README.md | 5 +++-- docs/tex/index.tex | 6 +++--- docs/tex/troubleshooting.tex | 11 +++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 6291d733a..33dd43f77 100644 --- a/README.md +++ b/README.md @@ -15,8 +15,9 @@ It supports __modeling__ with + Directed graphical models + Neural networks (via libraries such as - [Keras](http://keras.io) and [TensorFlow - Slim](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim)) + [`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers) + and + [Keras](http://keras.io)) + Implicit generative models + Bayesian nonparametrics and probabilistic programs diff --git a/docs/tex/index.tex b/docs/tex/index.tex index be29c113a..006ab9925 100644 --- a/docs/tex/index.tex +++ b/docs/tex/index.tex @@ -14,9 +14,9 @@ \subsection{A library for probabilistic modeling, inference, and criticism.} \begin{itemize} \item Directed graphical models \item Neural networks (via libraries such as - \href{http://keras.io}{Keras} and - \href{https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim}{TensorFlow - Slim}) + \href{https://www.tensorflow.org/api_docs/python/tf/layers}{\texttt{tf.layers}} + and + \href{http://keras.io}{Keras}) \item Implicit generative models \item Bayesian nonparametrics and probabilistic programs \end{itemize} diff --git a/docs/tex/troubleshooting.tex b/docs/tex/troubleshooting.tex index 678b2acb5..3de76f97f 100644 --- a/docs/tex/troubleshooting.tex +++ b/docs/tex/troubleshooting.tex @@ -47,18 +47,17 @@ \subsubsection{Full Installation} minimal effort under a one-line interface. Observations was originally developed for Edward and it has since become a standalone library for general machine learning. - \item Neural networks are supported through four libraries: + \item Neural networks are supported through any library operating + on TensorFlow. For example: \texttt{tf.layers}, \href{http://keras.io}{Keras} (>=1.0) \begin{lstlisting}[language=JSON] pip install keras==2.0.4 \end{lstlisting} + and \href{https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim}{TensorFlow Slim} - (native in TensorFlow), and - \href{https://github.com/google/prettytensor}{PrettyTensor} (>=0.7.4) -\begin{lstlisting}[language=JSON] -pip install prettytensor -\end{lstlisting} + (native in TensorFlow). + Note that for Keras 2.0.5 and beyond, all neural net layer transformations cannot be directly applied on random variables anymore. For example, if \texttt{x} is a \texttt{ed.RandomVariable} object, one must call \texttt{tf.convert_to_tensor} before applying it to a layer transformation, \texttt{Dense(256)(tf.convert_to_tensor(x))}. See \href{https://github.com/fchollet/keras/issues/6979}{here} for more details. 
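As a minimal sketch of the conversion described above (the shapes and import paths here are illustrative, assuming Keras >= 2.0.5 with the TensorFlow backend):
\begin{lstlisting}[language=Python]
import tensorflow as tf
from edward.models import Normal
from keras.layers import Dense

# x is an ed.RandomVariable; convert it to a tf.Tensor before
# handing it to a Keras layer transformation.
x = Normal(loc=tf.zeros([256, 784]), scale=tf.ones([256, 784]))
h = Dense(256, activation='relu')(tf.convert_to_tensor(x))
\end{lstlisting}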
\item Notebooks require From bbb450054d7c339245ceba703ea6cdc90fb2a5f9 Mon Sep 17 00:00:00 2001 From: Dustin Tran Date: Sun, 28 Jan 2018 15:47:52 -0800 Subject: [PATCH 2/3] rm '#!/usr/bin/env python' --- examples/bayesian_linear_regression.py | 1 - examples/bayesian_linear_regression_implicitklqp.py | 1 - examples/bayesian_logistic_regression.py | 1 - examples/bayesian_nn.py | 1 - examples/beta_bernoulli.py | 1 - examples/bigan.py | 1 - examples/cox_process.py | 1 - examples/deep_exponential_family.py | 1 - examples/dirichlet_categorical.py | 1 - examples/factor_analysis.py | 1 - examples/gan_synthetic_data.py | 1 - examples/gan_wasserstein.py | 1 - examples/gan_wasserstein_synthetic.py | 1 - examples/invgamma_normal_mh.py | 1 - examples/irt.py | 1 - examples/iwvi.py | 1 - examples/lstm.py | 1 - examples/mixture_gaussian_gibbs.py | 1 - examples/mixture_gaussian_mh.py | 1 - examples/normal.py | 1 - examples/normal_normal.py | 1 - examples/normal_sgld.py | 1 - examples/pp_dirichlet_process.py | 1 - examples/pp_dynamic_shape.py | 1 - examples/pp_persistent_randomness.py | 1 - examples/pp_stochastic_control_flow.py | 1 - examples/pp_stochastic_recursion.py | 1 - examples/probabilistic_matrix_factorization.py | 1 - examples/probabilistic_pca_subsampling.py | 1 - examples/rasch_model.py | 1 - examples/sigmoid_belief_network.py | 1 - examples/stochastic_block_model.py | 1 - examples/vae.py | 1 - examples/vae_convolutional.py | 1 - examples/vae_convolutional_prettytensor.py | 1 - tests/data/generate_test_saver.py | 1 - tests/data/generate_toy_data_tfrecords.py | 1 - 37 files changed, 37 deletions(-) diff --git a/examples/bayesian_linear_regression.py b/examples/bayesian_linear_regression.py index d814945bb..038939fd5 100644 --- a/examples/bayesian_linear_regression.py +++ b/examples/bayesian_linear_regression.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Bayesian linear regression using stochastic gradient Hamiltonian Monte Carlo. diff --git a/examples/bayesian_linear_regression_implicitklqp.py b/examples/bayesian_linear_regression_implicitklqp.py index 958f52361..b694f990c 100644 --- a/examples/bayesian_linear_regression_implicitklqp.py +++ b/examples/bayesian_linear_regression_implicitklqp.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Bayesian linear regression. Inference uses data subsampling and scales the log-likelihood. diff --git a/examples/bayesian_logistic_regression.py b/examples/bayesian_logistic_regression.py index 65a16be0d..a9c123e7d 100644 --- a/examples/bayesian_logistic_regression.py +++ b/examples/bayesian_logistic_regression.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Bayesian logistic regression using Hamiltonian Monte Carlo. We visualize the fit. diff --git a/examples/bayesian_nn.py b/examples/bayesian_nn.py index e978d5bc2..6860c0555 100644 --- a/examples/bayesian_nn.py +++ b/examples/bayesian_nn.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Bayesian neural network using variational inference (see, e.g., Blundell et al. (2015); Kucukelbir et al. (2016)). diff --git a/examples/beta_bernoulli.py b/examples/beta_bernoulli.py index c602f323c..a21f62740 100644 --- a/examples/beta_bernoulli.py +++ b/examples/beta_bernoulli.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """A simple coin flipping example. Inspired by Stan's toy example. 
""" from __future__ import absolute_import diff --git a/examples/bigan.py b/examples/bigan.py index 1b16d06e3..c857bfa9e 100644 --- a/examples/bigan.py +++ b/examples/bigan.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Adversarially Learned Inference (Dumoulin et al., 2017), aka Bidirectional Generative Adversarial Networks (Donahue et al., 2017), for joint learning of generator and inference networks for MNIST. diff --git a/examples/cox_process.py b/examples/cox_process.py index 0f59e91e8..f1ddb263a 100644 --- a/examples/cox_process.py +++ b/examples/cox_process.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """A Cox process model for spatial analysis (Cox, 1955; Miller et al., 2014). diff --git a/examples/deep_exponential_family.py b/examples/deep_exponential_family.py index 2fd01c9a2..330c9225f 100644 --- a/examples/deep_exponential_family.py +++ b/examples/deep_exponential_family.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Sparse Gamma deep exponential family (Ranganath et al., 2015). We apply it as a topic model on the collection of NIPS 2011 conference papers. diff --git a/examples/dirichlet_categorical.py b/examples/dirichlet_categorical.py index 81bf04347..ea944368f 100644 --- a/examples/dirichlet_categorical.py +++ b/examples/dirichlet_categorical.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Dirichlet-Categorical model. Posterior inference with Edward's BBVI. diff --git a/examples/factor_analysis.py b/examples/factor_analysis.py index f20f06b4b..ec3ec3d6c 100644 --- a/examples/factor_analysis.py +++ b/examples/factor_analysis.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Logistic factor analysis on MNIST. Using Monte Carlo EM, with HMC for the E-step and MAP for the M-step. We fit to just one data point in MNIST. diff --git a/examples/gan_synthetic_data.py b/examples/gan_synthetic_data.py index e2ab7e9f3..aa56c9b84 100644 --- a/examples/gan_synthetic_data.py +++ b/examples/gan_synthetic_data.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Generative adversarial network for toy Gaussian data (Goodfellow et al., 2014). diff --git a/examples/gan_wasserstein.py b/examples/gan_wasserstein.py index 3dc688bf6..9d7feb40f 100644 --- a/examples/gan_wasserstein.py +++ b/examples/gan_wasserstein.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Wasserstein generative adversarial network for MNIST (Arjovsky et al., 2017). It modifies GANs (Goodfellow et al., 2014) to optimize under the Wasserstein distance. diff --git a/examples/gan_wasserstein_synthetic.py b/examples/gan_wasserstein_synthetic.py index 28aa7f019..a68941a1c 100644 --- a/examples/gan_wasserstein_synthetic.py +++ b/examples/gan_wasserstein_synthetic.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Wasserstein generative adversarial network for toy Gaussian data (Arjovsky et al., 2017). A gradient penalty is used to approximate the 1-Lipschitz functional family in the Wasserstein distance (Gulrajani diff --git a/examples/invgamma_normal_mh.py b/examples/invgamma_normal_mh.py index f959830db..d542ce486 100644 --- a/examples/invgamma_normal_mh.py +++ b/examples/invgamma_normal_mh.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """ InverseGamma-Normal model Posterior inference with Metropolis Hastings diff --git a/examples/irt.py b/examples/irt.py index 5955b2492..306a63bc8 100644 --- a/examples/irt.py +++ b/examples/irt.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Bayesian Item Response Theory (IRT) Mixed Effects Model using variational inference. 
diff --git a/examples/iwvi.py b/examples/iwvi.py index 53d6cc300..c11f926d9 100644 --- a/examples/iwvi.py +++ b/examples/iwvi.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """A demo of how to develop new inference algorithms in Edward. Here we implement importance-weighted variational inference. We test it on logistic regression. diff --git a/examples/lstm.py b/examples/lstm.py index 045871b97..762c90752 100644 --- a/examples/lstm.py +++ b/examples/lstm.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """LSTM language model on text8. Default hyperparameters achieve ~78.4 NLL at epoch 50, ~76.1423 NLL at diff --git a/examples/mixture_gaussian_gibbs.py b/examples/mixture_gaussian_gibbs.py index f31e6cb0a..af66b309b 100644 --- a/examples/mixture_gaussian_gibbs.py +++ b/examples/mixture_gaussian_gibbs.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Mixture of Gaussians, with block Gibbs for inference. """ from __future__ import absolute_import diff --git a/examples/mixture_gaussian_mh.py b/examples/mixture_gaussian_mh.py index bdd125117..5bccca711 100644 --- a/examples/mixture_gaussian_mh.py +++ b/examples/mixture_gaussian_mh.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Mixture of Gaussians. Perform inference with Metropolis-Hastings. It utterly fails. This is diff --git a/examples/normal.py b/examples/normal.py index fbe32e8cf..03fdc7b62 100644 --- a/examples/normal.py +++ b/examples/normal.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Correlated normal posterior. Inference with Hamiltonian Monte Carlo. """ from __future__ import absolute_import diff --git a/examples/normal_normal.py b/examples/normal_normal.py index 3765068c5..215090b8c 100644 --- a/examples/normal_normal.py +++ b/examples/normal_normal.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Normal-normal model using Hamiltonian Monte Carlo.""" from __future__ import absolute_import from __future__ import division diff --git a/examples/normal_sgld.py b/examples/normal_sgld.py index ac5cb5011..aad47d3d2 100644 --- a/examples/normal_sgld.py +++ b/examples/normal_sgld.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Correlated normal posterior. Inference with stochastic gradient Langevin dynamics. """ diff --git a/examples/pp_dirichlet_process.py b/examples/pp_dirichlet_process.py index a61fe0e83..08b071d80 100644 --- a/examples/pp_dirichlet_process.py +++ b/examples/pp_dirichlet_process.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Dirichlet process. We implement sample generation from a Dirichlet process (with no base diff --git a/examples/pp_dynamic_shape.py b/examples/pp_dynamic_shape.py index af2dd11b7..86237b1fc 100644 --- a/examples/pp_dynamic_shape.py +++ b/examples/pp_dynamic_shape.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Dynamic shapes. We build a random variable whose size depends on a sample from another diff --git a/examples/pp_persistent_randomness.py b/examples/pp_persistent_randomness.py index 6ccdb0c51..d2e7e81f7 100644 --- a/examples/pp_persistent_randomness.py +++ b/examples/pp_persistent_randomness.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Persistent randomness. Our language defines random variables. They enable memoization in the diff --git a/examples/pp_stochastic_control_flow.py b/examples/pp_stochastic_control_flow.py index 15bf18edd..60e148e93 100644 --- a/examples/pp_stochastic_control_flow.py +++ b/examples/pp_stochastic_control_flow.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Stochastic control flow. 
We sample from a geometric random variable by using samples from diff --git a/examples/pp_stochastic_recursion.py b/examples/pp_stochastic_recursion.py index 93e837bf4..40a703ca7 100644 --- a/examples/pp_stochastic_recursion.py +++ b/examples/pp_stochastic_recursion.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Stochastic recursion. We sample from a geometric random variable by using samples from diff --git a/examples/probabilistic_matrix_factorization.py b/examples/probabilistic_matrix_factorization.py index 92e9cccc8..acd51d63d 100644 --- a/examples/probabilistic_matrix_factorization.py +++ b/examples/probabilistic_matrix_factorization.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Probabilistic matrix factorization using variational inference. Visualizes the actual and the estimated rating matrices as heatmaps. diff --git a/examples/probabilistic_pca_subsampling.py b/examples/probabilistic_pca_subsampling.py index aa4157cba..5dd0a6641 100644 --- a/examples/probabilistic_pca_subsampling.py +++ b/examples/probabilistic_pca_subsampling.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Probabilistic principal components analysis (Tipping and Bishop, 1999). Inference uses data subsampling. diff --git a/examples/rasch_model.py b/examples/rasch_model.py index a1c3ea570..65ebf557c 100644 --- a/examples/rasch_model.py +++ b/examples/rasch_model.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Rasch model (Rasch, 1960).""" from __future__ import absolute_import from __future__ import division diff --git a/examples/sigmoid_belief_network.py b/examples/sigmoid_belief_network.py index 0de8b7457..47382d258 100644 --- a/examples/sigmoid_belief_network.py +++ b/examples/sigmoid_belief_network.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Sigmoid belief network (Neal, 1990) trained on the Caltech 101 Silhouettes data set. diff --git a/examples/stochastic_block_model.py b/examples/stochastic_block_model.py index 28e23ecb3..cba80a867 100644 --- a/examples/stochastic_block_model.py +++ b/examples/stochastic_block_model.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Stochastic block model.""" from __future__ import absolute_import from __future__ import division diff --git a/examples/vae.py b/examples/vae.py index 610947c3f..1029cfc8a 100644 --- a/examples/vae.py +++ b/examples/vae.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Variational auto-encoder for MNIST data. References diff --git a/examples/vae_convolutional.py b/examples/vae_convolutional.py index e53733904..537de5343 100644 --- a/examples/vae_convolutional.py +++ b/examples/vae_convolutional.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Convolutional variational auto-encoder for binarized MNIST. The neural networks are written with TensorFlow Slim. diff --git a/examples/vae_convolutional_prettytensor.py b/examples/vae_convolutional_prettytensor.py index d373a84bf..bcab70260 100644 --- a/examples/vae_convolutional_prettytensor.py +++ b/examples/vae_convolutional_prettytensor.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Convolutional variational auto-encoder for binarized MNIST. The neural networks are written with Pretty Tensor. 
diff --git a/tests/data/generate_test_saver.py b/tests/data/generate_test_saver.py index 0dc8dd15b..d5353f7bd 100644 --- a/tests/data/generate_test_saver.py +++ b/tests/data/generate_test_saver.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Generate `test_saver`.""" from __future__ import absolute_import from __future__ import division diff --git a/tests/data/generate_toy_data_tfrecords.py b/tests/data/generate_toy_data_tfrecords.py index 2e2a4f560..6c3d91e61 100644 --- a/tests/data/generate_toy_data_tfrecords.py +++ b/tests/data/generate_toy_data_tfrecords.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Generate `toy_data.tfrecords`.""" from __future__ import absolute_import from __future__ import division From a02787ca7de4b08d25e0d4cffd2c1582d5655d71 Mon Sep 17 00:00:00 2001 From: Dustin Tran Date: Sun, 28 Jan 2018 15:23:35 -0800 Subject: [PATCH 3/3] update examples/ --- examples/bayesian_linear_regression.py | 109 +++---- ...bayesian_linear_regression_implicitklqp.py | 137 +++++---- examples/bayesian_logistic_regression.py | 106 ++++--- examples/bayesian_nn.py | 121 ++++---- examples/beta_bernoulli.py | 72 +++-- examples/beta_bernoulli_conjugate.py | 38 +-- examples/bigan.py | 167 +++++----- examples/cox_process.py | 43 +-- examples/deep_exponential_family.py | 289 +++++++++--------- examples/dirichlet_categorical.py | 39 ++- 10 files changed, 597 insertions(+), 524 deletions(-) diff --git a/examples/bayesian_linear_regression.py b/examples/bayesian_linear_regression.py index 038939fd5..8d0a6c997 100644 --- a/examples/bayesian_linear_regression.py +++ b/examples/bayesian_linear_regression.py @@ -19,6 +19,16 @@ from edward.models import Normal, Empirical +tf.flags.DEFINE_integer("N", default=40, help="Number of data points.") +tf.flags.DEFINE_integer("D", default=1, help="Number of features.") +tf.flags.DEFINE_integer("T", default=5000, help="Number of samples.") +tf.flags.DEFINE_integer("nburn", default=100, + help="Number of burn-in samples.") +tf.flags.DEFINE_integer("stride", default=10, + help="Frequency with which to plots samples.") + +FLAGS = tf.flags.FLAGS + def build_toy_dataset(N, noise_std=0.5): X = np.concatenate([np.linspace(0, 2, num=N / 2), @@ -28,73 +38,70 @@ def build_toy_dataset(N, noise_std=0.5): return X, y -ed.set_seed(42) - -N = 40 # number of data points -D = 1 # number of features +def main(_): + ed.set_seed(42) -# DATA -X_train, y_train = build_toy_dataset(N) -X_test, y_test = build_toy_dataset(N) + # DATA + X_train, y_train = build_toy_dataset(FLAGS.N) + X_test, y_test = build_toy_dataset(FLAGS.N) -# MODEL -X = tf.placeholder(tf.float32, [N, D]) -w = Normal(loc=tf.zeros(D), scale=tf.ones(D)) -b = Normal(loc=tf.zeros(1), scale=tf.ones(1)) -y = Normal(loc=ed.dot(X, w) + b, scale=tf.ones(N)) + # MODEL + X = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.D]) + w = Normal(loc=tf.zeros(FLAGS.D), scale=tf.ones(FLAGS.D)) + b = Normal(loc=tf.zeros(1), scale=tf.ones(1)) + y = Normal(loc=ed.dot(X, w) + b, scale=tf.ones(FLAGS.N)) -# INFERENCE -T = 5000 # Number of samples. -nburn = 100 # Number of burn-in samples. -stride = 10 # Frequency with which to plot samples. 
-qw = Empirical(params=tf.Variable(tf.random_normal([T, D]))) -qb = Empirical(params=tf.Variable(tf.random_normal([T, 1]))) + # INFERENCE + qw = Empirical(params=tf.get_variable("qw/params", [FLAGS.T, FLAGS.D])) + qb = Empirical(params=tf.get_variable("qb/params", [FLAGS.T, 1])) -inference = ed.SGHMC({w: qw, b: qb}, data={X: X_train, y: y_train}) -inference.run(step_size=1e-3) + inference = ed.SGHMC({w: qw, b: qb}, data={X: X_train, y: y_train}) + inference.run(step_size=1e-3) + # CRITICISM -# CRITICISM + # Plot posterior samples. + sns.jointplot(qb.params.eval()[FLAGS.nburn:FLAGS.T:FLAGS.stride], + qw.params.eval()[FLAGS.nburn:FLAGS.T:FLAGS.stride]) + plt.show() -# Plot posterior samples. -sns.jointplot(qb.params.eval()[nburn:T:stride], - qw.params.eval()[nburn:T:stride]) -plt.show() + # Posterior predictive checks. + y_post = ed.copy(y, {w: qw, b: qb}) + # This is equivalent to + # y_post = Normal(loc=ed.dot(X, qw) + qb, scale=tf.ones(FLAGS.N)) -# Posterior predictive checks. -y_post = ed.copy(y, {w: qw, b: qb}) -# This is equivalent to -# y_post = Normal(loc=ed.dot(X, qw) + qb, scale=tf.ones(N)) + print("Mean squared error on test data:") + print(ed.evaluate('mean_squared_error', data={X: X_test, y_post: y_test})) -print("Mean squared error on test data:") -print(ed.evaluate('mean_squared_error', data={X: X_test, y_post: y_test})) + print("Displaying prior predictive samples.") + n_prior_samples = 10 -print("Displaying prior predictive samples.") -n_prior_samples = 10 + w_prior = w.sample(n_prior_samples).eval() + b_prior = b.sample(n_prior_samples).eval() -w_prior = w.sample(n_prior_samples).eval() -b_prior = b.sample(n_prior_samples).eval() + plt.scatter(X_train, y_train) -plt.scatter(X_train, y_train) + inputs = np.linspace(-1, 10, num=400) + for ns in range(n_prior_samples): + output = inputs * w_prior[ns] + b_prior[ns] + plt.plot(inputs, output) -inputs = np.linspace(-1, 10, num=400) -for ns in range(n_prior_samples): - output = inputs * w_prior[ns] + b_prior[ns] - plt.plot(inputs, output) + plt.show() -plt.show() + print("Displaying posterior predictive samples.") + n_posterior_samples = 10 -print("Displaying posterior predictive samples.") -n_posterior_samples = 10 + w_post = qw.sample(n_posterior_samples).eval() + b_post = qb.sample(n_posterior_samples).eval() -w_post = qw.sample(n_posterior_samples).eval() -b_post = qb.sample(n_posterior_samples).eval() + plt.scatter(X_train, y_train) -plt.scatter(X_train, y_train) + inputs = np.linspace(-1, 10, num=400) + for ns in range(n_posterior_samples): + output = inputs * w_post[ns] + b_post[ns] + plt.plot(inputs, output) -inputs = np.linspace(-1, 10, num=400) -for ns in range(n_posterior_samples): - output = inputs * w_post[ns] + b_post[ns] - plt.plot(inputs, output) + plt.show() -plt.show() +if __name__ == "__main__": + tf.app.run() diff --git a/examples/bayesian_linear_regression_implicitklqp.py b/examples/bayesian_linear_regression_implicitklqp.py index b694f990c..41a72a132 100644 --- a/examples/bayesian_linear_regression_implicitklqp.py +++ b/examples/bayesian_linear_regression_implicitklqp.py @@ -24,7 +24,12 @@ import tensorflow as tf from edward.models import Normal -from tensorflow.contrib import slim + +tf.flags.DEFINE_integer("N", default=500, help="Number of data points.") +tf.flags.DEFINE_integer("M", default=50, help="Batch size during training.") +tf.flags.DEFINE_integer("D", default=2, help="Number of features.") + +FLAGS = tf.flags.FLAGS def build_toy_dataset(N, w, noise_std=0.1): @@ -34,22 +39,6 @@ def 
build_toy_dataset(N, w, noise_std=0.1): return x, y -def ratio_estimator(data, local_vars, global_vars): - """Takes as input a dict of data x, local variable samples z, and - global variable samples beta; outputs real values of shape - (x.shape[0] + z.shape[0],). In this example, there are no local - variables. - """ - # data[y] has shape (M,); global_vars[w] has shape (D,) - # we concatenate w to each data point y, so input has shape (M, 1 + D) - input = tf.concat([ - tf.reshape(data[y], [M, 1]), - tf.tile(tf.reshape(global_vars[w], [1, D]), [M, 1])], 1) - hidden = slim.fully_connected(input, 64, activation_fn=tf.nn.relu) - output = slim.fully_connected(hidden, 1, activation_fn=None) - return output - - def generator(arrays, batch_size): """Generate batches, one with respect to each array's first axis.""" starts = [0] * len(arrays) # pointers to where we are in iteration @@ -69,52 +58,68 @@ def generator(arrays, batch_size): yield batches -ed.set_seed(42) - -N = 500 # number of data points -M = 50 # batch size during training -D = 2 # number of features - -# DATA -w_true = np.ones(D) * 5.0 -X_train, y_train = build_toy_dataset(N, w_true) -X_test, y_test = build_toy_dataset(N, w_true) -data = generator([X_train, y_train], M) - -# MODEL -X = tf.placeholder(tf.float32, [M, D]) -y_ph = tf.placeholder(tf.float32, [M]) -w = Normal(loc=tf.zeros(D), scale=tf.ones(D)) -y = Normal(loc=ed.dot(X, w), scale=tf.ones(M)) - -# INFERENCE -qw = Normal(loc=tf.Variable(tf.random_normal([D]) + 1.0), - scale=tf.nn.softplus(tf.Variable(tf.random_normal([D])))) - -inference = ed.ImplicitKLqp( - {w: qw}, data={y: y_ph}, - discriminator=ratio_estimator, global_vars={w: qw}) -inference.initialize(n_iter=5000, n_print=100, scale={y: float(N) / M}) - -sess = ed.get_session() -tf.global_variables_initializer().run() - -for _ in range(inference.n_iter): - X_batch, y_batch = next(data) - for _ in range(5): - info_dict_d = inference.update( - variables="Disc", feed_dict={X: X_batch, y_ph: y_batch}) - - info_dict = inference.update( - variables="Gen", feed_dict={X: X_batch, y_ph: y_batch}) - info_dict['loss_d'] = info_dict_d['loss_d'] - info_dict['t'] = info_dict['t'] // 6 # say set of 6 updates is 1 iteration - - t = info_dict['t'] - inference.print_progress(info_dict) - if t == 1 or t % inference.n_print == 0: - # Check inferred posterior parameters. - mean, std = sess.run([qw.mean(), qw.stddev()]) - print("\nInferred mean & std:") - print(mean) - print(std) +def main(_): + def ratio_estimator(data, local_vars, global_vars): + """Takes as input a dict of data x, local variable samples z, and + global variable samples beta; outputs real values of shape + (x.shape[0] + z.shape[0],). In this example, there are no local + variables. 
+ """ + # data[y] has shape (M,); global_vars[w] has shape (D,) + # we concatenate w to each data point y, so input has shape (M, 1 + D) + input = tf.concat([ + tf.reshape(data[y], [FLAGS.M, 1]), + tf.tile(tf.reshape(global_vars[w], [1, FLAGS.D]), [FLAGS.M, 1])], 1) + hidden = tf.layers.dense(input, 64, activation=tf.nn.relu) + output = tf.layers.dense(hidden, 1, activation=None) + return output + + ed.set_seed(42) + + # DATA + w_true = np.ones(FLAGS.D) * 5.0 + X_train, y_train = build_toy_dataset(FLAGS.N, w_true) + X_test, y_test = build_toy_dataset(FLAGS.N, w_true) + data = generator([X_train, y_train], FLAGS.M) + + # MODEL + X = tf.placeholder(tf.float32, [FLAGS.M, FLAGS.D]) + y_ph = tf.placeholder(tf.float32, [FLAGS.M]) + w = Normal(loc=tf.zeros(FLAGS.D), scale=tf.ones(FLAGS.D)) + y = Normal(loc=ed.dot(X, w), scale=tf.ones(FLAGS.M)) + + # INFERENCE + qw = Normal(loc=tf.get_variable("qw/loc", [FLAGS.D]) + 1.0, + scale=tf.nn.softplus(tf.get_variable("qw/scale", [FLAGS.D]))) + + inference = ed.ImplicitKLqp( + {w: qw}, data={y: y_ph}, + discriminator=ratio_estimator, global_vars={w: qw}) + inference.initialize(n_iter=5000, n_print=100, + scale={y: float(FLAGS.N) / FLAGS.M}) + + sess = ed.get_session() + tf.global_variables_initializer().run() + + for _ in range(inference.n_iter): + X_batch, y_batch = next(data) + for _ in range(5): + info_dict_d = inference.update( + variables="Disc", feed_dict={X: X_batch, y_ph: y_batch}) + + info_dict = inference.update( + variables="Gen", feed_dict={X: X_batch, y_ph: y_batch}) + info_dict['loss_d'] = info_dict_d['loss_d'] + info_dict['t'] = info_dict['t'] // 6 # say set of 6 updates is 1 iteration + + t = info_dict['t'] + inference.print_progress(info_dict) + if t == 1 or t % inference.n_print == 0: + # Check inferred posterior parameters. 
+ mean, std = sess.run([qw.mean(), qw.stddev()]) + print("\nInferred mean & std:") + print(mean) + print(std) + +if __name__ == "__main__": + tf.app.run() diff --git a/examples/bayesian_logistic_regression.py b/examples/bayesian_logistic_regression.py index a9c123e7d..141116bb7 100644 --- a/examples/bayesian_logistic_regression.py +++ b/examples/bayesian_logistic_regression.py @@ -13,6 +13,12 @@ from edward.models import Bernoulli, Normal, Empirical +tf.flags.DEFINE_integer("N", default=40, help="Number of data points.") +tf.flags.DEFINE_integer("D", default=1, help="Number of features.") +tf.flags.DEFINE_integer("T", default=5000, help="Number of samples.") + +FLAGS = tf.flags.FLAGS + def build_toy_dataset(N, noise_std=0.1): D = 1 @@ -25,68 +31,68 @@ def build_toy_dataset(N, noise_std=0.1): return X, y -ed.set_seed(42) +def main(_): + ed.set_seed(42) -N = 40 # number of data points -D = 1 # number of features + # DATA + X_train, y_train = build_toy_dataset(FLAGS.N) -# DATA -X_train, y_train = build_toy_dataset(N) + # MODEL + X = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.D]) + w = Normal(loc=tf.zeros(FLAGS.D), scale=3.0 * tf.ones(FLAGS.D)) + b = Normal(loc=tf.zeros([]), scale=3.0 * tf.ones([])) + y = Bernoulli(logits=ed.dot(X, w) + b) -# MODEL -X = tf.placeholder(tf.float32, [N, D]) -w = Normal(loc=tf.zeros(D), scale=3.0 * tf.ones(D)) -b = Normal(loc=tf.zeros([]), scale=3.0 * tf.ones([])) -y = Bernoulli(logits=ed.dot(X, w) + b) + # INFERENCE + qw = Empirical(params=tf.get_variable("qw/params", [FLAGS.T, FLAGS.D])) + qb = Empirical(params=tf.get_variable("qb/params", [FLAGS.T])) -# INFERENCE -T = 5000 # number of samples -qw = Empirical(params=tf.Variable(tf.random_normal([T, D]))) -qb = Empirical(params=tf.Variable(tf.random_normal([T]))) + inference = ed.HMC({w: qw, b: qb}, data={X: X_train, y: y_train}) + inference.initialize(n_print=10, step_size=0.6) -inference = ed.HMC({w: qw, b: qb}, data={X: X_train, y: y_train}) -inference.initialize(n_print=10, step_size=0.6) + # Alternatively, use variational inference. + # qw_loc = tf.get_variable("qw_loc", [FLAGS.D]) + # qw_scale = tf.nn.softplus(tf.get_variable("qw_scale", [FLAGS.D])) + # qb_loc = tf.get_variable("qb_loc", []) + 10.0 + # qb_scale = tf.nn.softplus(tf.get_variable("qb_scale", [])) -# Alternatively, use variational inference. -# qw_loc = tf.Variable(tf.random_normal([D])) -# qw_scale = tf.nn.softplus(tf.Variable(tf.random_normal([D]))) -# qb_loc = tf.Variable(tf.random_normal([]) + 10) -# qb_scale = tf.nn.softplus(tf.Variable(tf.random_normal([]))) + # qw = Normal(loc=qw_loc, scale=qw_scale) + # qb = Normal(loc=qb_loc, scale=qb_scale) -# qw = Normal(loc=qw_loc, scale=qw_scale) -# qb = Normal(loc=qb_loc, scale=qb_scale) + # inference = ed.KLqp({w: qw, b: qb}, data={X: X_train, y: y_train}) + # inference.initialize(n_print=10, n_iter=600) -# inference = ed.KLqp({w: qw, b: qb}, data={X: X_train, y: y_train}) -# inference.initialize(n_print=10, n_iter=600) + tf.global_variables_initializer().run() -tf.global_variables_initializer().run() + # Set up figure. + fig = plt.figure(figsize=(8, 8), facecolor='white') + ax = fig.add_subplot(111, frameon=False) + plt.ion() + plt.show(block=False) -# Set up figure. -fig = plt.figure(figsize=(8, 8), facecolor='white') -ax = fig.add_subplot(111, frameon=False) -plt.ion() -plt.show(block=False) + # Build samples from inferred posterior. 
+ n_samples = 50 + inputs = np.linspace(-5, 3, num=400, dtype=np.float32).reshape((400, 1)) + probs = tf.stack([tf.sigmoid(ed.dot(inputs, qw.sample()) + qb.sample()) + for _ in range(n_samples)]) -# Build samples from inferred posterior. -n_samples = 50 -inputs = np.linspace(-5, 3, num=400, dtype=np.float32).reshape((400, 1)) -probs = tf.stack([tf.sigmoid(ed.dot(inputs, qw.sample()) + qb.sample()) - for _ in range(n_samples)]) + for t in range(inference.n_iter): + info_dict = inference.update() + inference.print_progress(info_dict) -for t in range(inference.n_iter): - info_dict = inference.update() - inference.print_progress(info_dict) + if t % inference.n_print == 0: + outputs = probs.eval() - if t % inference.n_print == 0: - outputs = probs.eval() + # Plot data and functions + plt.cla() + ax.plot(X_train[:], y_train, 'bx') + for s in range(n_samples): + ax.plot(inputs[:], outputs[s], alpha=0.2) - # Plot data and functions - plt.cla() - ax.plot(X_train[:], y_train, 'bx') - for s in range(n_samples): - ax.plot(inputs[:], outputs[s], alpha=0.2) + ax.set_xlim([-5, 3]) + ax.set_ylim([-0.5, 1.5]) + plt.draw() + plt.pause(1.0 / 60.0) - ax.set_xlim([-5, 3]) - ax.set_ylim([-0.5, 1.5]) - plt.draw() - plt.pause(1.0 / 60.0) +if __name__ == "__main__": + tf.app.run() diff --git a/examples/bayesian_nn.py b/examples/bayesian_nn.py index 6860c0555..c7a710ffb 100644 --- a/examples/bayesian_nn.py +++ b/examples/bayesian_nn.py @@ -19,6 +19,11 @@ from edward.models import Normal +tf.flags.DEFINE_integer("N", default=40, help="Number of data points.") +tf.flags.DEFINE_integer("D", default=1, help="Number of features.") + +FLAGS = tf.flags.FLAGS + def build_toy_dataset(N=40, noise_std=0.1): D = 1 @@ -30,61 +35,61 @@ def build_toy_dataset(N=40, noise_std=0.1): return X, y -def neural_network(X): - h = tf.tanh(tf.matmul(X, W_0) + b_0) - h = tf.tanh(tf.matmul(h, W_1) + b_1) - h = tf.matmul(h, W_2) + b_2 - return tf.reshape(h, [-1]) - - -ed.set_seed(42) - -N = 40 # number of data points -D = 1 # number of features - -# DATA -X_train, y_train = build_toy_dataset(N) - -# MODEL -with tf.name_scope("model"): - W_0 = Normal(loc=tf.zeros([D, 10]), scale=tf.ones([D, 10]), name="W_0") - W_1 = Normal(loc=tf.zeros([10, 10]), scale=tf.ones([10, 10]), name="W_1") - W_2 = Normal(loc=tf.zeros([10, 1]), scale=tf.ones([10, 1]), name="W_2") - b_0 = Normal(loc=tf.zeros(10), scale=tf.ones(10), name="b_0") - b_1 = Normal(loc=tf.zeros(10), scale=tf.ones(10), name="b_1") - b_2 = Normal(loc=tf.zeros(1), scale=tf.ones(1), name="b_2") - - X = tf.placeholder(tf.float32, [N, D], name="X") - y = Normal(loc=neural_network(X), scale=0.1 * tf.ones(N), name="y") - -# INFERENCE -with tf.name_scope("posterior"): - with tf.name_scope("qW_0"): - qW_0 = Normal(loc=tf.Variable(tf.random_normal([D, 10]), name="loc"), - scale=tf.nn.softplus( - tf.Variable(tf.random_normal([D, 10]), name="scale"))) - with tf.name_scope("qW_1"): - qW_1 = Normal(loc=tf.Variable(tf.random_normal([10, 10]), name="loc"), - scale=tf.nn.softplus( - tf.Variable(tf.random_normal([10, 10]), name="scale"))) - with tf.name_scope("qW_2"): - qW_2 = Normal(loc=tf.Variable(tf.random_normal([10, 1]), name="loc"), - scale=tf.nn.softplus( - tf.Variable(tf.random_normal([10, 1]), name="scale"))) - with tf.name_scope("qb_0"): - qb_0 = Normal(loc=tf.Variable(tf.random_normal([10]), name="loc"), - scale=tf.nn.softplus( - tf.Variable(tf.random_normal([10]), name="scale"))) - with tf.name_scope("qb_1"): - qb_1 = Normal(loc=tf.Variable(tf.random_normal([10]), name="loc"), - scale=tf.nn.softplus( 
- tf.Variable(tf.random_normal([10]), name="scale"))) - with tf.name_scope("qb_2"): - qb_2 = Normal(loc=tf.Variable(tf.random_normal([1]), name="loc"), - scale=tf.nn.softplus( - tf.Variable(tf.random_normal([1]), name="scale"))) - -inference = ed.KLqp({W_0: qW_0, b_0: qb_0, - W_1: qW_1, b_1: qb_1, - W_2: qW_2, b_2: qb_2}, data={X: X_train, y: y_train}) -inference.run(logdir='log') +def main(_): + def neural_network(X): + h = tf.tanh(tf.matmul(X, W_0) + b_0) + h = tf.tanh(tf.matmul(h, W_1) + b_1) + h = tf.matmul(h, W_2) + b_2 + return tf.reshape(h, [-1]) + ed.set_seed(42) + + # DATA + X_train, y_train = build_toy_dataset(FLAGS.N) + + # MODEL + with tf.name_scope("model"): + W_0 = Normal(loc=tf.zeros([FLAGS.D, 10]), scale=tf.ones([FLAGS.D, 10]), + name="W_0") + W_1 = Normal(loc=tf.zeros([10, 10]), scale=tf.ones([10, 10]), name="W_1") + W_2 = Normal(loc=tf.zeros([10, 1]), scale=tf.ones([10, 1]), name="W_2") + b_0 = Normal(loc=tf.zeros(10), scale=tf.ones(10), name="b_0") + b_1 = Normal(loc=tf.zeros(10), scale=tf.ones(10), name="b_1") + b_2 = Normal(loc=tf.zeros(1), scale=tf.ones(1), name="b_2") + + X = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.D], name="X") + y = Normal(loc=neural_network(X), scale=0.1 * tf.ones(FLAGS.N), name="y") + + # INFERENCE + with tf.variable_scope("posterior"): + with tf.variable_scope("qW_0"): + loc = tf.get_variable("loc", [FLAGS.D, 10]) + scale = tf.nn.softplus(tf.get_variable("scale", [FLAGS.D, 10])) + qW_0 = Normal(loc=loc, scale=scale) + with tf.variable_scope("qW_1"): + loc = tf.get_variable("loc", [10, 10]) + scale = tf.nn.softplus(tf.get_variable("scale", [10, 10])) + qW_1 = Normal(loc=loc, scale=scale) + with tf.variable_scope("qW_2"): + loc = tf.get_variable("loc", [10, 1]) + scale = tf.nn.softplus(tf.get_variable("scale", [10, 1])) + qW_2 = Normal(loc=loc, scale=scale) + with tf.variable_scope("qb_0"): + loc = tf.get_variable("loc", [10]) + scale = tf.nn.softplus(tf.get_variable("scale", [10])) + qb_0 = Normal(loc=loc, scale=scale) + with tf.variable_scope("qb_1"): + loc = tf.get_variable("loc", [10]) + scale = tf.nn.softplus(tf.get_variable("scale", [10])) + qb_1 = Normal(loc=loc, scale=scale) + with tf.variable_scope("qb_2"): + loc = tf.get_variable("loc", [1]) + scale = tf.nn.softplus(tf.get_variable("scale", [1])) + qb_2 = Normal(loc=loc, scale=scale) + + inference = ed.KLqp({W_0: qW_0, b_0: qb_0, + W_1: qW_1, b_1: qb_1, + W_2: qW_2, b_2: qb_2}, data={X: X_train, y: y_train}) + inference.run(logdir='log') + +if __name__ == "__main__": + tf.app.run() diff --git a/examples/beta_bernoulli.py b/examples/beta_bernoulli.py index a21f62740..c3a674091 100644 --- a/examples/beta_bernoulli.py +++ b/examples/beta_bernoulli.py @@ -11,36 +11,42 @@ from edward.models import Bernoulli, Beta, Empirical -ed.set_seed(42) - -# DATA -x_data = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1]) - -# MODEL -p = Beta(1.0, 1.0) -x = Bernoulli(probs=p, sample_shape=10) - -# INFERENCE -qp = Empirical(params=tf.Variable(tf.zeros([1000]) + 0.5)) - -proposal_p = Beta(3.0, 9.0) - -inference = ed.MetropolisHastings({p: qp}, {p: proposal_p}, data={x: x_data}) -inference.run() - -# CRITICISM -# exact posterior has mean 0.25 and std 0.12 -sess = ed.get_session() -mean, stddev = sess.run([qp.mean(), qp.stddev()]) -print("Inferred posterior mean:") -print(mean) -print("Inferred posterior stddev:") -print(stddev) - -x_post = ed.copy(x, {p: qp}) -tx_rep, tx = ed.ppc( - lambda xs, zs: tf.reduce_mean(tf.cast(xs[x_post], tf.float32)), - data={x_post: x_data}) -ed.ppc_stat_hist_plot( - tx[0], tx_rep, 
stat_name=r'$T \equiv$mean', bins=10) -plt.show() + +def main(_): + ed.set_seed(42) + + # DATA + x_data = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1]) + + # MODEL + p = Beta(1.0, 1.0) + x = Bernoulli(probs=p, sample_shape=10) + + # INFERENCE + qp = Empirical(params=tf.get_variable( + "qp/params", [1000], initializer=tf.constant_initializer(0.5))) + + proposal_p = Beta(3.0, 9.0) + + inference = ed.MetropolisHastings({p: qp}, {p: proposal_p}, data={x: x_data}) + inference.run() + + # CRITICISM + # exact posterior has mean 0.25 and std 0.12 + sess = ed.get_session() + mean, stddev = sess.run([qp.mean(), qp.stddev()]) + print("Inferred posterior mean:") + print(mean) + print("Inferred posterior stddev:") + print(stddev) + + x_post = ed.copy(x, {p: qp}) + tx_rep, tx = ed.ppc( + lambda xs, zs: tf.reduce_mean(tf.cast(xs[x_post], tf.float32)), + data={x_post: x_data}) + ed.ppc_stat_hist_plot( + tx[0], tx_rep, stat_name=r'$T \equiv$mean', bins=10) + plt.show() + +if __name__ == "__main__": + tf.app.run() diff --git a/examples/beta_bernoulli_conjugate.py b/examples/beta_bernoulli_conjugate.py index dbff3010a..9dc586766 100644 --- a/examples/beta_bernoulli_conjugate.py +++ b/examples/beta_bernoulli_conjugate.py @@ -13,25 +13,29 @@ from edward.models import Bernoulli, Beta -ed.set_seed(42) -# DATA -x_data = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1]) +def main(_): + ed.set_seed(42) -# MODEL -p = Beta(1.0, 1.0) -x = Bernoulli(probs=p, sample_shape=10) + # DATA + x_data = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1]) -# COMPLETE CONDITIONAL -p_cond = ed.complete_conditional(p) + # MODEL + p = Beta(1.0, 1.0) + x = Bernoulli(probs=p, sample_shape=10) -sess = ed.get_session() -tf.global_variables_initializer().run() + # COMPLETE CONDITIONAL + p_cond = ed.complete_conditional(p) -print('p(probs | x) type:', p_cond.parameters['name']) -param_vals = sess.run({key: val for - key, val in six.iteritems(p_cond.parameters) - if isinstance(val, tf.Tensor)}, {x: x_data}) -print('parameters:') -for key, val in six.iteritems(param_vals): - print('%s:\t%.3f' % (key, val)) + sess = ed.get_session() + + print('p(probs | x) type:', p_cond.parameters['name']) + param_vals = sess.run({key: val for + key, val in six.iteritems(p_cond.parameters) + if isinstance(val, tf.Tensor)}, {x: x_data}) + print('parameters:') + for key, val in six.iteritems(param_vals): + print('%s:\t%.3f' % (key, val)) + +if __name__ == "__main__": + tf.app.run() diff --git a/examples/bigan.py b/examples/bigan.py index c857bfa9e..373c1fd70 100644 --- a/examples/bigan.py +++ b/examples/bigan.py @@ -14,9 +14,20 @@ import tensorflow as tf from observations import mnist -from tensorflow.contrib import slim -leak = 0.2 # leak parameter for leakyReLU +tf.flags.DEFINE_string("data_dir", default="tmp/data", help="") +tf.flags.DEFINE_string("out_dir", default="tmp/out", help="") +tf.flags.DEFINE_integer("M", default=100, help="Batch size during training.") +tf.flags.DEFINE_integer("d", default=50, help="Latent dimension.") +tf.flags.DEFINE_float("leak", default=0.2, + help="Leak parameter for leakyReLU.") +tf.flags.DEFINE_integer("hidden_units", default=300, help="") +tf.flags.DEFINE_float("encoder_variance", default=0.01, + help="Set to 0 for deterministic encoder.") + +FLAGS = tf.flags.FLAGS +if not os.path.exists(FLAGS.out_dir): + os.makedirs(FLAGS.out_dir) def generator(array, batch_size): @@ -36,27 +47,28 @@ def generator(array, batch_size): yield batch -def leakyrelu(x, alpha=leak): +def leakyrelu(x, alpha=FLAGS.leak): return tf.maximum(x, alpha * x) def gen_latent(x, 
hidden_units): - h = slim.fully_connected(x, hidden_units, activation_fn=leakyrelu) - z = slim.fully_connected(h, d, activation_fn=None) - return z + np.sqrt(encoder_variance) * np.random.normal(0.0, 1.0, np.shape(z)) + net = tf.layers.dense(x, hidden_units, activation=leakyrelu) + net = tf.layers.dense(net, FLAGS.d, activation=None) + return (net + np.sqrt(FLAGS.encoder_variance) * + np.random.normal(0.0, 1.0, np.shape(net))) def gen_data(z, hidden_units): - h = slim.fully_connected(z, hidden_units, activation_fn=leakyrelu) - x = slim.fully_connected(h, 784, activation_fn=tf.sigmoid) - return x + net = tf.layers.dense(z, hidden_units, activation=leakyrelu) + net = tf.layers.dense(net, 784, activation=tf.sigmoid) + return net def discriminative_network(x, y): # Discriminator must output probability in logits - inputs = tf.concat([x, y], 1) - h1 = slim.fully_connected(inputs, hidden_units, activation_fn=leakyrelu) - logit = slim.fully_connected(h1, 1, activation_fn=None) + net = tf.concat([x, y], 1) + net = tf.layers.dense(net, FLAGS.hidden_units, activation=leakyrelu) + net = tf.layers.dense(net, 1, activation=None) - return logit + return net @@ -77,71 +89,66 @@ def plot(samples): return fig -ed.set_seed(42) - -data_dir = "/tmp/data" -out_dir = "/tmp/out" -if not os.path.exists(out_dir): - os.makedirs(out_dir) -M = 100 # batch size during training -d = 50 # latent dimension -hidden_units = 300 -encoder_variance = 0.01 # Set to 0 for deterministic encoder - -# DATA. MNIST batches are fed at training time. -(x_train, _), (x_test, _) = mnist(data_dir) -x_train_generator = generator(x_train, M) -x_ph = tf.placeholder(tf.float32, [M, 784]) -z_ph = tf.placeholder(tf.float32, [M, d]) - -# MODEL -with tf.variable_scope("Gen"): - xf = gen_data(z_ph, hidden_units) - zf = gen_latent(x_ph, hidden_units) - -# INFERENCE: -optimizer = tf.train.AdamOptimizer() -optimizer_d = tf.train.AdamOptimizer() -inference = ed.BiGANInference( - latent_vars={zf: z_ph}, data={xf: x_ph}, - discriminator=discriminative_network) - -inference.initialize( - optimizer=optimizer, optimizer_d=optimizer_d, n_iter=100000, n_print=3000) - -sess = ed.get_session() -init_op = tf.global_variables_initializer() -sess.run(init_op) - -idx = np.random.randint(M, size=16) -i = 0 -for t in range(inference.n_iter): - if t % inference.n_print == 1: - - samples = sess.run(xf, feed_dict={z_ph: z_batch}) - samples = samples[idx, ] - fig = plot(samples) - plt.savefig(os.path.join(out_dir, '{}{}.png').format( - 'Generated', str(i).zfill(3)), bbox_inches='tight') - plt.close(fig) - - fig = plot(x_batch[idx, ]) - plt.savefig(os.path.join(out_dir, '{}{}.png').format( - 'Base', str(i).zfill(3)), bbox_inches='tight') - plt.close(fig) - - zsam = sess.run(zf, feed_dict={x_ph: x_batch}) - reconstructions = sess.run(xf, feed_dict={z_ph: zsam}) - reconstructions = reconstructions[idx, ] - fig = plot(reconstructions) - plt.savefig(os.path.join(out_dir, '{}{}.png').format( - 'Reconstruct', str(i).zfill(3)), bbox_inches='tight') - plt.close(fig) - - i += 1 - - x_batch = next(x_train_generator) - z_batch = np.random.normal(0, 1, [M, d]) - - info_dict = inference.update(feed_dict={x_ph: x_batch, z_ph: z_batch}) - inference.print_progress(info_dict) +def main(_): + ed.set_seed(42) + + # DATA. MNIST batches are fed at training time. 
+ (x_train, _), (x_test, _) = mnist(FLAGS.data_dir) + x_train_generator = generator(x_train, FLAGS.M) + x_ph = tf.placeholder(tf.float32, [FLAGS.M, 784]) + z_ph = tf.placeholder(tf.float32, [FLAGS.M, FLAGS.d]) + + # MODEL + with tf.variable_scope("Gen"): + xf = gen_data(z_ph, FLAGS.hidden_units) + zf = gen_latent(x_ph, FLAGS.hidden_units) + + # INFERENCE: + optimizer = tf.train.AdamOptimizer() + optimizer_d = tf.train.AdamOptimizer() + inference = ed.BiGANInference( + latent_vars={zf: z_ph}, data={xf: x_ph}, + discriminator=discriminative_network) + + inference.initialize( + optimizer=optimizer, optimizer_d=optimizer_d, n_iter=100000, n_print=3000) + + sess = ed.get_session() + init_op = tf.global_variables_initializer() + sess.run(init_op) + + idx = np.random.randint(FLAGS.M, size=16) + i = 0 + for t in range(inference.n_iter): + if t % inference.n_print == 1: + + samples = sess.run(xf, feed_dict={z_ph: z_batch}) + samples = samples[idx, ] + fig = plot(samples) + plt.savefig(os.path.join(FLAGS.out_dir, '{}{}.png').format( + 'Generated', str(i).zfill(3)), bbox_inches='tight') + plt.close(fig) + + fig = plot(x_batch[idx, ]) + plt.savefig(os.path.join(FLAGS.out_dir, '{}{}.png').format( + 'Base', str(i).zfill(3)), bbox_inches='tight') + plt.close(fig) + + zsam = sess.run(zf, feed_dict={x_ph: x_batch}) + reconstructions = sess.run(xf, feed_dict={z_ph: zsam}) + reconstructions = reconstructions[idx, ] + fig = plot(reconstructions) + plt.savefig(os.path.join(FLAGS.out_dir, '{}{}.png').format( + 'Reconstruct', str(i).zfill(3)), bbox_inches='tight') + plt.close(fig) + + i += 1 + + x_batch = next(x_train_generator) + z_batch = np.random.normal(0, 1, [FLAGS.M, FLAGS.d]) + + info_dict = inference.update(feed_dict={x_ph: x_batch, z_ph: z_batch}) + inference.print_progress(info_dict) + +if __name__ == "__main__": + tf.app.run() diff --git a/examples/cox_process.py b/examples/cox_process.py index f1ddb263a..1a6f50dbb 100644 --- a/examples/cox_process.py +++ b/examples/cox_process.py @@ -28,6 +28,11 @@ from edward.util import rbf from scipy.stats import multivariate_normal, poisson +tf.flags.DEFINE_integer("N", default=308, help="Number of NBA players.") +tf.flags.DEFINE_integer("V", default=2, help="Number of shot locations.") + +FLAGS = tf.flags.FLAGS + def build_toy_dataset(N, V): """A simulator mimicking the data set from 2015-2016 NBA season with @@ -42,26 +47,30 @@ def build_toy_dataset(N, V): return x -ed.set_seed(42) -N = 308 # number of NBA players -V = 2 # number of shot locations +def main(_): + ed.set_seed(42) + + # DATA + x_data = build_toy_dataset(FLAGS.N, FLAGS.V) -# DATA -x_data = build_toy_dataset(N, V) + # MODEL + x_ph = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.V]) -# MODEL -x_ph = tf.placeholder(tf.float32, [N, V]) # inputs to Gaussian Process + # Form (N, V, V) covariance, one matrix per data point. + K = tf.stack([rbf(tf.reshape(xn, [FLAGS.V, 1])) + tf.diag([1e-6, 1e-6]) + for xn in tf.unstack(x_ph)]) + f = MultivariateNormalTriL(loc=tf.zeros([FLAGS.N, FLAGS.V]), + scale_tril=tf.cholesky(K)) + x = Poisson(rate=tf.exp(f)) -# Form (N, V, V) covariance, one matrix per data point. 
-K = tf.stack([rbf(tf.reshape(xn, [V, 1])) + tf.diag([1e-6, 1e-6]) - for xn in tf.unstack(x_ph)]) -f = MultivariateNormalTriL(loc=tf.zeros([N, V]), scale_tril=tf.cholesky(K)) -x = Poisson(rate=tf.exp(f)) + # INFERENCE + qf = Normal( + loc=tf.get_variable("qf/loc", [FLAGS.N, FLAGS.V]), + scale=tf.nn.softplus(tf.get_variable("qf/scale", [FLAGS.N, FLAGS.V]))) -# INFERENCE -qf = Normal(loc=tf.Variable(tf.random_normal([N, V])), - scale=tf.nn.softplus(tf.Variable(tf.random_normal([N, V])))) + inference = ed.KLqp({f: qf}, data={x: x_data, x_ph: x_data}) + inference.run(n_iter=5000) -inference = ed.KLqp({f: qf}, data={x: x_data, x_ph: x_data}) -inference.run(n_iter=5000) +if __name__ == "__main__": + tf.app.run() diff --git a/examples/deep_exponential_family.py b/examples/deep_exponential_family.py index 330c9225f..513f36a44 100644 --- a/examples/deep_exponential_family.py +++ b/examples/deep_exponential_family.py @@ -77,141 +77,156 @@ class objects visual from edward.util import Progbar from observations import nips -ed.set_seed(42) - -data_dir = "~/data" -logdir = '~/log/def/' -data_dir = os.path.expanduser(data_dir) -logdir = os.path.expanduser(logdir) - -# DATA -x_train, metadata = nips(data_dir) -documents = metadata['columns'] -words = metadata['rows'] - -# Subset to documents in 2011 and words appearing in at least two -# documents and have a total word count of at least 10. -doc_idx = [i for i, document in enumerate(documents) - if document.startswith('2011')] -documents = [documents[doc] for doc in doc_idx] -x_train = x_train[:, doc_idx] -word_idx = np.logical_and(np.sum(x_train != 0, 1) >= 2, - np.sum(x_train, 1) >= 10) -words = [word for word, idx in zip(words, word_idx) if idx] -x_train = x_train[word_idx, :] -x_train = x_train.T - -N = x_train.shape[0] # number of documents -D = x_train.shape[1] # vocabulary size -K = [100, 30, 15] # number of components per layer -q = 'lognormal' # choice of q; 'lognormal' or 'gamma' -shape = 0.1 # gamma shape parameter -lr = 1e-4 # learning rate step-size - -# MODEL -W2 = Gamma(0.1, 0.3, sample_shape=[K[2], K[1]]) -W1 = Gamma(0.1, 0.3, sample_shape=[K[1], K[0]]) -W0 = Gamma(0.1, 0.3, sample_shape=[K[0], D]) - -z3 = Gamma(0.1, 0.1, sample_shape=[N, K[2]]) -z2 = Gamma(shape, shape / tf.matmul(z3, W2)) -z1 = Gamma(shape, shape / tf.matmul(z2, W1)) -x = Poisson(tf.matmul(z1, W0)) - - -# INFERENCE -def pointmass_q(shape): - min_mean = 1e-3 - mean_init = tf.random_normal(shape) - rv = PointMass(tf.maximum(tf.nn.softplus(tf.Variable(mean_init)), min_mean)) - return rv - - -def gamma_q(shape): - # Parameterize Gamma q's via shape and scale, with softplus unconstraints. 
- min_shape = 1e-3 - min_scale = 1e-5 - shape_init = 0.5 + 0.1 * tf.random_normal(shape) - scale_init = 0.1 * tf.random_normal(shape) - rv = Gamma(tf.maximum(tf.nn.softplus(tf.Variable(shape_init)), - min_shape), - tf.maximum(1.0 / tf.nn.softplus(tf.Variable(scale_init)), - 1.0 / min_scale)) - return rv - - -def lognormal_q(shape): - min_scale = 1e-5 - loc_init = tf.random_normal(shape) - scale_init = 0.1 * tf.random_normal(shape) - rv = TransformedDistribution( - distribution=Normal( - tf.Variable(loc_init), - tf.maximum(tf.nn.softplus(tf.Variable(scale_init)), min_scale)), - bijector=tf.contrib.distributions.bijectors.Exp()) - return rv - - -qW2 = pointmass_q(W2.shape) -qW1 = pointmass_q(W1.shape) -qW0 = pointmass_q(W0.shape) -if q == 'gamma': - qz3 = gamma_q(z3.shape) - qz2 = gamma_q(z2.shape) - qz1 = gamma_q(z1.shape) -else: - qz3 = lognormal_q(z3.shape) - qz2 = lognormal_q(z2.shape) - qz1 = lognormal_q(z1.shape) - -# We apply variational EM with E-step over local variables -# and M-step to point estimate the global weight matrices. -inference_e = ed.KLqp({z1: qz1, z2: qz2, z3: qz3}, - data={x: x_train, W0: qW0, W1: qW1, W2: qW2}) -inference_m = ed.MAP({W0: qW0, W1: qW1, W2: qW2}, - data={x: x_train, z1: qz1, z2: qz2, z3: qz3}) - -optimizer_e = tf.train.RMSPropOptimizer(lr) -optimizer_m = tf.train.RMSPropOptimizer(lr) +tf.flags.DEFINE_string("data_dir", default="~/data", help="") +tf.flags.DEFINE_string("logdir", default="~/log/def/", help="") +tf.flags.DEFINE_list("K", default=[100, 30, 15], + help="Number of components per layer.") +tf.flags.DEFINE_string("q", default="lognormal", + help="Choice of q; 'lognormal' or 'gamma'.") +tf.flags.DEFINE_float("shape", default=0.1, help="Gamma shape parameter.") +tf.flags.DEFINE_float("lr", default=1e-4, help="Learning rate step-size.") + +FLAGS = tf.flags.FLAGS +FLAGS.data_dir = os.path.expanduser(FLAGS.data_dir) +FLAGS.logdir = os.path.expanduser(FLAGS.logdir) timestamp = datetime.strftime(datetime.utcnow(), "%Y%m%d_%H%M%S") -logdir += timestamp + '_' + '_'.join([str(ks) for ks in K]) + \ - '_q_' + str(q) + '_lr_' + str(lr) -kwargs = {'optimizer': optimizer_e, - 'n_print': 100, - 'logdir': logdir, - 'log_timestamp': False} -if q == 'gamma': - kwargs['n_samples'] = 30 -inference_e.initialize(**kwargs) -inference_m.initialize(optimizer=optimizer_m) - -sess = ed.get_session() -tf.global_variables_initializer().run() - -n_epoch = 20 -n_iter_per_epoch = 10000 -for epoch in range(n_epoch): - print("Epoch {}".format(epoch)) - nll = 0.0 - - pbar = Progbar(n_iter_per_epoch) - for t in range(1, n_iter_per_epoch + 1): - pbar.update(t) - info_dict_e = inference_e.update() - info_dict_m = inference_m.update() - nll += info_dict_e['loss'] - - # Compute perplexity averaged over a number of training iterations. - # The model's negative log-likelihood of data is upper bounded by - # the variational objective. - nll = nll / n_iter_per_epoch - perplexity = np.exp(nll / np.sum(x_train)) - print("Negative log-likelihood <= {:0.3f}".format(nll)) - print("Perplexity <= {:0.3f}".format(perplexity)) - - # Print top 10 words for first 10 topics. 
- qW0_vals = sess.run(qW0) - for k in range(10): - top_words_idx = qW0_vals[k, :].argsort()[-10:][::-1] - top_words = " ".join([words[i] for i in top_words_idx]) - print("Topic {}: {}".format(k, top_words)) +FLAGS.logdir += timestamp + '_' + '_'.join([str(ks) for ks in FLAGS.K]) + \ + '_q_' + str(FLAGS.q) + '_lr_' + str(FLAGS.lr) + + +def pointmass_q(shape, name=None): + with tf.variable_scope(name, default_name="pointmass_q"): + min_mean = 1e-3 + mean = tf.get_variable("mean", shape) + rv = PointMass(tf.maximum(tf.nn.softplus(mean), min_mean)) + return rv + + +def gamma_q(shape, name=None): + # Parameterize Gamma q's via shape and scale, with softplus unconstraints. + with tf.variable_scope(name, default_name="gamma_q"): + min_shape = 1e-3 + min_scale = 1e-5 + shape_init = 0.5 + 0.1 * tf.random_normal(shape) + scale_init = 0.1 * tf.random_normal(shape) + shape = tf.get_variable("shape", shape, + initializer=tf.constant_initializer(shape_init)) + scale = tf.get_variable("scale", shape, + initializer=tf.constant_initializer(shape_init)) + rv = Gamma(tf.maximum(tf.nn.softplus(shape), min_shape), + tf.maximum(1.0 / tf.nn.softplus(scale), 1.0 / min_scale)) + return rv + + +def lognormal_q(shape, name=None): + with tf.variable_scope(name, default_name="lognormal_q"): + min_scale = 1e-5 + loc_init = tf.random_normal(shape) + scale_init = 0.1 * tf.random_normal(shape) + loc = tf.get_variable("loc", shape, + initializer=tf.constant_initializer(loc_init)) + scale = tf.get_variable("scale", shape, + initializer=tf.constant_initializer(scale_init)) + rv = TransformedDistribution( + distribution=Normal(loc, tf.maximum(tf.nn.softplus(scale), min_scale)), + bijector=tf.contrib.distributions.bijectors.Exp()) + return rv + + +def main(_): + ed.set_seed(42) + + # DATA + x_train, metadata = nips(FLAGS.data_dir) + documents = metadata['columns'] + words = metadata['rows'] + + # Subset to documents in 2011 and words appearing in at least two + # documents and have a total word count of at least 10. + doc_idx = [i for i, document in enumerate(documents) + if document.startswith('2011')] + documents = [documents[doc] for doc in doc_idx] + x_train = x_train[:, doc_idx] + word_idx = np.logical_and(np.sum(x_train != 0, 1) >= 2, + np.sum(x_train, 1) >= 10) + words = [word for word, idx in zip(words, word_idx) if idx] + x_train = x_train[word_idx, :] + x_train = x_train.T + + N = x_train.shape[0] # number of documents + D = x_train.shape[1] # vocabulary size + + # MODEL + W2 = Gamma(0.1, 0.3, sample_shape=[FLAGS.K[2], FLAGS.K[1]]) + W1 = Gamma(0.1, 0.3, sample_shape=[FLAGS.K[1], FLAGS.K[0]]) + W0 = Gamma(0.1, 0.3, sample_shape=[FLAGS.K[0], D]) + + z3 = Gamma(0.1, 0.1, sample_shape=[N, FLAGS.K[2]]) + z2 = Gamma(FLAGS.shape, FLAGS.shape / tf.matmul(z3, W2)) + z1 = Gamma(FLAGS.shape, FLAGS.shape / tf.matmul(z2, W1)) + x = Poisson(tf.matmul(z1, W0)) + + # INFERENCE + qW2 = pointmass_q(W2.shape) + qW1 = pointmass_q(W1.shape) + qW0 = pointmass_q(W0.shape) + if FLAGS.q == 'gamma': + qz3 = gamma_q(z3.shape) + qz2 = gamma_q(z2.shape) + qz1 = gamma_q(z1.shape) + else: + qz3 = lognormal_q(z3.shape) + qz2 = lognormal_q(z2.shape) + qz1 = lognormal_q(z1.shape) + + # We apply variational EM with E-step over local variables + # and M-step to point estimate the global weight matrices. 
+ inference_e = ed.KLqp({z1: qz1, z2: qz2, z3: qz3}, + data={x: x_train, W0: qW0, W1: qW1, W2: qW2}) + inference_m = ed.MAP({W0: qW0, W1: qW1, W2: qW2}, + data={x: x_train, z1: qz1, z2: qz2, z3: qz3}) + + optimizer_e = tf.train.RMSPropOptimizer(FLAGS.lr) + optimizer_m = tf.train.RMSPropOptimizer(FLAGS.lr) + kwargs = {'optimizer': optimizer_e, + 'n_print': 100, + 'logdir': FLAGS.logdir, + 'log_timestamp': False} + if FLAGS.q == 'gamma': + kwargs['n_samples'] = 30 + inference_e.initialize(**kwargs) + inference_m.initialize(optimizer=optimizer_m) + + sess = ed.get_session() + tf.global_variables_initializer().run() + + n_epoch = 20 + n_iter_per_epoch = 10000 + for epoch in range(n_epoch): + print("Epoch {}".format(epoch)) + nll = 0.0 + + pbar = Progbar(n_iter_per_epoch) + for t in range(1, n_iter_per_epoch + 1): + pbar.update(t) + info_dict_e = inference_e.update() + info_dict_m = inference_m.update() + nll += info_dict_e['loss'] + + # Compute perplexity averaged over a number of training iterations. + # The model's negative log-likelihood of data is upper bounded by + # the variational objective. + nll = nll / n_iter_per_epoch + perplexity = np.exp(nll / np.sum(x_train)) + print("Negative log-likelihood <= {:0.3f}".format(nll)) + print("Perplexity <= {:0.3f}".format(perplexity)) + + # Print top 10 words for first 10 topics. + qW0_vals = sess.run(qW0) + for k in range(10): + top_words_idx = qW0_vals[k, :].argsort()[-10:][::-1] + top_words = " ".join([words[i] for i in top_words_idx]) + print("Topic {}: {}".format(k, top_words)) + +if __name__ == "__main__": + tf.app.run() diff --git a/examples/dirichlet_categorical.py b/examples/dirichlet_categorical.py index ea944368f..05a5645dd 100644 --- a/examples/dirichlet_categorical.py +++ b/examples/dirichlet_categorical.py @@ -12,23 +12,32 @@ from edward.models import Categorical, Dirichlet -N = 1000 -K = 4 +tf.flags.DEFINE_integer("N", default=1000, help="") +tf.flags.DEFINE_integer("K", default=4, help="") -# DATA -pi_true = np.random.dirichlet(np.array([20.0, 30.0, 10.0, 10.0])) -z_data = np.array([np.random.choice(K, 1, p=pi_true)[0] for n in range(N)]) -print('pi={}'.format(pi_true)) +FLAGS = tf.flags.FLAGS -# MODEL -pi = Dirichlet(tf.ones(4)) -z = Categorical(probs=tf.ones([N, 1]) * pi) -# INFERENCE -qpi = Dirichlet(tf.nn.softplus(tf.Variable(tf.random_normal([K])))) +def main(_): + # DATA + pi_true = np.random.dirichlet(np.array([20.0, 30.0, 10.0, 10.0])) + z_data = np.array([np.random.choice(FLAGS.K, 1, p=pi_true)[0] + for n in range(FLAGS.N)]) + print('pi={}'.format(pi_true)) -inference = ed.KLqp({pi: qpi}, data={z: z_data}) -inference.run(n_iter=1500, n_samples=30) + # MODEL + pi = Dirichlet(tf.ones(4)) + z = Categorical(probs=tf.ones([FLAGS.N, 1]) * pi) -sess = ed.get_session() -print('Inferred pi={}'.format(sess.run(qpi.mean()))) + # INFERENCE + qpi = Dirichlet(tf.nn.softplus( + tf.get_variable("qpi/concentration", [FLAGS.K]))) + + inference = ed.KLqp({pi: qpi}, data={z: z_data}) + inference.run(n_iter=1500, n_samples=30) + + sess = ed.get_session() + print('Inferred pi={}'.format(sess.run(qpi.mean()))) + +if __name__ == "__main__": + tf.app.run()
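The recurring change in PATCH 3/3 is to move each example's module-level script code into the tf.flags / main(_) / tf.app.run() skeleton. A minimal sketch of that skeleton follows; the flag name is illustrative rather than taken from any one example.

import edward as ed
import tensorflow as tf

tf.flags.DEFINE_integer("N", default=40, help="Number of data points.")

FLAGS = tf.flags.FLAGS


def main(_):
  # tf.app.run() parses the flags and then calls main(_).
  ed.set_seed(42)
  print("Running with N = {}".format(FLAGS.N))

if __name__ == "__main__":
  tf.app.run()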