# Good examples and tips in Edward

Some code that works and is useful, that I didn't want to throw away.

In [21]:
import numpy as np
import tensorflow as tf
import edward as ed
from pprint import pprint

In [5]:
# code from edward/examples/dirichlet_categorical.py:
# Inferring a categorical distribution with KLqp
tf.reset_default_graph()
sess = tf.InteractiveSession()
K = 4
N = 30
# DATA
pi_true = np.random.dirichlet(np.array([20.0, 30.0, 10.0, 10.0]))
z_data = np.array([np.random.choice(K, 1, p=pi_true)[0]
                   for n in range(N)])
print("pi: {}".format(pi_true))

# MODEL
pi = ed.models.Dirichlet(tf.ones(4))
z = ed.models.Categorical(probs=pi, sample_shape=N)

# INFERENCE
qpi = ed.models.Dirichlet(tf.nn.softplus(
    tf.get_variable("qpi/concentration", [K])))

inference = ed.KLqp({pi: qpi}, data={z: z_data})
inference.run(n_iter=1500, n_samples=30)

# sess = ed.get_session()
print("Inferred pi: {}".format(sess.run(qpi.mean())))

pi: [0.24454049 0.46987182 0.13792581 0.14766188]


  not np.issubdtype(value.dtype, np.float) and \
  not np.issubdtype(value.dtype, np.int) and \


1500/1500 [100%] ██████████████████████████████ Elapsed: 6s | Loss: 43.773
Inferred pi: [0.29658943 0.33174667 0.22883385 0.14283012]


In [6]:
# example from examples/bayesian_linear_regression_sghmc.py
tf.reset_default_graph()
sess = tf.InteractiveSession()
def build_toy_dataset(N, noise_std=0.5):
    X = np.concatenate([np.linspace(0, 2, num=N / 2),
                        np.linspace(6, 8, num=N / 2)])
    y = 2.0 * X + 10 * np.random.normal(0, noise_std, size=N)
    X = X.astype(np.float32).reshape((N, 1))
    y = y.astype(np.float32)
    return X, y

N = 40  # number of data points
D = 1  # number of features
# DATA
X_train, y_train = build_toy_dataset(N)
X_test, y_test = build_toy_dataset(N)

# MODEL
X = tf.placeholder(tf.float32, [N, D])
w = ed.models.Normal(loc=tf.zeros(D), scale=tf.ones(D))
b = ed.models.Normal(loc=tf.zeros(1), scale=tf.ones(1))
y = ed.models.Normal(loc=ed.dot(X, w) + b, scale=tf.ones(N))

# INFERENCE
T = 5000                        # Number of samples.
nburn = 100                     # Number of burn-in samples.
stride = 10                     # Frequency with which to plot samples.
qw = ed.models.Empirical(params=tf.Variable(tf.random_normal([T, D])))
qb = ed.models.Empirical(params=tf.Variable(tf.random_normal([T, 1])))

inference = ed.SGHMC({w: qw, b: qb}, data={X: X_train, y: y_train})
inference.run(step_size=1e-3)

  """
  


5000/5000 [100%] ██████████████████████████████ Elapsed: 8s | Acceptance Rate: 1.000


### Building the Markov Chain:

**HMM, code from github issue, working:**

In [40]:
# from issue https://github.com/blei-lab/edward/issues/450
from edward.models import Categorical, Dirichlet, Uniform, Mixture

chain_len = 30
n_hidden = 3
n_obs = 3

x_0 = Categorical(probs=tf.fill([n_hidden], 1.0 / n_hidden))

# transition matrix
T = tf.nn.softmax(tf.Variable(tf.random_uniform([n_hidden, n_hidden])), dim=0)

# emission matrix
E = tf.nn.softmax(tf.Variable(tf.random_uniform([n_hidden, n_obs])), dim=0)

# MODEL
x = []
y = []
for _ in range(chain_len):
    x_tm1 = x[-1] if x else x_0
    x_t = Categorical(probs=T[x_tm1, :])
    y_t = Categorical(probs=E[x_t, :])
    x.append(x_t)
    y.append(y_t)

In [39]:
# INFERENCE
qx = [Categorical(probs=tf.nn.softmax(tf.Variable(tf.ones(n_hidden))))
      for _ in range(chain_len)]

y_data = ([0] * 10) + ([1] * 10) + ([2] * 10)
y_data = map(np.array, y_data)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(T))

    inference = ed.KLqp(dict(zip(x, qx)), dict(zip(y, y_data)))
    inference.run(n_iter=20000)

    print(sess.run(T))
    print(sess.run(E))
    print(sess.run([foo.probs for foo in qx]))
    print(sess.run([foo.probs for foo in y]))

[[0.3364298  0.33027482 0.32518175]
 [0.31573606 0.33219975 0.2646786 ]
 [0.34783414 0.33752543 0.41013962]]
20000/20000 [100%] ██████████████████████████████ Elapsed: 110s | Loss: 10.638
[[0.88519555 0.10100932 0.04295716]
 [0.00688317 0.8933158  0.03535395]
 [0.10792119 0.00567483 0.9216889 ]]
[[0.00692124 0.9866714  0.00451739]
 [0.00276657 0.00459809 0.993371  ]
 [0.9903122  0.00873054 0.00211151]]
[array([0.00655151, 0.00263566, 0.99081284], dtype=float32), array([0.00522245, 0.00172166, 0.9930559 ], dtype=float32), array([0.00439227, 0.00145458, 0.9941532 ], dtype=float32), array([0.00415663, 0.00165063, 0.9941928 ], dtype=float32), array([0.00509683, 0.00291023, 0.99199295], dtype=float32), array([0.00467363, 0.00116577, 0.9941606 ], dtype=float32), array([0.00453563, 0.00167482, 0.9937895 ], dtype=float32), array([0.00417722, 0.0023592 , 0.99346364], dtype=float32), array([0.00486003, 0.00171187, 0.99342805], dtype=float32), array([0.01455432, 0.00245372, 0.982992  ], dtype=flo

**Note:** this example seems to converge to something better that whay the guy said in the github example.

*Given my 3 hidden states, 3 observation types, and long changes of identical observations, I expect transition matrix to be close to diagonal and the emission matrix to look like a permutation matrix. I see non-converging loss info:*

Iteration     1 [  0%]: Loss = 35.123
Iteration  2000 [ 10%]: Loss = 46.565
Iteration  4000 [ 20%]: Loss = 47.923
Iteration  6000 [ 30%]: Loss = 56.581
Iteration  8000 [ 40%]: Loss = 53.431
Iteration 10000 [ 50%]: Loss = 49.397
Iteration 12000 [ 60%]: Loss = 55.551
Iteration 14000 [ 70%]: Loss = 47.537
Iteration 16000 [ 80%]: Loss = 47.690
Iteration 18000 [ 90%]: Loss = 47.107
Iteration 20000 [100%]: Loss = 38.514

*, non-uniform state probability distributions, and very uniform observation probabilities. Any idea what's going wrong in my setup or the solving of the problem?*

What data did we pass in:

In [23]:
y_data = ([0] * 10) + ([1] * 10) + ([2] * 10)
# for each categorical var y, he associated this matrix:
np.array(y_data)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

Transitions: you almost always stay in the same state. Emission: you almost always go to the same state, but it doesn't have to be the same number.

**Using the external loop like this seems to work, fixing the length of the chains is not a big problem (anyway at some point LC stops the loan anyway so they cannot run indefinitely), we could do that while thinking of how to make it more efficient inside TF.**

**HMM, initial code from github issue, not working:**

In [27]:
chain_len = 30
n_hidden = 3
n_obs = 3
x_0 = Categorical(probs=tf.fill([n_hidden], 1.0 / n_hidden))

# transition matrix
T = Uniform(tf.zeros([n_hidden, n_hidden]), tf.ones([n_hidden, n_hidden]))
T /= tf.reduce_sum(T, axis=0, keep_dims=True)

# emission matrix
E = Uniform(tf.zeros([n_obs, n_hidden]), tf.ones([n_obs, n_hidden]))
E /= tf.reduce_sum(E, axis=0, keep_dims=True)

# model
y_val = tf.placeholder(tf.int32, [chain_len])
x = tf.scan(lambda x_tm1, _: Categorical(probs=T[:, x_tm1]),
            y_val, initializer=x_0)
y = tf.map_fn(lambda x_t: Categorical(probs=E[:, x_t]), x)

In [29]:
# inference
qT = tf.Variable(Uniform(tf.zeros([n_hidden, n_hidden]), tf.ones([n_hidden, n_hidden])))
qT /= tf.reduce_sum(qT, axis=0, keep_dims=True)
qE = tf.Variable(Uniform(tf.zeros([n_obs, n_hidden]), tf.ones([n_obs, n_hidden])))
qE /= tf.reduce_sum(qE, axis=0, keep_dims=True)
px_init = tf.Variable(tf.random_uniform([chain_len, n_hidden]))
px_init /= tf.reduce_sum(px_init, axis=0, keep_dims=True)
qx = Categorical(probs=px_init)

y_data = np.array(([0] * 10) + ([1] * 10) + ([2] * 10), dtype=np.int32)

inference = ed.KLqp({T: qT, E: qE, x: qx}, {y_val: y_data})
try:
    inference.run()
except Exception as e:
    print(e)

'Tensor' object has no attribute 'reparameterization_type'


This seems similar to the github issue (**but not exactly the same error as back then)**. It's because many of these objects are not instances of RandomVariable... If we dig in more into how Edward works we might understand why exactly and if there is anyway to avoid this problem.

**Regular Markov Model, based on the code above (without the hidden states):**

In [34]:
# from issue https://github.com/blei-lab/edward/issues/450
chain_len = 30
n_obs = 3

x_0 = Categorical(probs=tf.fill([n_obs], 1.0 / n_obs))

# transition matrix
T = tf.nn.softmax(tf.Variable(tf.random_uniform([n_obs, n_obs])), dim=0)

# no more emissions, we observe directly the hidden states x

# MODEL
x = []
for _ in range(chain_len):
    x_tm1 = x[-1] if x else x_0
    x_t = Categorical(probs=T[:, x_tm1])
    x.append(x_t)

In [37]:
# INFERENCE
qx = [Categorical(probs=tf.nn.softmax(tf.Variable(tf.ones(n_obs))))
      for _ in range(chain_len)]

x_data = ([0] * 10) + ([1] * 10) + ([2] * 10)
x_data = map(np.array, x_data)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(T))

    inference = ed.KLqp(dict(zip(x, qx)), dict(zip(x, x_data)))
    inference.run(n_iter=20000)

    print(sess.run(T))
    print('qx:')
    pprint(sess.run([foo.probs for foo in qx]))
    print('x:')
    pprint(sess.run([foo.probs for foo in x]))

[[0.33800068 0.49874976 0.4133891 ]
 [0.34657052 0.261219   0.3996655 ]
 [0.31542882 0.24003124 0.1869454 ]]
20000/20000 [100%] ██████████████████████████████ Elapsed: 87s | Loss: 0.073
[[0.22719203 0.22574067 0.22214726]
 [0.25503868 0.25731292 0.26435006]
 [0.51776934 0.51694643 0.51350266]]
qx:
[array([0.21782018, 0.25254372, 0.52963614], dtype=float32),
 array([0.23656154, 0.24698423, 0.5164543 ], dtype=float32),
 array([0.23196688, 0.26875463, 0.49927852], dtype=float32),
 array([0.22545192, 0.268244  , 0.5063041 ], dtype=float32),
 array([0.2388351 , 0.2530424 , 0.50812244], dtype=float32),
 array([0.22195865, 0.27552202, 0.50251937], dtype=float32),
 array([0.22381088, 0.25734565, 0.5188435 ], dtype=float32),
 array([0.23569892, 0.25882402, 0.5054771 ], dtype=float32),
 array([0.2143386 , 0.24603303, 0.5396283 ], dtype=float32),
 array([0.23597272, 0.23802604, 0.5260012 ], dtype=float32),
 array([0.23711358, 0.24408403, 0.51880246], dtype=float32),
 array([0.22456102, 0.25244418