# Solving Bellmann equation on a simple example

![gridworld](images/gridworld.png)

In [1]:
import numpy as np

In [2]:
# We need a matrix to store the transition probabilities. (transition matrix)

T_sas = np.zeros((16, 16, 4)) # we have 16 states, 4 actions

H = np.array([x for x in range(16)])
T_sas[H, H - (H % 4 != 0), 0] = 1 # left
T_sas[H, H - 4*(H-4 >= 0), 1] = 1 # up
T_sas[H, H + ((H+1) % 4 != 0), 2] = 1 # right
T_sas[H, H + 4*(H+4 < 16), 3] = 1 # down
T_sas[0, :, :] = 0 # in the shaded area each action is pointless
T_sas[0, 0, :] = 1
T_sas[15, :, :] = 0
T_sas[15, 15, :] = 1

In [5]:
# reward
r_sas = -np.ones((16, 16, 4))

r_sas[0, 0, :] = 0
r_sas[15, 15, :] = 0

In [6]:
# Bellmann-equation
def bellman_operator(p_sa, v_s, gamma=0.98):
    rT = np.sum(r_sas * T_sas, 1)
    rTpi = np.sum(rT * p_sa, 1)
    
    pi = p_sa.reshape(16, 1, 4).repeat(16, 1)
    vT = np.sum(T_sas * pi, 2)
    vTpi = np.matmul(vT, v_s) * gamma
    return rTpi + vTpi

In [7]:
# policy improvement
def policy_improvement(v_s, gamma=0.98):
    rT = np.sum(r_sas * T_sas, 1)
    v = v_s.reshape(1, 16, 1).repeat(16, 0).repeat(4, 2)
    pi_idx = np.argmax(rT + gamma * np.sum(T_sas * v, 1), 1)
    pi = np.zeros((16, 4))
    pi[:, pi_idx] = 1
    return pi

In [8]:
# value iteration step
def value_iteration(gamma=0.98):
    rT = np.sum(r_sas * T_sas, 1)
    v = v_s.reshape(1, 16, 1).repeat(16, 0).repeat(4, 2)
    return np.max(rT + gamma * np.sum(T_sas * v, 1), 1)

In [9]:
# policy
def policy(gamma=0.98):
    rT = np.sum(r_sas * T_sas, 1)
    v = v_s.reshape(1, 16, 1).repeat(16, 0).repeat(4, 2)
    return np.argmax(rT + gamma * np.sum(T_sas * v, 1), 1)

In [101]:
# policy evaluation and policy improvement
gamma = 0.9
v_s = np.zeros(16)
p_sa = np.ones((16, 4)) * 0.25
for _ in range(10):
    v_s = bellman_operator(p_sa, v_s, gamma)
    p_sa = policy_improvement(v_s, gamma)
pi = policy(gamma)

In [102]:
pi.reshape((4, 4))

array([[0, 0, 0, 0],
       [1, 0, 0, 3],
       [1, 1, 3, 3],
       [1, 2, 2, 0]], dtype=int64)

In [103]:
v_s.reshape((4, 4)).astype(np.float32)

array([[      0.   , -119668.375, -176693.86 , -197046.77 ],
       [-119668.375, -156340.95 , -176046.45 , -176693.86 ],
       [-176693.86 , -176046.45 , -156340.95 , -119668.375],
       [-197046.77 , -176693.86 , -119668.375,       0.   ]],
      dtype=float32)

In [33]:
# Value iteration to solve the problem
gamma = 0.99
v_s = np.zeros(16)
p_sa = np.zeros((16, 4)) * 0.25
for _ in range(5):
    v_s = bellman_operator(p_sa, v_s, gamma)
    p_sa = policy_improvement(v_s, gamma)
pi = policy(gamma)

In [34]:
pi.reshape((4, 4))

array([[0, 0, 0, 0],
       [1, 1, 0, 3],
       [1, 1, 3, 3],
       [2, 2, 2, 0]], dtype=int64)

In [35]:
v_s.reshape((4, 4)).astype(np.int32)

array([[   0,  -97, -134, -142],
       [ -97, -127, -138, -134],
       [-134, -138, -127,  -97],
       [-142, -134,  -97,    0]])