# Solving the Bellman equation on a simple example

![gridworld](images/gridworld.png)

In [1]:
import numpy as np

In [2]:
# Transition tensor of the 4x4 gridworld: T_sas[s, s_next, a] is the
# probability of landing in s_next when action a is taken in state s.
# States are numbered 0..15 row by row; actions are 0=left, 1=up, 2=right,
# 3=down.

T_sas = np.zeros((16, 16, 4))  # 16 states, 16 successor states, 4 actions

H = np.arange(16)
# Every move is deterministic; a move that would leave the grid keeps the
# agent in its current cell.
T_sas[H, np.where(H % 4 == 0, H, H - 1), 0] = 1  # left
T_sas[H, np.where(H < 4, H, H - 4), 1] = 1  # up
T_sas[H, np.where(H % 4 == 3, H, H + 1), 2] = 1  # right
T_sas[H, np.where(H > 11, H, H + 4), 3] = 1  # down
# States 0 and 15 (the shaded corners) are absorbing: whichever action is
# chosen there, the agent stays put.
T_sas[[0, 15], :, :] = 0
T_sas[[0, 15], [0, 15], :] = 1

In [3]:
# Reward tensor r_sas[s, s_next, a]: every transition costs -1, except
# staying in one of the two terminal states (0 and 15), which is free.
r_sas = np.full((16, 16, 4), -1.0)

r_sas[[0, 15], [0, 15], :] = 0

In [4]:
# Bellman equation
def bellman_operator(p_sa, v_s, gamma=0.98):
    """Apply one Bellman expectation backup for the policy ``p_sa``.

    Reads the module-level transition tensor ``T_sas`` and reward tensor
    ``r_sas`` (both indexed ``[s, s_next, a]``).

    Args:
        p_sa: policy array, reshaped here to ``(16, 1, 4)``; entry
            ``[s, a]`` weights action ``a`` in state ``s``.
        v_s: current value estimate, multiplied against the 16x16
            policy-averaged transition matrix.
        gamma: discount factor applied to the next-state values.

    Returns:
        The updated value estimate: expected immediate reward plus the
        discounted expected value of the successor state.
    """
    # Expected immediate reward per (state, action), then averaged over the
    # actions of the policy.
    reward_per_action = (r_sas * T_sas).sum(axis=1)
    immediate = (reward_per_action * p_sa).sum(axis=1)

    # State-to-state transition matrix under the policy; the singleton axis
    # broadcasts the action weights over every successor state.
    transition_under_pi = (T_sas * p_sa.reshape(16, 1, 4)).sum(axis=2)
    discounted_future = (transition_under_pi @ v_s) * gamma
    return immediate + discounted_future

In [30]:
# policy improvement
def policy_improvement(v_s, gamma=0.98):
    """Return the greedy policy for ``v_s`` as a one-hot ``(16, 4)`` array.

    Reads the module-level tensors ``T_sas`` and ``r_sas``. For each state
    the action with the largest one-step look-ahead value gets probability
    1; ties go to the lowest action index (``argmax`` behaviour).

    Args:
        v_s: value estimate, reshaped here to ``(1, 16, 1)`` so that it
            lines up with the successor-state axis of ``T_sas``.
        gamma: discount factor applied to the next-state values.
    """
    expected_reward = (r_sas * T_sas).sum(axis=1)
    # Broadcasting spreads the successor values over all states and actions.
    expected_next_value = (T_sas * v_s.reshape(1, 16, 1)).sum(axis=1)
    greedy_action = (expected_reward + gamma * expected_next_value).argmax(axis=1)

    one_hot = np.zeros((16, 4))
    one_hot[np.arange(16), greedy_action] = 1
    return one_hot

In [31]:
# value iteration step
def value_iteration(gamma=0.98, v_s=None):
    """Perform one value-iteration (Bellman optimality) backup.

    Reads the module-level tensors ``T_sas`` and ``r_sas``.

    Args:
        gamma: discount factor applied to the next-state values.
        v_s: current value estimate, reshaped here to ``(1, 16, 1)``.
            Earlier versions of this function took no such argument and
            silently read a module-level ``v_s``; that behaviour is kept
            as a fallback when ``v_s`` is omitted, but passing the array
            explicitly is preferred.

    Returns:
        For each state, the maximum over actions of the expected immediate
        reward plus the discounted expected successor value.

    Raises:
        NameError: if ``v_s`` is omitted and no module-level ``v_s`` exists.
    """
    if v_s is None:
        # Backward compatibility with callers that rely on the global.
        try:
            v_s = globals()["v_s"]
        except KeyError:
            raise NameError("name 'v_s' is not defined") from None

    expected_reward = np.sum(r_sas * T_sas, 1)
    # The singleton axes broadcast the successor values over every
    # (state, action) pair.
    expected_next_value = np.sum(T_sas * v_s.reshape(1, 16, 1), 1)
    return np.max(expected_reward + gamma * expected_next_value, 1)

In [32]:
# policy
def policy(v_s, gamma=0.98):
    """Return the index of the greedy action for every state.

    Reads the module-level tensors ``T_sas`` and ``r_sas``. The result has
    one entry per state; ties go to the lowest action index (``argmax``
    behaviour).

    Args:
        v_s: value estimate, reshaped here to ``(1, 16, 1)`` so that it
            lines up with the successor-state axis of ``T_sas``.
        gamma: discount factor applied to the next-state values.
    """
    expected_reward = (r_sas * T_sas).sum(axis=1)
    expected_next_value = (T_sas * v_s.reshape(1, 16, 1)).sum(axis=1)
    action_values = expected_reward + gamma * expected_next_value
    return action_values.argmax(axis=1)

In [48]:
# policy evaluation and policy improvement
gamma = 0.98
v_s = np.zeros(16)
# Start from the uniform random policy: each of the 4 actions has weight 0.25.
p_sa = np.full((16, 4), 0.25)
for _ in range(50):
    # Approximate policy evaluation: 20 Bellman backups under the current policy.
    for _ in range(20):
        v_s = bellman_operator(p_sa, v_s, gamma)
    # Then switch to the policy that is greedy for the evaluated values.
    p_sa = policy_improvement(v_s, gamma)
# Greedy action index per state for the final value estimate.
pi = policy(v_s, gamma)

In [49]:
pi.reshape((4, 4))

array([[0, 0, 0, 0],
       [1, 0, 0, 3],
       [1, 0, 2, 3],
       [1, 2, 2, 0]], dtype=int64)

In [50]:
v_s.reshape((4, 4)).astype(np.float32)

array([[ 0.    , -1.    , -1.98  , -2.9404],
       [-1.    , -1.98  , -2.9404, -1.98  ],
       [-1.98  , -2.9404, -1.98  , -1.    ],
       [-2.9404, -1.98  , -1.    ,  0.    ]], dtype=float32)

In [66]:
# Value iteration to solve the problem
gamma = 0.9
v_s = np.zeros(16)
# Start from the uniform random policy. The previous initialisation
# (np.zeros(...) * 0.25) was an all-zero array rather than a probability
# distribution, which turned the first sweep into a no-op.
p_sa = np.ones((16, 4)) * 0.25
for _ in range(200):
    # A single Bellman backup followed by a greedy improvement: from the
    # second sweep on, this backs up the greedy action's value in every state.
    v_s = bellman_operator(p_sa, v_s, gamma)
    p_sa = policy_improvement(v_s, gamma)
# Greedy action index per state for the final value estimate.
pi = policy(v_s, gamma)

In [67]:
pi.reshape((4, 4))

array([[0, 0, 0, 0],
       [1, 0, 0, 3],
       [1, 0, 2, 3],
       [1, 2, 2, 0]], dtype=int64)

In [69]:
v_s.reshape((4, 4)).astype(np.float32)

array([[ 0.  , -1.  , -1.9 , -2.71],
       [-1.  , -1.9 , -2.71, -1.9 ],
       [-1.9 , -2.71, -1.9 , -1.  ],
       [-2.71, -1.9 , -1.  ,  0.  ]], dtype=float32)