In [1]:
import numpy as np
%load_ext autoreload
%autoreload 2
from gridworld_utils import print_grid

# Policy Evaluation

Finding the state values for a given policy in a 4x4 gridworld, like the example in chapter 4.1 of Sutton and Barto's 'Reinforcement Learning: An Introduction', where:
* The top left and bottom right are terminal states 
* There are 4 actions; up, down, left, right. (If you move into the edge you stay where you are.)
* You get a reward of -1 for each step

In [2]:
def update_state_value(policy, state_values, reward, discount):
    """Return the expected one-step backup for a single state.

    This is the Bellman expectation equation for a deterministic
    environment, i.e. taking a given action in a state always leads to the
    same successor state.

    Args:
        policy: Probability of choosing each action.
        state_values: Current value estimate of the successor state reached
            by each action, in the same order as ``policy``.
        reward: Reward added for the step.
        discount: Factor applied to the successor state's value.

    Returns:
        The sum, over the paired entries of ``policy`` and ``state_values``,
        of ``probability * (reward + discount * successor_value)``.
    """
    expected_value = 0
    for action_prob, successor_value in zip(policy, state_values):
        # Return obtained by taking this action, weighted by how likely the
        # policy is to pick it.
        backup = reward + discount * successor_value
        expected_value += action_prob * backup
    return expected_value

In [3]:
def one_sweep_policy_evaluation(grid, policy, reward=-1, discount=1):
    """Perform one synchronous sweep of iterative policy evaluation.

    Every non-terminal state's new value is computed from the values in
    ``grid`` (the previous sweep), never from values written during this
    sweep. The top-left and bottom-right cells are terminal and are left at
    0 in the returned grid.

    Args:
        grid: 2-D NumPy array holding the current state-value estimates.
        policy: Action probabilities in the order (up, down, left, right).
        reward: Reward for each step. Defaults to -1.
        discount: Discount factor applied to successor values. Defaults to 1.

    Returns:
        A new array with the same shape as ``grid`` containing the updated
        state values; ``grid`` itself is not modified.
    """
    n_rows, n_cols = grid.shape
    last_row, last_col = n_rows - 1, n_cols - 1
    terminal_states = {(0, 0), (last_row, last_col)}

    new_grid = np.zeros((n_rows, n_cols))
    for i in range(n_rows):
        for j in range(n_cols):
            if (i, j) in terminal_states:
                continue

            # Moving into an edge leaves the agent where it is, so clamp the
            # successor's index to the grid instead of branching on each edge.
            v_up = grid[max(i - 1, 0), j]
            v_down = grid[min(i + 1, last_row), j]
            v_left = grid[i, max(j - 1, 0)]
            v_right = grid[i, min(j + 1, last_col)]

            new_grid[i, j] = update_state_value(
                policy, (v_up, v_down, v_left, v_right), reward, discount
            )
    return new_grid
    

In [4]:
# Initial value estimate: zero for every state of the 4x4 gridworld.
grid = np.zeros(shape=(4, 4))

In [5]:
policy = (1/4, 1/4, 1/4, 1/4) # Random Policy: each of the 4 actions is equally likely
# Sweep a fresh all-zero grid instead of feeding `grid` back into itself, so
# re-running this cell always shows the result of exactly one sweep.
grid = one_sweep_policy_evaluation(np.zeros(grid.shape), policy)
print_grid(grid)

|//////|-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |//////|


In [7]:
# Run 120 sweeps starting from an all-zero estimate. Only the first three
# sweeps and every tenth sweep are printed, to keep the output short.
grid = np.zeros(shape=(4, 4))
for k in range(1, 121):
    grid = one_sweep_policy_evaluation(grid, policy)
    if k > 3 and k % 10 != 0:
        continue
    print('Value states after iteration {}:'.format(k))
    print_grid(grid)
    print('')

Value states after iteration 1:
|//////|-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |//////|

Value states after iteration 2:
|//////|-1.8  |-2.0  |-2.0  |
|-1.8  |-2.0  |-2.0  |-2.0  |
|-2.0  |-2.0  |-2.0  |-1.8  |
|-2.0  |-2.0  |-1.8  |//////|

Value states after iteration 3:
|//////|-2.4  |-2.9  |-3.0  |
|-2.4  |-2.9  |-3.0  |-2.9  |
|-2.9  |-3.0  |-2.9  |-2.4  |
|-3.0  |-2.9  |-2.4  |//////|

Value states after iteration 10:
|//////|-6.1  |-8.4  |-9.0  |
|-6.1  |-7.7  |-8.4  |-8.4  |
|-8.4  |-8.4  |-7.7  |-6.1  |
|-9.0  |-8.4  |-6.1  |//////|

Value states after iteration 20:
|//////|-9.4  |-13.3 |-14.5 |
|-9.4  |-12.1 |-13.3 |-13.3 |
|-13.3 |-13.3 |-12.1 |-9.4  |
|-14.5 |-13.3 |-9.4  |//////|

Value states after iteration 30:
|//////|-11.4 |-16.1 |-17.6 |
|-11.4 |-14.6 |-16.1 |-16.1 |
|-16.1 |-16.1 |-14.6 |-11.4 |
|-17.6 |-16.1 |-11.4 |//////|

Value states after iteration 40:
|//////|-12.5 |-17.7 |-19.5 |
|-12.5 |-16.0 |-

# Policy Iteration