In [1]:
import numpy as np
%load_ext autoreload
%autoreload 2
from gridworld_utils import print_grid_values

# Policy Evaluation

Finding the state-values for a given policy in some gridworld where 
* There are 4 actions: up, down, left and right. (If you move into an edge, you stay where you are.)
* You get a reward of -1 for each step
* Terminal states have a value of 0

The first example is a 4x4 gridworld with two corner terminal states, like the example in chapter 4.1 of Sutton and Barto's 'Reinforcement Learning: An Introduction'.

In [2]:
def update_state_value(policy, state_values, reward, discount):
    """Back up one state's value from the values of its successor states.

    Applies the Bellman expectation equation for a deterministic
    environment, i.e. one where each action always leads to the same
    successor state, so no transition probabilities are needed.

    Args:
        policy: Iterable of action probabilities.
        state_values: Iterable of successor-state values, paired
            position-by-position with ``policy``.
        reward: Reward added for every action.
        discount: Factor applied to each successor-state value.

    Returns:
        The probability-weighted sum of ``reward + discount * value`` over
        the paired entries (``0`` when there are no pairs).
    """
    expected_return = 0
    for action_prob, successor_value in zip(policy, state_values):
        # Return obtained by taking this action and then continuing from
        # the successor state.
        backed_up_return = reward + discount * successor_value
        expected_return = expected_return + action_prob * backed_up_return
    return expected_return

In [4]:
def one_sweep_policy_evaluation(grid, policy, terminal_states, reward=-1, discount=1):
    """Run one synchronous sweep of iterative policy evaluation over a grid.

    Every non-terminal cell is backed up from the *old* values in ``grid``
    (the input is not modified), using ``update_state_value`` with the
    values of the four successor cells. Moving into an edge leaves the
    agent where it is, so the successor for that action is the cell itself.

    Args:
        grid: 2-D NumPy array of current state values. It may be
            rectangular, and either dimension may have length 1.
        policy: Action probabilities in the order (up, down, left, right).
        terminal_states: Iterable of ``(row, column)`` pairs that are never
            updated; they are 0 in the returned grid.
        reward: Reward received for each step. Defaults to -1.
        discount: Discount factor applied to successor values. Defaults to 1.

    Returns:
        A new float array with the same shape as ``grid`` holding the
        updated state values.
    """
    n_rows, n_cols = grid.shape
    # Set membership replaces a scan over every terminal state per cell.
    terminal_cells = {tuple(state) for state in terminal_states}

    new_grid = np.zeros((n_rows, n_cols))
    for i in range(n_rows):
        for j in range(n_cols):
            # Making sure not to update terminal states
            if (i, j) in terminal_cells:
                continue

            # Clamping the neighbour index to the grid bounds makes a move
            # into an edge resolve to the current cell. This also covers
            # dimensions of length 1, where both neighbours are the cell itself.
            v_up = grid[max(i - 1, 0)][j]
            v_down = grid[min(i + 1, n_rows - 1)][j]
            v_left = grid[i][max(j - 1, 0)]
            v_right = grid[i][min(j + 1, n_cols - 1)]

            new_grid[i][j] = update_state_value(
                policy, (v_up, v_down, v_left, v_right), reward, discount
            )
    return new_grid
    

In [29]:
# 4x4 gridworld evaluated under the equiprobable random policy.
gridworld_4x4 = np.zeros((4, 4))
policy = (0.25, 0.25, 0.25, 0.25)  # Random Policy
terminal_states = ((0, 0), (3, 3))  # Top left and bottom right corners

# Run 100 sweeps, printing the values after sweeps 1, 2, 3, 10 and 100.
for k in range(1, 101):
    gridworld_4x4 = one_sweep_policy_evaluation(
        gridworld_4x4, policy, terminal_states
    )
    if k in (1, 2, 3, 10, 100):
        print('Value states after iteration {}:'.format(k))
        print_grid_values(gridworld_4x4)
        print('')

Value states after iteration 1:
|//////|-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |//////|

Value states after iteration 2:
|//////|-1.8  |-2.0  |-2.0  |
|-1.8  |-2.0  |-2.0  |-2.0  |
|-2.0  |-2.0  |-2.0  |-1.8  |
|-2.0  |-2.0  |-1.8  |//////|

Value states after iteration 3:
|//////|-2.4  |-2.9  |-3.0  |
|-2.4  |-2.9  |-3.0  |-2.9  |
|-2.9  |-3.0  |-2.9  |-2.4  |
|-3.0  |-2.9  |-2.4  |//////|

Value states after iteration 10:
|//////|-6.1  |-8.4  |-9.0  |
|-6.1  |-7.7  |-8.4  |-8.4  |
|-8.4  |-8.4  |-7.7  |-6.1  |
|-9.0  |-8.4  |-6.1  |//////|

Value states after iteration 100:
|//////|-13.9 |-19.9 |-21.9 |
|-13.9 |-17.9 |-19.9 |-19.9 |
|-19.9 |-19.9 |-17.9 |-13.9 |
|-21.9 |-19.9 |-13.9 |//////|



In [34]:
# 6x6 gridworld evaluated under the equiprobable random policy.
gridworld_6x6 = np.zeros((6, 6))
policy = (0.25, 0.25, 0.25, 0.25)  # Random Policy
terminal_states = ((2, 2), (2, 3), (3, 2), (3, 3))  # The middle 4 squares

# Run 100 sweeps, printing the values after sweeps 1, 2, 3, 10 and 100.
for k in range(1, 101):
    gridworld_6x6 = one_sweep_policy_evaluation(
        gridworld_6x6, policy, terminal_states
    )
    if k in (1, 2, 3, 10, 100):
        print('Value states after iteration {}:'.format(k))
        print_grid_values(gridworld_6x6)
        print('')

Value states after iteration 1:
|-1.0  |-1.0  |-1.0  |-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |//////|//////|-1.0  |-1.0  |
|-1.0  |-1.0  |//////|//////|-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |-1.0  |-1.0  |-1.0  |
|-1.0  |-1.0  |-1.0  |-1.0  |-1.0  |-1.0  |

Value states after iteration 2:
|-2.0  |-2.0  |-2.0  |-2.0  |-2.0  |-2.0  |
|-2.0  |-2.0  |-1.8  |-1.8  |-2.0  |-2.0  |
|-2.0  |-1.8  |//////|//////|-1.8  |-2.0  |
|-2.0  |-1.8  |//////|//////|-1.8  |-2.0  |
|-2.0  |-2.0  |-1.8  |-1.8  |-2.0  |-2.0  |
|-2.0  |-2.0  |-2.0  |-2.0  |-2.0  |-2.0  |

Value states after iteration 3:
|-3.0  |-3.0  |-2.9  |-2.9  |-3.0  |-3.0  |
|-3.0  |-2.9  |-2.4  |-2.4  |-2.9  |-3.0  |
|-2.9  |-2.4  |//////|//////|-2.4  |-2.9  |
|-2.9  |-2.4  |//////|//////|-2.4  |-2.9  |
|-3.0  |-2.9  |-2.4  |-2.4  |-2.9  |-3.0  |
|-3.0  |-3.0  |-2.9  |-2.9  |-3.0  |-3.0  |

Value states after iteration 10:
|-9.4  |-8.9  |-8.3  |-8.3  |-8.9  |-9.4  |
|-8.9  |-7.9  |-6.2  |-6.2  |-7.

# Policy Iteration