# Sutton & Barto Book

## Chapter 4: Dynamic Programming

<IMG SRC="images/gridworld.png">


### Exercise 4.1

In Example 4.1, if $\pi$ is the equiprobable random policy,

- What is $q_\pi(11,down)$?
- What is $q_\pi(7,down)$?

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
grid_shape = (4, 4)
terminal_state = [(0,0), (3, 3)]

states = []
for row in range(grid_shape[0]):
    for col in range(grid_shape[1]):
        p = (row, col)
        if not p in terminal_state[1]:
            states.append(p)
print(states)

[(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3), (3, 0), (3, 1), (3, 2), (3, 3)]


In [3]:
actions = {
            'l': (0, -1),
            'u': (-1, 0),
            'r': (0, 1),
            'd': (1, 0)
        }
actions

{'l': (0, -1), 'u': (-1, 0), 'r': (0, 1), 'd': (1, 0)}

In [4]:
values = np.zeros(len(states))
values

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [5]:
def show_values(values, r=1):
    
    v = np.zeros(grid_shape)
    for i, s in enumerate(states):
        v[s] = values[i]
    print(v.round(r))

In [6]:
q_values = {}
for s in states:
    for a in actions:
        q_values[(s, a)] = 0.0
list(q_values.items())[0:3]

[(((0, 0), 'l'), 0.0), (((0, 0), 'u'), 0.0), (((0, 0), 'r'), 0.0)]

In [7]:
def move(s, a):

    if s not in terminal_state:
        s2 = tuple(np.array(s) + np.array(actions[a]))

        # Check if out of bounds
        if (s2[0] < 0) or (s2[0] >= grid_shape[0]) or \
            (s2[1] < 0) or (s2[1] >= grid_shape[1]):
            s2 = s
        else:
            s = s2
        r = -1
    if s in terminal_state:
        s = terminal_state[0]
        r = 0

    return s, r

move((0, 1), 'r')

((0, 2), -1)

In [8]:
transitions = {}

for s in states:
    for a in actions:
        transitions[(s, a)] = move(s, a)

list(transitions.items())

[(((0, 0), 'l'), ((0, 0), 0)),
 (((0, 0), 'u'), ((0, 0), 0)),
 (((0, 0), 'r'), ((0, 0), 0)),
 (((0, 0), 'd'), ((0, 0), 0)),
 (((0, 1), 'l'), ((0, 0), 0)),
 (((0, 1), 'u'), ((0, 1), -1)),
 (((0, 1), 'r'), ((0, 2), -1)),
 (((0, 1), 'd'), ((1, 1), -1)),
 (((0, 2), 'l'), ((0, 1), -1)),
 (((0, 2), 'u'), ((0, 2), -1)),
 (((0, 2), 'r'), ((0, 3), -1)),
 (((0, 2), 'd'), ((1, 2), -1)),
 (((0, 3), 'l'), ((0, 2), -1)),
 (((0, 3), 'u'), ((0, 3), -1)),
 (((0, 3), 'r'), ((0, 3), -1)),
 (((0, 3), 'd'), ((1, 3), -1)),
 (((1, 0), 'l'), ((1, 0), -1)),
 (((1, 0), 'u'), ((0, 0), 0)),
 (((1, 0), 'r'), ((1, 1), -1)),
 (((1, 0), 'd'), ((2, 0), -1)),
 (((1, 1), 'l'), ((1, 0), -1)),
 (((1, 1), 'u'), ((0, 1), -1)),
 (((1, 1), 'r'), ((1, 2), -1)),
 (((1, 1), 'd'), ((2, 1), -1)),
 (((1, 2), 'l'), ((1, 1), -1)),
 (((1, 2), 'u'), ((0, 2), -1)),
 (((1, 2), 'r'), ((1, 3), -1)),
 (((1, 2), 'd'), ((2, 2), -1)),
 (((1, 3), 'l'), ((1, 2), -1)),
 (((1, 3), 'u'), ((0, 3), -1)),
 (((1, 3), 'r'), ((1, 3), -1)),
 (((1, 3), 'd'

In [9]:
def greedy_policy_with_values(state, action, values):
    
    action_values = [values[states.index(transitions[(s, a)][0])] for a in actions]
    a_greedy = list(actions.keys())[action_values.index(max(action_values))]

    return 1 if action == a_greedy else 0

In [10]:
def random_policy(state, action, values):

    return 1.0/len(actions)

In [11]:
def bellman_equation(policy, states, actions, transitions,  
                     values, lr):

    v = values.copy()
    for s in states:
        
        sum_values = 0
        for a in actions:
            
            s2, r = transitions[(s, a)]
            v2 = values[states.index(s2)]
            p = policy(s, a, values)
            sum_values += p*(r + lr*v2)
            
            #import pdb; pdb.set_trace()

        v[states.index(s)] = sum_values
    
    return v

In [12]:
def evaluate_policy(policy, states, actions, transitions,  
                    values, lr=1.0, theta=0.01, max_iter=1000,
                    show=True):

    iteration = 0
    if show:
            print("\nk = ", iteration)
            show_values(values)

    while iteration < max_iter:

        updated_values = bellman_equation(policy, states, actions, transitions,  
                                          values, lr=lr)

        delta = np.abs(updated_values - values).max()
        
        values[:] = updated_values
        iteration += 1
        
        if show:
            print("\nk =", iteration)
            show_values(values)

        if delta < theta:
            break
    
    if iteration == max_iter:
        print("\nMaximum iterations reached.")
    else:
        print("\nConverged to delta < %f" % theta)

### 1. Equi-probable random policy

In [13]:
values = np.zeros(len(states))

In [14]:
evaluate_policy(random_policy, states, actions, transitions, 
                values, max_iter=10, show=True)


k =  0
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

k =  1
[[ 0.  -0.8 -1.  -1. ]
 [-0.8 -1.  -1.  -1. ]
 [-1.  -1.  -1.  -0.8]
 [-1.  -1.  -0.8  0. ]]

k =  2
[[ 0.  -1.4 -1.9 -2. ]
 [-1.4 -1.9 -2.  -1.9]
 [-1.9 -2.  -1.9 -1.4]
 [-2.  -1.9 -1.4  0. ]]

k =  3
[[ 0.  -2.1 -2.8 -3. ]
 [-2.1 -2.7 -2.9 -2.8]
 [-2.8 -2.9 -2.7 -2.1]
 [-3.  -2.8 -2.1  0. ]]

k =  4
[[ 0.  -2.7 -3.7 -3.9]
 [-2.7 -3.5 -3.8 -3.7]
 [-3.7 -3.8 -3.5 -2.7]
 [-3.9 -3.7 -2.7  0. ]]

k =  5
[[ 0.  -3.2 -4.5 -4.8]
 [-3.2 -4.2 -4.6 -4.5]
 [-4.5 -4.6 -4.2 -3.2]
 [-4.8 -4.5 -3.2  0. ]]

k =  6
[[ 0.  -3.7 -5.3 -5.7]
 [-3.7 -4.9 -5.4 -5.3]
 [-5.3 -5.4 -4.9 -3.7]
 [-5.7 -5.3 -3.7  0. ]]

k =  7
[[ 0.  -4.2 -6.  -6.5]
 [-4.2 -5.5 -6.1 -6. ]
 [-6.  -6.1 -5.5 -4.2]
 [-6.5 -6.  -4.2  0. ]]

k =  8
[[ 0.  -4.7 -6.7 -7.2]
 [-4.7 -6.2 -6.8 -6.7]
 [-6.7 -6.8 -6.2 -4.7]
 [-7.2 -6.7 -4.7  0. ]]

k =  9
[[ 0.  -5.1 -7.4 -8. ]
 [-5.1 -6.7 -7.4 -7.4]
 [-7.4 -7.4 -6.7 -5.1]
 [-8.  -7.4 -5.1  0. ]]

k =  10
[[ 0.  -5.6 -

### 2. Greedy policy

In [22]:
values = np.zeros(len(states))

In [23]:
evaluate_policy(greedy_policy_with_values, states, actions, transitions, 
                values, theta=0.01, lr=0.25, max_iter=1000, show=False)


Converged to delta < 0.010000


In [24]:
show_values(values, r=2)

[[ 0.    0.   -1.   -1.25]
 [-1.33 -1.33 -1.33 -1.33]
 [-1.33 -1.33 -1.33 -1.33]
 [-1.33 -1.33 -1.33  0.  ]]
