$q_{\pi}(11, down) = E_{\pi} [G_t \mid 11, down] = E_{\pi} [ R_{terminal} \mid 11, down ] = 1$

$$ 
\begin{split}
q_{\pi}(7, down) &= \sum_{s', r} p(s', r \mid 7, down) \left[r + \max_{a'}q_{\pi}(s',a') \right] \\
 &=  p(11 , -1 \mid 7, down) \left[ -1  + \max_{a'}q_{\pi}(11 ,a') \right]  \\
 &= -1 + \max_{a'} q_{\pi}(11, a') , \mbox{ since }(p(11 , -1 \mid 7, down) = 1) \\
 &= 0,  \mbox{ since a' = down}
\end{split}
$$

In [39]:
import numpy as np
import pandas as pd

tolerant = 0.0001

def reward(s_prime):
    if s_prime == 0 or s_prime == 15:
        return 1
    else:
        return -1
    
def move(s, a):
    if a == 0:  # up
        return  s if s - 4 < 0 else s - 4
    if a == 1:  # right
        return s if (s + 1) % 4 == 0 else s + 1  #check if s is on the right edge
    if a == 2:  # down
        return s if s + 4 > 15 else s + 4
    if a == 3:  # left
        return s if s % 4 == 0 else s - 1 # check if s is on the left edge

q = np.zeros((16, 4))

while True:
    bound = -1 
    for s in range(1, 15):  # state 1 to 14,  don't need to loop the terminal states
        for a in range(0, 4): # a = 0 up, a = 1 right, a = 2 down, a = 3 left
            s_prime = move(s, a)
            new_q_value = reward(s_prime)
            if not (s_prime == 0 or s_prime == 15):  # if s_prime is terminal, we don't include their q values.
                for a_prime in range(0, 4):
                    new_q_value += .25 * q[s_prime][a_prime]

            d = abs(new_q_value - q[s][a])
            bound = max(d, bound)
            q[s][a] = new_q_value
    if bound <= tolerant:
        break;

q = np.matrix.round(q)
print(pd.DataFrame(q, columns=["Up", "Right", "Down", "Left"]))      

v = np.zeros((16,))
while True:
    bound = -1
    for s in range(1, 15):
        new_v = 0
        for a in range(0, 4):
            s_prime = move(s, a)
            new_v += 0.25 * (reward(s_prime) + v[s_prime])

        d = abs(new_v - v[s])
        bound = max(d, bound)
        v[s] = new_v
    if bound <= tolerant:
        break;

v = np.matrix.round(v)
print(pd.DataFrame(v.reshape((4,4)), columns=["0","1","2","3"]))    

      Up  Right  Down  Left
0    0.0    0.0   0.0   0.0
1  -13.0  -19.0 -17.0   1.0
2  -19.0  -21.0 -19.0 -13.0
3  -21.0  -21.0 -19.0 -19.0
4    1.0  -17.0 -19.0 -13.0
5  -13.0  -19.0 -19.0 -13.0
6  -19.0  -19.0 -17.0 -17.0
7  -21.0  -19.0 -13.0 -19.0
8  -13.0  -19.0 -21.0 -19.0
9  -17.0  -17.0 -19.0 -19.0
10 -19.0  -13.0 -13.0 -19.0
11 -19.0  -13.0   1.0 -17.0
12 -19.0  -19.0 -21.0 -21.0
13 -19.0  -13.0 -19.0 -21.0
14 -17.0    1.0 -13.0 -19.0
15   0.0    0.0   0.0   0.0
      0     1     2     3
0   0.0 -12.0 -18.0 -20.0
1 -12.0 -16.0 -18.0 -18.0
2 -18.0 -18.0 -16.0 -12.0
3 -20.0 -18.0 -12.0   0.0


In [72]:
def gen(a, b):
    return np.random.randint(low=a, high=b)

def gen_init_state():
    return gen(1, 15)

def gen_action():
    return gen(0, 4)

def gen_ep():
    s_ep = [gen_init_state()]
    r_ep = []
    while s_ep[-1] != 0 and s_ep[-1] != 15:
        s = s_ep[-1]
        a = gen_action()
        s_prime = move(s, a)
        r = reward(s_prime)
        s_ep.append(s_prime)
        r_ep.append(r)
    
        
    return (s_ep, r_ep)

v = np.zeros((16, ))
epoch = 20000
gamma = 1
returns_by_s = {}
returns_by_s_counter = {}
for ep in range(epoch):
    # generate ep
    s_ep, r_ep = gen_ep()
    g = 0
    s_seen = {}
    for i in range(len(r_ep) - 1, -1, -1):
        g = gamma * g + r_ep[i]
        if s_ep[i] not in s_seen:
            s_seen[s_ep[i]] = True
            
            if s_ep[i] not in returns_by_s:
                returns_by_s_counter[s_ep[i]] = 1
                returns_by_s[s_ep[i]] = g
            else:
                returns_by_s_counter[s_ep[i]] += 1
                returns_by_s[s_ep[i]] = returns_by_s[s_ep[i]] + (g - returns_by_s[s_ep[i]]) / returns_by_s_counter[s_ep[i]]
            
for s in returns_by_s:
    v[s] = returns_by_s[s]

v = np.matrix.round(v, 2)
print(v.reshape((4,4)))

[[  0.    -5.62  -8.33 -10.33]
 [ -5.49  -6.51  -7.17  -8.13]
 [ -8.16  -7.1   -6.49  -5.55]
 [-10.11  -8.03  -5.48   0.  ]]
