### Policy Evaluation given a policy on a GridWorld example

In [108]:
import numpy as np
from gridworld import GridworldEnv
import copy

In [17]:
# Create the grid-world environment using the constructor's default arguments.
env = GridworldEnv()

#### Arguments of the policy evaluation routine:
        policy: [S, A] shaped matrix representing the policy.
        env: OpenAI env. env.P represents the transition probabilities (system probability) of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is the number of states in the environment.
            env.nA is the number of actions in the environment.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.

In [19]:
# Number of actions in the environment.
env.nA

4

In [122]:
# Transition table: env.P[s][a] is a list of (prob, next_state, reward, done) tuples.
env.P

{0: {0: [(1.0, 0, 0.0, True)],
  1: [(1.0, 0, 0.0, True)],
  2: [(1.0, 0, 0.0, True)],
  3: [(1.0, 0, 0.0, True)]},
 1: {0: [(1.0, 1, -1.0, False)],
  1: [(1.0, 2, -1.0, False)],
  2: [(1.0, 5, -1.0, False)],
  3: [(1.0, 0, -1.0, True)]},
 2: {0: [(1.0, 2, -1.0, False)],
  1: [(1.0, 3, -1.0, False)],
  2: [(1.0, 6, -1.0, False)],
  3: [(1.0, 1, -1.0, False)]},
 3: {0: [(1.0, 3, -1.0, False)],
  1: [(1.0, 3, -1.0, False)],
  2: [(1.0, 7, -1.0, False)],
  3: [(1.0, 2, -1.0, False)]},
 4: {0: [(1.0, 0, -1.0, True)],
  1: [(1.0, 5, -1.0, False)],
  2: [(1.0, 8, -1.0, False)],
  3: [(1.0, 4, -1.0, False)]},
 5: {0: [(1.0, 1, -1.0, False)],
  1: [(1.0, 6, -1.0, False)],
  2: [(1.0, 9, -1.0, False)],
  3: [(1.0, 4, -1.0, False)]},
 6: {0: [(1.0, 2, -1.0, False)],
  1: [(1.0, 7, -1.0, False)],
  2: [(1.0, 10, -1.0, False)],
  3: [(1.0, 5, -1.0, False)]},
 7: {0: [(1.0, 3, -1.0, False)],
  1: [(1.0, 7, -1.0, False)],
  2: [(1.0, 11, -1.0, False)],
  3: [(1.0, 6, -1.0, False)]},
 8: {0: [(1.0, 4

### Policy Evaluation Algorithm (Sutton & Barto) -

![sutton_barto_policy_evaluation.png](attachment:sutton_barto_policy_evaluation.png)


Implementation — repeat until convergence:

$$V(i) = \sum_{u \in \text{actions}(i)} \text{PolicyProb}(u \mid i) \sum_{j \in \text{nextState}(i)} \text{SystemProb}(i,u,j)\,\bigl(R(i,u,j) + \text{disc}\cdot V(j)\bigr)$$

In [127]:
def policy_eval(policy, env, discount_factor, theta):
    """Iteratively evaluate a fixed policy and return its state-value function.

    Each sweep visits every state and replaces V[state] with the
    policy-weighted expected backup

        sum_a policy[state][a] * sum_(p, s', r) p * (r + discount_factor * V[s'])

    Args:
        policy: Indexable as policy[state][action]; the probability of taking
            `action` in `state`.
        env: Object exposing `nS` (number of states), `nA` (number of actions)
            and `P`, where `P[state][action]` is an iterable of
            (prob, next_state, reward, done) tuples.
        discount_factor: Multiplier applied to the value of the next state.
        theta: Convergence threshold. Sweeps stop once the Euclidean norm of
            the change in V over one full sweep is below this value.

    Returns:
        A numpy array of length `env.nS` holding the estimated value of each
        state.
    """
    V = np.zeros(env.nS)
    while True:
        V_old = V.copy()
        for state in range(env.nS):
            state_value = 0.0
            for action in range(env.nA):
                # Expected one-step backup for this particular action.
                action_value = 0.0
                for prob_system, next_state, reward, _done in env.P[state][action]:
                    action_value += prob_system * (reward + discount_factor * V[next_state])
                # Weight every action by its own probability; scaling the
                # total by the last action's probability is only correct for
                # a uniform policy.
                state_value += policy[state][action] * action_value
            # V is updated in place, so states later in the same sweep
            # already see this new value.
            V[state] = state_value
        change = np.linalg.norm(V - V_old)
        if change < theta:
            break
    return V

In [128]:
# Uniform random policy: every action has probability 1 / nA in every state.
given_policy = np.full((env.nS, env.nA), 1.0 / env.nA)

In [129]:
# Display the policy matrix (one row per state, one column per action).
given_policy

array([[0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25]])

In [130]:
# Evaluate the uniform policy with no discounting and a 1e-5 stopping threshold.
policy_eval(given_policy, env, discount_factor=1, theta=1e-5)

array([  0.        , -13.99997737, -19.99996746, -21.99996418,
       -13.99997737, -17.99997222, -19.99996984, -19.99997019,
       -19.99996746, -19.99996984, -17.99997455, -13.999981  ,
       -21.99996418, -19.99997019, -13.999981  ,   0.        ])

In [95]:
# Reference state values for the uniform random policy, from the dennybritz repo.
expected_v = np.array([0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22, -20, -14, 0])
# Compare the computed values with the reference so a regression fails loudly
# instead of relying on eyeballing the printed array.
np.testing.assert_allclose(policy_eval(given_policy, env, 1, 0.00001), expected_v, rtol=0, atol=1e-2)