In [7]:
import numpy as np
%run Grid_world.ipynb
import import_ipynb
from Iterative_Policy_Evaluation import print_values, print_policy, fixed_policy

In [8]:
# Tolerance constant; not referenced by any cell visible in this notebook.
SMALL_ENOUGH = 0.001
# Discount factor applied per step when play_game accumulates returns.
GAMMA = 0.9
# Action labels; the policy dictionary below maps states to these letters.
ALL_POSSIBLE_ACTIONS = (
    'U',
    'D',
    'L',
    'R',
)

In [24]:
def play_game(policy, grid, gamma=None, max_steps=None):
    """Play one episode under ``policy`` and compute the return of each state.

    The episode starts in a state drawn uniformly at random from the keys of
    ``grid.actions`` and follows ``policy`` until ``grid.game_over()`` is true.

    Args:
        policy: mapping from state to the action taken in that state. It is
            indexed with every state the episode passes through before the
            game ends, so a missing state raises ``KeyError``.
        grid: environment object exposing ``actions`` (a dict keyed by
            state), ``set_state(s)``, ``current_state()``, ``game_over()``
            and ``move(action)``, where ``move`` returns the reward.
        gamma: discount factor. ``None`` (the default) uses the module-level
            ``GAMMA``, which is what the original implementation always did.
        max_steps: optional upper bound on the number of moves in the
            episode. ``None`` (the default) means unbounded, as before.

    Returns:
        A list of ``(state, G)`` tuples in the order the states were visited,
        where ``G`` is the discounted sum of the rewards received after
        leaving that state. The state the episode ends in is not included.

    Raises:
        RuntimeError: if ``max_steps`` is given and the game is still not
            over after that many moves (e.g. the policy walks in a cycle).
    """
    if gamma is None:
        gamma = GAMMA

    # Pick the starting state uniformly at random among the keys of grid.actions.
    possible_start_states = list(grid.actions.keys())
    random_start_idx = np.random.choice(len(possible_start_states))
    grid.set_state(possible_start_states[random_start_idx])

    s = grid.current_state()
    # Each entry pairs a state with the reward received on arriving in it;
    # the start state is not the result of a move, so its reward is 0.
    states_rewards = [(s, 0.0)]
    steps = 0
    while not grid.game_over():
        if max_steps is not None and steps >= max_steps:
            raise RuntimeError(
                "episode did not terminate within %d steps" % max_steps
            )
        reward = grid.move(policy[s])
        s = grid.current_state()
        states_rewards.append((s, reward))
        steps += 1

    # Walk the episode backwards, accumulating G = r + gamma * G.
    # The last entry is the state the episode ended in: no move is made from
    # it, so it contributes its reward to G but gets no (state, G) pair.
    states_returns = []
    G = 0
    for i, (s, r) in enumerate(reversed(states_rewards)):
        if i > 0:
            states_returns.append((s, G))
        G = r + gamma * G

    # Restore chronological order so callers can detect first visits.
    states_returns.reverse()
    return states_returns

In [33]:
if __name__ == "__main__":
    # First-visit Monte Carlo evaluation of a fixed policy: play many
    # episodes and average, per state, the return observed the first time
    # the state is visited in each episode.

    # play_game draws each episode's start state from NumPy's global RNG;
    # seeding it makes a rerun sample the same episodes.
    np.random.seed(0)
    N_EPISODES = 100

    # negative_grid is not defined in the visible cells; presumably it is
    # brought into the namespace by `%run Grid_world.ipynb` -- confirm.
    grid = negative_grid()

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # state -> action
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U' }
    print("\nGiven Policy:")
    print_policy(policy, grid)

    # Initialize V(s) to 0 for every state, so states that are terminal,
    # unreachable, or simply never sampled still have an entry when V is
    # printed.
    states = grid.all_states()
    V = {s: 0 for s in states}
    # state -> list of first-visit returns, for states that have actions
    returns = {s: [] for s in states if s in grid.actions}

    # play N_EPISODES games and collect the first-visit returns
    for _ in range(N_EPISODES):
        states_returns = play_game(policy, grid)
        seen_states = set()
        for s, G in states_returns:
            # states_returns is in visit order, so the first occurrence of s
            # is its first visit; later occurrences are ignored
            # ("first-visit" MC policy evaluation).
            if s not in seen_states:
                returns[s].append(G)
                seen_states.add(s)

    # Average once after sampling instead of after every append; the final
    # values are the same. States with no samples keep their initial 0.
    for s, observed in returns.items():
        if observed:
            V[s] = np.mean(observed)

    # final state values
    print("\nFinal Values:")
    print_values(V, grid)

rewards:
---------------------------
-0.10|-0.10|-0.10| 1.00|
---------------------------
-0.10| 0.00|-0.10|-1.00|
---------------------------
-0.10|-0.10|-0.10|-0.10|

Given Policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |

Final Values:
---------------------------
 0.62| 0.80| 1.00| 0.00|
---------------------------
 0.46| 0.00|-1.00| 0.00|
---------------------------
 0.31|-1.00|-1.00|-1.00|
