In [1]:
import numpy as np
%run Grid_world.ipynb
import import_ipynb
from Iterative_Policy_Evaluation import print_values, print_policy, fixed_policy

importing Jupyter notebook from Iterative Policy Evaluation.ipynb


In [2]:
# Convergence threshold: sweeping stops once the largest per-state value
# change in a sweep drops below this number.
SMALL_ENOUGH = 0.001

# Discount factor applied to the value of the state reached after a move.
GAMMA = 0.9

# Action labels tried in every state during the lookahead.
# Presumably Up / Down / Left / Right -- confirm against the grid class.
ALL_POSSIBLE_ACTIONS = (
    'U',
    'D',
    'L',
    'R',
)

In [8]:
def _greedy_backup(grid, V, state):
    """Return the best action from `state` and its one-step lookahead value.

    For every label in ALL_POSSIBLE_ACTIONS the grid is reset to `state`,
    the move is applied, and `reward + GAMMA * V[next_state]` is computed.
    Each move is treated as leading to a single next state, so this is a
    max over actions, not an expectation over actions or outcomes.

    Parameters:
        grid:  environment object; must provide set_state(), move() and
               current_state(). move() is assumed to return the reward.
        V:     dict mapping state -> current value estimate.
        state: the state to evaluate.

    Returns:
        (best_action, best_value). Ties go to the action that appears first
        in ALL_POSSIBLE_ACTIONS because the comparison is strict.

    Side effect: the grid's current state is left wherever the last tried
    move put it.
    """
    best_action = None
    best_value = float('-inf')
    for action in ALL_POSSIBLE_ACTIONS:
        grid.set_state(state)  # every action is tried from the same start state
        reward = grid.move(action)
        value = reward + GAMMA * V[grid.current_state()]
        if value > best_value:
            best_action = action
            best_value = value
    return best_action, best_value


if __name__ == "__main__":
    grid = negative_grid()

    # print rewards
    print("Rewards:")
    print_values(grid.rewards, grid)

    # Start from a random policy: one randomly chosen available action per
    # state that has an entry in grid.actions.
    policy = {s: np.random.choice(grid.actions[s]) for s in grid.actions.keys()}
    print("\nInitial policy:")
    print_policy(policy, grid)

    # Initialize state values for all states.
    states = grid.all_states()
    V = {s: 0 for s in states}

    # Value iteration: sweep over the states, replacing V(s) with the best
    # one-step lookahead value, until no state changes by SMALL_ENOUGH or more.
    # Updates are in place, so later states in a sweep already see the new
    # values of earlier ones.
    run_nbr = 0
    while True:
        run_nbr += 1
        biggest_change = 0
        for s in states:
            # Only states present in the policy are updated; the rest
            # (terminal states, per the original note) keep their value.
            if s in policy:
                old_v = V[s]
                _, V[s] = _greedy_backup(grid, V, s)
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))
        if biggest_change < SMALL_ENOUGH:
            # run_nbr counts the sweeps of the loop above.
            print("Total Loops of Policy Iteration and Evaluation", run_nbr)
            break

    # Policy extraction: runs once, after the values have converged, and picks
    # the greedy action for every state that has a policy entry.
    for s in states:
        if s in policy:
            policy[s], _ = _greedy_backup(grid, V, s)

    # final policy
    print("\nFinal policy:")
    print_policy(policy, grid)

Rewards:
---------------------------
-0.10|-0.10|-0.10| 1.00|
---------------------------
-0.10| 0.00|-0.10|-1.00|
---------------------------
-0.10|-0.10|-0.10|-0.10|

Initial policy:
---------------------------
  D  |  L  |  D  |     |
---------------------------
  U  |     |  U  |     |
---------------------------
  R  |  L  |  R  |  U  |

Current State Values:
---------------------------
  -0.1  |  -0.1  |  1.0  |  0  |
---------------------------
  -0.1  |     |  -0.1  |  0  |
---------------------------
  -0.1  |  -0.1  |  -0.1  |  -0.1  |

Current State Values:
---------------------------
  0.6200000000000001  |  0.8  |  1.0  |  0  |
---------------------------
  0.4580000000000002  |     |  0.8  |  0  |
---------------------------
  -0.19  |  -0.19  |  0.6200000000000001  |  -0.19  |

Current State Values:
---------------------------
  0.6200000000000001  |  0.8  |  1.0  |  0  |
---------------------------
  0.4580000000000002  |     |  0.8  |  0  |
---------------------------
