In [1]:
import numpy as np

import sys
sys.path.append('../utils')
from WindyGridPenalized import get_windy_grid_penalized
# Action labels iterated over when evaluating and improving the policy
# (presumably Left, Right, Up, Down — confirm against the grid implementation).
ACTIONS = ['L', 'R', 'U', 'D']

# Policy evaluation stops once the largest value change in a sweep drops below this.
THRESHOLD = 1e-3
# Discount factor applied to the value of the successor state in the Bellman backup.
GAMMA = .9

def printValues(values, g):
    """Print a grid of numeric values, one text row per grid row.

    Args:
        values: mapping keyed by ``(row, col)`` tuples; cells that are
            missing from the mapping are shown as 0.
        g: grid-like object exposing integer ``rows`` and ``cols`` attributes.

    Each cell is rendered with two decimals followed by ``|``. Non-negative
    numbers get a leading space so they line up with negative ones.
    """
    for row in range(g.rows):
        print("-------------------------")
        for col in range(g.cols):
            cell = values.get((row, col), 0)
            # The space stands in for the minus sign of negative numbers.
            template = " %.2f|" if cell >= 0 else "%.2f|"
            print(template % cell, end="")
        print("")
    

def printPolicy(policy, g):
    """Print the action chosen by ``policy`` for every cell of the grid.

    Args:
        policy: mapping keyed by ``(row, col)`` tuples whose values are the
            action labels to display; cells without an entry are shown blank.
        g: grid-like object exposing integer ``rows`` and ``cols`` attributes.
    """
    for row in range(g.rows):
        print("-------------------------")
        row_coords = ((row, col) for col in range(g.cols))
        for coord in row_coords:
            label = policy.get(coord, ' ')
            print(" %s |" % label, end="")
        print("")

In [2]:
def get_transition_probs_rewards(grid):
    """Flatten the grid's dynamics into two dictionaries keyed by (s, a, s').

    Args:
        grid: object with a ``probs`` mapping of ``(state, action)`` to a
            mapping ``{next_state: probability}``, and a ``rewards`` mapping
            of state to reward.

    Returns:
        A ``(transition_probs, rewards)`` tuple. Both are dicts keyed by
        ``(state, action, next_state)``; the reward of a transition is the
        reward registered for its destination state, or 0 when none is.
    """
    transition_probs, rewards = {}, {}

    for state_action, outcomes in grid.probs.items():
        state, action = state_action
        for next_state, prob in outcomes.items():
            key = (state, action, next_state)
            transition_probs[key] = prob
            # The reward depends only on the state that is entered.
            rewards[key] = grid.rewards.get(next_state, 0)

    return transition_probs, rewards

def evaluate_deterministic_policy(grid, policy, transition_probs=None, rewards=None):
    """Iterative policy evaluation for a deterministic policy.

    Repeatedly sweeps over all non-terminal states, replacing each state's
    value with the expected one-step return under ``policy``, until the
    largest change in a sweep is below ``THRESHOLD``. Values are updated in
    place, so later states in a sweep already see the new values.

    Args:
        grid: environment exposing ``all_states()`` and ``is_terminal(s)``.
        policy: mapping from state to the single action taken in that state.
            States without an entry (or with an action not in ``ACTIONS``)
            are evaluated to 0.
        transition_probs: optional dict keyed by ``(s, a, s2)`` giving the
            probability of the transition; missing keys count as 0.
        rewards: optional dict keyed by ``(s, a, s2)`` giving the reward of
            the transition; missing keys count as 0.

        When either table is omitted it is derived from ``grid`` with
        ``get_transition_probs_rewards``, so the function no longer depends
        on module-level variables created in a later notebook cell.

    Returns:
        Dict mapping every state of the grid to its value under ``policy``.
        Terminal states keep the value 0.
    """
    if transition_probs is None or rewards is None:
        derived_probs, derived_rewards = get_transition_probs_rewards(grid)
        if transition_probs is None:
            transition_probs = derived_probs
        if rewards is None:
            rewards = derived_rewards

    V = {}
    for s in grid.all_states():
        V[s] = 0

    while True:
        biggest_change = 0
        for s in grid.all_states():
            if grid.is_terminal(s):
                continue

            old_v = V[s]
            new_v = 0
            chosen_action = policy.get(s)
            for a in ACTIONS:
                # The policy is deterministic: every action other than the
                # chosen one has probability 0 and contributes nothing.
                if chosen_action != a:
                    continue
                for s2 in grid.all_states():
                    r = rewards.get((s, a, s2), 0)
                    # Bellman expectation backup for the chosen action.
                    new_v += transition_probs.get((s, a, s2), 0) * (r + GAMMA * V[s2])

            V[s] = new_v
            biggest_change = max(biggest_change, np.abs(old_v - V[s]))

        if biggest_change < THRESHOLD:
            break

    return V

In [3]:
# Build the environment and flatten its dynamics into (s, a, s') lookup tables.
# The -.2 argument is presumably the per-step penalty (the printed rewards show
# -0.20 in the ordinary cells) — confirm against get_windy_grid_penalized.
grid = get_windy_grid_penalized(-.2)
transition_probs, rewards = get_transition_probs_rewards(grid)

print("rewards:")
printValues(grid.rewards, grid)

# Start from a uniformly random action in every state that has actions.
policy = {state: np.random.choice(ACTIONS) for state in grid.actions.keys()}

print("initial policy:")
printPolicy(policy, grid)

# Policy iteration: alternate evaluation and greedy improvement until a full
# improvement pass leaves every state's action unchanged.
is_policy_converged = False
while not is_policy_converged:
    V = evaluate_deterministic_policy(grid, policy)

    is_policy_converged = True
    for state in grid.actions.keys():
        previous_action = policy[state]
        best_action, best_value = None, float('-inf')

        for action in ACTIONS:
            # Expected one-step return of taking `action` in `state`
            # (Bellman equation), accumulated over all successor states.
            action_value = 0
            for next_state in grid.all_states():
                reward = rewards.get((state, action, next_state), 0)
                prob = transition_probs.get((state, action, next_state), 0)
                action_value += prob * (reward + GAMMA * V[next_state])

            # Strict comparison: on ties the earliest action in ACTIONS wins.
            if action_value > best_value:
                best_action, best_value = action, action_value

        policy[state] = best_action
        if best_action != previous_action:
            is_policy_converged = False

print("values:")
printValues(V, grid)
print("policy:")
printPolicy(policy, grid)

rewards:
-------------------------
-0.20|-0.20|-0.20| 1.00|
-------------------------
-0.20| 0.00|-0.20|-1.00|
-------------------------
-0.20|-0.20|-0.20|-0.20|
initial policy:
-------------------------
 D | L | U |   |
-------------------------
 R |   | U |   |
-------------------------
 D | U | U | U |
values:
-------------------------
 0.43| 0.70| 1.00| 0.00|
-------------------------
 0.19| 0.00|-0.15| 0.00|
-------------------------
-0.03|-0.23|-0.34|-0.50|
policy:
-------------------------
 R | R | R |   |
-------------------------
 U |   | U |   |
-------------------------
 U | L | U | L |
