[View in Colaboratory](https://colab.research.google.com/github/aztecman/Move37/blob/master/GridWorld101_ASCII.ipynb)

In [0]:
#This code borrows liberally from Jeremi Kaczmarczyk
#https://medium.com/harder-choices/dynamic-programming-in-python-reinforcement-learning-bb288d95288f

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from IPython.display import clear_output
from random import randint, random
from time import sleep

In [0]:
def print_board(agent_position):
    """Print an ASCII drawing of the 3x4 grid world.

    Squares are numbered 0-11 in row-major order (four per row). The square
    equal to ``agent_position`` is drawn as ``A``; otherwise square 3 shows
    ``+1`` and square 7 shows ``-1``. Every other square is blank.

    Args:
        agent_position: Square index to mark with ``A``. A value that matches
            no square (e.g. ``None``) simply draws the board without an agent.
    """
    separator = "-----------------\n"

    def render_square(square):
        # The agent marker takes priority over the +1 / -1 labels.
        if square == agent_position:
            return "| A "
        if square == 3:
            return "|+1 "
        if square == 7:
            return "|-1 "
        return "|   "

    pieces = [separator]
    for row_start in range(0, 12, 4):
        row = "".join(render_square(sq) for sq in range(row_start, row_start + 4))
        pieces.append(row + "|\n")
        pieces.append(separator)
    print("".join(pieces))

In [17]:
# Quick visual check of the renderer: square 8 is the first square of the third row.
print_board(8)

-----------------
|   |   |   |+1 |
-----------------
|   |   |   |-1 |
-----------------
| A |   |   |   |
-----------------



In [0]:
def create_state_to_state_prime_verbose_map():
    """Build the deterministic transition table of the 3x4 grid.

    Squares are numbered 0-11 in row-major order with four squares per row.
    Moving off the edge of the grid leaves the agent where it is.

    Returns:
        dict: ``{square: {'N': s, 'E': s, 'S': s, 'W': s}}`` giving the square
        reached from ``square`` by each move. For squares 3 and 7 every move
        is mapped to 0.
    """
    columns = 4
    squares = 12
    transitions = {}

    for square in range(squares):
        if square in (3, 7):
            # Squares 3 and 7 get a placeholder row: every move maps to 0.
            transitions[square] = {'N': 0, 'E': 0, 'S': 0, 'W': 0}
            continue

        column = square % columns
        north = square - columns if square - columns >= 0 else square
        south = square + columns if square + columns < squares else square
        east = square + 1 if column != columns - 1 else square
        west = square - 1 if column != 0 else square
        transitions[square] = {'N': north, 'E': east, 'S': south, 'W': west}

    return transitions
  
def create_random_policy():
    """Return the uniform random policy for the 12-square grid.

    Returns:
        dict: ``{square: {'N': p, 'E': p, 'S': p, 'W': p}}`` where ``p`` is
        0.25 for every square except 3 and 7, which get 0.0 for all moves.
    """
    policy = {}
    for square in range(12):
        chance = 0.0 if square in (3, 7) else 0.25
        policy[square] = {'N': chance, 'E': chance, 'S': chance, 'W': chance}
    return policy
  
def create_probability_map():
    """Expand the transition table into a lookup of transition probabilities.

    Returns:
        dict: keyed by ``(state_prime, -1, state, move)``; the value is 1 when
        ``move`` taken in ``state`` leads to ``state_prime`` according to
        ``create_state_to_state_prime_verbose_map()``, and 0 otherwise. The
        second key element is always the constant -1.
    """
    transitions = create_state_to_state_prime_verbose_map()

    return {
        (prime, -1, state, move): 1 if prime == transitions[state][move] else 0
        for state in range(12)
        for move in ["N", "E", "S", "W"]
        for prime in range(12)
    }

In [0]:
def agent(policy, starting_position=None, verbose=False):
    """Run one episode on the grid under ``policy`` and count the steps.

    The episode ends as soon as the agent stands on square 3 or square 7.

    Args:
        policy: ``{square: {action: probability}}``; the action for each step
            is sampled from the entry of the current square.
        starting_position: Square to start from. When ``None`` a start square
            is drawn uniformly from all squares except 3 and 7.
        verbose: When true, redraw the board before every step (with pauses)
            and print ``Win!`` if the episode ends on square 3, ``Lose!``
            otherwise.

    Returns:
        int: The step counter, which starts at 1 and grows by one per pass of
        the sampling loop. A pass in which no action's probability interval
        contains the random draw leaves the agent in place but still counts.
    """
    transitions = create_state_to_state_prime_verbose_map()

    if starting_position is None:
        start_squares = [0, 1, 2, 4, 5, 6, 8, 9, 10, 11]  # everything but 3 and 7
        agent_position = start_squares[randint(0, 9)]
    else:
        agent_position = starting_position

    step_number = 1
    action_taken = None
    status_template = "Move: {} Position: {} Action: {}"

    if verbose:
        print(status_template.format(step_number, agent_position, action_taken))
        print_board(agent_position)
        print("\n")
        sleep(2)

    while agent_position != 3 and agent_position != 7:
        if verbose:
            clear_output(wait=True)
            print(status_template.format(step_number, agent_position, action_taken))
            print_board(agent_position)
            print("\n")
            sleep(1)

        action_chances = policy[agent_position]
        draw = random()

        # Walk the cumulative distribution until the interval holding `draw` is found.
        interval_start = 0
        for action, chance in action_chances.items():
            if chance == 0:
                continue
            interval_end = interval_start + chance
            if interval_start <= draw < interval_end:
                agent_position = transitions[agent_position][action]
                action_taken = action
                break
            interval_start = interval_end

        step_number += 1

    if verbose:
        clear_output(wait=True)
        print(status_template.format(step_number, agent_position, action_taken))
        print_board(agent_position)
        if agent_position == 3:
            print("Win!")
        else:
            print("Lose!")

    return step_number

In [33]:
# Estimate the episode length under the uniform random policy by averaging
# the step counts returned by 1000 independent runs of agent(), each from a
# randomly drawn start square.
data = []

for i in range(1000):
    clear_output(wait=True)
    # Progress indicator: (i + 1) / 10 is the percentage of the 1000 runs done.
    print("{}%\n".format((i + 1) / 10))
    data.append(agent(create_random_policy()))
    
# agent() starts its counter at 1, so each entry is one more than the number
# of sampling passes in that episode.
print("Average steps to finish: {}".format(sum(data)/len(data)))

100.0%

Average steps to finish: 22.205


In [34]:
# Watch a single episode under the uniform random policy; the cell's value is
# the step count returned by agent().
agent(create_random_policy(), verbose=True)

Move: 7 Position: 7 Action: N
-----------------
|   |   |   |+1 |
-----------------
|   |   |   | A |
-----------------
|   |   |   |   |
-----------------

Lose!


7

In [0]:
def create_greedy_policy(V_s):
    """Derive the policy that is greedy with respect to a state-value table.

    For every square other than 3 and 7, the probability mass is split
    equally among the moves whose successor square has the highest value in
    ``V_s``. Ties are detected with exact equality. Squares 3 and 7 get
    probability 0.0 for every move.

    Args:
        V_s: Mapping from square index (0-11) to its value.

    Returns:
        dict: ``{square: {action: probability}}`` for all 12 squares.
    """
    actions = ['N', 'S', 'E', 'W']
    s_to_sprime = create_state_to_state_prime_verbose_map()
    policy = {}

    for state in range(12):
        if state == 3 or state == 7:
            # No successor values are needed for these squares.
            policy[state] = {'N': 0.0, 'E': 0.0, 'S': 0.0, 'W': 0.0}
            continue

        state_values = {a: V_s[s_to_sprime[state][a]] for a in actions}
        # Compute the maximum once instead of once per candidate action.
        best_value = max(state_values.values())
        max_actions = [a for a, v in state_values.items() if v == best_value]
        policy[state] = {a: 1 / len(max_actions) if a in max_actions else 0.0 for a in actions}
    return policy


In [0]:
def iterative_policy_evaluation(policy, theta=0.01, discount_rate=0.5):
    """Estimate the state values of ``policy`` by repeated in-place sweeps.

    Every transition is charged a reward of -1. Squares 3 and 7 are never
    updated and keep the value 0. Each updated value is rounded to one
    decimal place before the change is measured.

    Args:
        policy: ``{square: {action: probability}}`` for the squares being
            evaluated.
        theta: Sweeping stops once the largest change seen in a sweep is
            below this threshold.
        discount_rate: Factor applied to the value of the successor square.

    Returns:
        dict: ``{square: value}`` for all 12 squares.
    """
    values = {square: 0 for square in range(12)}
    transition_prob = create_probability_map()
    evaluated_squares = [0, 1, 2, 4, 5, 6, 8, 9, 10, 11]

    def expected_return(square, action):
        # One-step look-ahead: reward of -1 plus the discounted successor value,
        # weighted by the transition probability of each candidate successor.
        outcome = 0
        for successor in range(12):
            outcome += transition_prob[(successor, -1, square, action)] * (-1 + discount_rate * values[successor])
        return outcome

    delta = 100  # larger than the defaults so that at least one sweep runs
    while not delta < theta:
        delta = 0
        for square in evaluated_squares:
            previous = values[square]

            weighted = 0
            for action in ["N", "E", "S", "W"]:
                weighted += policy[square][action] * expected_return(square, action)

            values[square] = round(weighted, 1)
            delta = max(delta, abs(previous - values[square]))
    return values

In [42]:
# Two rounds of evaluate-then-improve, starting from the uniform random policy.
# Round 1: evaluate the random policy, then act greedily on those values.
policy = create_random_policy()
V_s = iterative_policy_evaluation(policy)
policy = create_greedy_policy(V_s)
print(V_s)

# Round 2: evaluate the greedy policy from round 1 and improve it once more.
V_s = iterative_policy_evaluation(policy)
policy = create_greedy_policy(V_s)
print(V_s)

{0: -1.9, 1: -1.9, 2: -1.7, 3: 0, 4: -1.9, 5: -1.9, 6: -1.7, 7: 0, 8: -1.9, 9: -1.9, 10: -1.9, 11: -1.6}
{0: -1.9, 1: -1.5, 2: -1.0, 3: 0, 4: -1.9, 5: -1.5, 6: -1.0, 7: 0, 8: -1.9, 9: -1.8, 10: -1.5, 11: -1.0}


In [43]:
# Repeat the 1000-episode average, this time using the `policy` left in the
# notebook namespace by the preceding evaluate/improve cell.
data = []

for i in range(1000):
    clear_output(wait=True)
    # Progress indicator: (i + 1) / 10 is the percentage of the 1000 runs done.
    print("{}%\n".format((i + 1) / 10))
    data.append(agent(policy))
    
print("Average steps to finish: {}".format(sum(data)/len(data)))

100.0%

Average steps to finish: 3.207


In [40]:
# Watch a single episode under the current `policy`; note that agent() stops
# on square 7 ("Lose!") just as it does on square 3 ("Win!").
agent(policy, verbose=True)

Move: 3 Position: 7 Action: N
-----------------
|   |   |   |+1 |
-----------------
|   |   |   | A |
-----------------
|   |   |   |   |
-----------------

Lose!


3

In [0]:
def value_iteration(V_s, theta=0.01, discount_rate=0.5):
    """Run value iteration on the grid, updating ``V_s`` in place.

    Every transition is charged a reward of -1. Each sweep replaces the value
    of every square except 3 and 7 with the best one-step return over the
    four moves, rounded to four decimal places.

    Args:
        V_s: ``{square: value}`` for all 12 squares. This dictionary is
            modified and is also the object returned.
        theta: Sweeping stops once the largest change seen in a sweep is
            below this threshold.
        discount_rate: Factor applied to the value of the successor square.

    Returns:
        dict: The same ``V_s`` object, holding the updated values.
    """
    transition_prob = create_probability_map()
    updated_squares = [0, 1, 2, 4, 5, 6, 8, 9, 10, 11]

    delta = 100  # larger than the defaults so that at least one sweep runs
    while not delta < theta:
        delta = 0
        for square in updated_squares:
            previous = V_s[square]

            action_returns = []
            for action in ["N", "S", "E", "W"]:
                outcome = 0
                for successor in range(12):
                    outcome += transition_prob[(successor, -1, square, action)] * (-1 + discount_rate * V_s[successor])
                action_returns.append(outcome)

            V_s[square] = round(max(action_returns), 4)
            delta = max(delta, abs(previous - V_s[square]))
    return V_s

In [0]:
# Start value iteration from an all-zero table, then derive the greedy policy
# from the resulting values.
V_s = {i: 0 for i in range(12)}
V_s = value_iteration(V_s)
policy = create_greedy_policy(V_s)

print(V_s)

In [0]:
# Average step count over 1000 episodes using the `policy` produced by the
# value-iteration cell.
data = []

for i in range(1000):
    clear_output(wait=True)
    # Progress indicator: (i + 1) / 10 is the percentage of the 1000 runs done.
    print("{}%\n".format((i + 1) / 10))
    data.append(agent(policy))
    
print("Average steps to finish: {}".format(sum(data)/len(data)))

In [0]:
# Watch a single episode under the current `policy`.
agent(policy, verbose=True)

In [45]:
# Inspect the transition table: for each square, the square reached by each move.
create_state_to_state_prime_verbose_map()

{0: {'E': 1, 'N': 0, 'S': 4, 'W': 0},
 1: {'E': 2, 'N': 1, 'S': 5, 'W': 0},
 2: {'E': 3, 'N': 2, 'S': 6, 'W': 1},
 3: {'E': 0, 'N': 0, 'S': 0, 'W': 0},
 4: {'E': 5, 'N': 0, 'S': 8, 'W': 4},
 5: {'E': 6, 'N': 1, 'S': 9, 'W': 4},
 6: {'E': 7, 'N': 2, 'S': 10, 'W': 5},
 7: {'E': 0, 'N': 0, 'S': 0, 'W': 0},
 8: {'E': 9, 'N': 4, 'S': 8, 'W': 8},
 9: {'E': 10, 'N': 5, 'S': 9, 'W': 8},
 10: {'E': 11, 'N': 6, 'S': 10, 'W': 9},
 11: {'E': 11, 'N': 7, 'S': 11, 'W': 10}}

In [0]:
# Inspect the uniform random policy (0.25 per move, 0.0 on squares 3 and 7).
create_random_policy()

In [0]:
# One entry per (state, move, state_prime) combination: 12 * 4 * 12 = 576.
len(create_probability_map())