# Since the prince has to first get the key, we will divide the task into two parts.

Environment 1 covers the first part: moving from the start to the key. Environment 2 covers the second part: moving from the key to the princess.

![title](Grid_World_Key.png)

At any moment, the agent can take one of four actions: Up (^), Left (<), Down (v), Right (>). <br>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from gridWorldGameStart2Key import standard_gridS2K, negative_gridS2K, print_values, print_policy
from gridWorldGameKey2Goal import standard_grid, negative_grid, print_values, print_policy

# Presumably a convergence threshold; no cell shown in this notebook reads it.
SMALL_ENOUGH = 1e-3
# Discount factor: multiplies the next state's value in the TD(0) update.
GAMMA = 0.9
# Actions that random_action samples from when it explores.
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')
# Step size: fraction of the TD error added to V[s] on every update.
ALPHA = 0.1

In [4]:
def random_action(a, eps=0.1):
  """Epsilon-soft action selection.

  Draws one uniform number in [0, 1). If it is below ``1 - eps`` the proposed
  action ``a`` is returned unchanged; otherwise an action is sampled uniformly
  from ALL_POSSIBLE_ACTIONS (which may happen to be ``a`` again). The random
  branch is what lets episodes reach states the policy alone never visits.
  """
  draw = np.random.random()
  keep_proposed = draw < (1 - eps)
  return a if keep_proposed else np.random.choice(ALL_POSSIBLE_ACTIONS)

In [5]:
def play_gameS2K(g, p, start=(3, 0)):
  """Play one episode on the start-to-key grid and record the trajectory.

  Args:
    g: grid environment; must provide set_state, game_over, move and
      current_state.
    p: policy dict mapping a state to the action proposed in that state.
      A KeyError is raised if the agent reaches a non-terminal state that
      has no entry in ``p``.
    start: state the episode begins in. Defaults to (3, 0), the start cell
      this notebook uses for the first leg.

  Returns:
    List of (state, reward) tuples. The first entry is (start, 0); every
    later entry is the state reached by a move and the reward that move
    returned. The last entry is the state in which g.game_over() became true.
  """
  s = start
  g.set_state(s)
  states_and_rewards_S2K = [(s, 0)] # (state, reward) tuples
  while not g.game_over():
    # The policy proposes an action; random_action may swap it for a random one.
    a = random_action(p[s])
    r = g.move(a)
    s = g.current_state()
    states_and_rewards_S2K.append((s, r))
  return states_and_rewards_S2K

In [6]:
def play_gameK2G(g, p, start=(4, 3)):
  """Play one episode on the key-to-goal grid and record the trajectory.

  Args:
    g: grid environment; must provide set_state, game_over, move and
      current_state.
    p: policy dict mapping a state to the action proposed in that state.
      A KeyError is raised if the agent reaches a non-terminal state that
      has no entry in ``p``.
    start: state the episode begins in. Defaults to (4, 3), the cell where
      the first leg of this notebook ends.

  Returns:
    List of (state, reward) tuples. The first entry is (start, 0); every
    later entry is the state reached by a move and the reward that move
    returned. The last entry is the state in which g.game_over() became true.
  """
  s = start
  g.set_state(s)
  states_and_rewards_K2G = [(s, 0)] # list of tuples of (state, reward)
  while not g.game_over():
    # The policy proposes an action; random_action may swap it for a random one.
    a = random_action(p[s])
    r = g.move(a)
    s = g.current_state()
    states_and_rewards_K2G.append((s, r))
  return states_and_rewards_K2G

In [7]:
# Environment 1: the start-to-key grid.
gridS2K = standard_gridS2K()

# print rewards
# NOTE: print_values is imported from both grid modules; the later import
# (gridWorldGameKey2Goal) is the one bound here.
print("rewards:")
print_values(gridS2K.rewards, gridS2K)

rewards:
------------------------------------------
 0.00| 0.00|-1.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00|-1.00| 0.00| 0.00| 0.00|-1.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 1.00| 0.00| 0.00| 0.00|
------------------------------------------
-1.00| 0.00| 0.00| 0.00| 0.00| 0.00|-1.00|


In [8]:
# Environment 2: the key-to-goal (princess) grid.
gridK2G = standard_grid()

# print rewards
# The printed table below shows the +1 reward in the top-right cell for this grid.
print("rewards:")
print_values(gridK2G.rewards, gridK2G)

rewards:
------------------------------------------
 0.00| 0.00|-1.00| 0.00| 0.00| 0.00| 1.00|
------------------------------------------
 0.00| 0.00|-1.00| 0.00| 0.00| 0.00|-1.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
-1.00| 0.00| 0.00| 0.00| 0.00| 0.00|-1.00|


In [9]:
# state -> action
# Fixed, hand-written policy for the start-to-key grid. Keys are (row, col)
# with row 0 at the top of the printed grid; values are one of 'U', 'D', 'L', 'R'.
# Cells with no entry are printed blank by print_policy.
policyS2K = {
    # row 0
    (0, 0): 'D',
    (0, 1): 'L',
    (0, 3): 'D',
    (0, 5): 'R',
    (0, 6): 'D',

    # row 1
    (1, 0): 'U',
    (1, 3): 'D',
    (1, 5): 'U',


    # row 2
    (2, 0): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'R',
    (2, 4): 'R',
    (2, 5): 'U',
    (2, 6): 'U',

    # row 3 -- (3, 0) is the start cell used by play_gameS2K
    (3, 0): 'D',
    (3, 1): 'U',
    (3, 2): 'U',
    (3, 4): 'U',
    (3, 5): 'U',
    (3, 6): 'U',

    # row 4 -- leads right towards (4, 3), the +1 cell in the S2K reward table
    (4, 0): 'R',
    (4, 1): 'R',
    (4, 2): 'R',
    (4, 4): 'U',
    (4, 6): 'U',

    # row 5
    (5, 1): 'U',
    (5, 3): 'U',
    (5, 4): 'U',
    (5, 5): 'L',

}
# initial policy
print("initial policy for S2K:")
print_policy(policyS2K, gridS2K)

initial policy for S2K:
------------------------------------------
  D  |  L  |     |  D  |     |  R  |  D  |
------------------------------------------
  U  |     |     |  D  |     |  U  |     |
------------------------------------------
  R  |  R  |  R  |  R  |  R  |  U  |  U  |
------------------------------------------
  D  |  U  |  U  |     |  U  |  U  |  U  |
------------------------------------------
  R  |  R  |  R  |     |  U  |     |  U  |
------------------------------------------
     |  U  |     |  U  |  U  |  L  |     |


In [25]:
#state -> action
# Fixed, hand-written policy for the key-to-goal grid. Keys are (row, col)
# with row 0 at the top of the printed grid; values are one of 'U', 'D', 'L', 'R'.
# Cells with no entry are printed blank by print_policy.
policyK2G = {
    # row 0 -- (0, 6), the +1 cell in the K2G reward table, has no entry
    (0, 0): 'D',
    (0, 1): 'L',
    (0, 3): 'D',
    (0, 5): 'R',

    # row 1
    (1, 0): 'U',
    (1, 3): 'D',
    (1, 5): 'U',


    # row 2
    (2, 0): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'R',
    (2, 4): 'R',
    (2, 5): 'U',
    (2, 6): 'U',

    # row 3
    (3, 0): 'U',
    (3, 1): 'U',
    (3, 2): 'U',
    (3, 4): 'U',
    (3, 5): 'U',
    (3, 6): 'U',

    # row 4 -- (4, 3) is the start cell used by play_gameK2G
    (4, 0): 'R',
    (4, 1): 'R',
    (4, 2): 'U',
    (4, 3): 'R',
    (4, 4): 'U',
    (4, 6): 'U',

    # row 5
    (5, 1): 'U',
    (5, 3): 'U',
    (5, 4): 'U',
    (5, 5): 'L',

}
# initial policy
print("initial policy for K2G:")
print_policy(policyK2G, gridK2G)

initial policy for K2G:
------------------------------------------
  D  |  L  |     |  D  |     |  R  |     |
------------------------------------------
  U  |     |     |  D  |     |  U  |     |
------------------------------------------
  R  |  R  |  R  |  R  |  R  |  U  |  U  |
------------------------------------------
  U  |  U  |  U  |     |  U  |  U  |  U  |
------------------------------------------
  R  |  R  |  U  |  R  |  U  |     |  U  |
------------------------------------------
     |  U  |     |  U  |  U  |  L  |     |


In [11]:
# Value table for S2K: one zero-valued entry per state of the start-to-key grid.
states = gridS2K.all_states()
V = {state: 0 for state in states}

# Display the all-zero starting values on the grid.
print_values(V, gridS2K)

------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|


In [12]:
# Value table for K2G: one zero-valued entry per state of the key-to-goal grid.
# Note that this rebinds the same name `V` that held the S2K table.
states = gridK2G.all_states()
V = {state: 0 for state in states}

# Display the all-zero starting values on the grid.
print_values(V, gridK2G)

------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00|


In [14]:
# TD(0) evaluation of policyS2K on the start-to-key grid.
# Start from a fresh table built from this grid's own states: the notebook-level
# V was last initialised from gridK2G.all_states(), and rebuilding it here also
# keeps the result from accumulating when the cell is re-run.
V = {state: 0 for state in gridS2K.all_states()}
for it in range(100):
  states_and_rewards_S2K = play_gameS2K(gridS2K, policyS2K)
  # Pair each visited state with its successor; r is the reward received on
  # arriving at s2 (the reward stored alongside s itself is not needed).
  for (s, _r_in), (s2, r) in zip(states_and_rewards_S2K, states_and_rewards_S2K[1:]):
    # Updating values: move V[s] a fraction ALPHA towards r + GAMMA*V[s2]
    V[s] = V[s] + ALPHA*(r + GAMMA*V[s2] - V[s])

In [15]:
# Show the value estimates produced by the TD(0) loop on the S2K grid.
print("final values:")
print_values(V, gridS2K)
# No cell shown in this notebook modifies policyS2K (it is only evaluated),
# so this prints the same table as the initial policy.
print("final policy:")
print_policy(policyS2K, gridS2K)

final values:
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00|-0.52|-0.86|
------------------------------------------
 0.00| 0.00| 0.00| 0.00| 0.00|-0.31| 0.00|
------------------------------------------
-0.00|-0.00|-0.00|-0.01|-0.03|-0.10| 0.00|
------------------------------------------
 0.67|-0.00|-0.00| 0.00|-0.00| 0.00| 0.00|
------------------------------------------
 0.75| 0.84| 0.94| 0.00| 0.00| 0.00| 0.00|
------------------------------------------
 0.00| 0.17| 0.00| 0.00| 0.00| 0.00| 0.00|
final policy:
------------------------------------------
  D  |  L  |     |  D  |     |  R  |  D  |
------------------------------------------
  U  |     |     |  D  |     |  U  |     |
------------------------------------------
  R  |  R  |  R  |  R  |  R  |  U  |  U  |
------------------------------------------
  D  |  U  |  U  |     |  U  |  U  |  U  |
------------------------------------------
  R  |  R  |  R  |     |  U  |     |  U  |
--------------------------

In [16]:
# Trajectory of the last S2K training episode, as (state, reward) tuples.
states_and_rewards_S2K

[((3, 0), 0), ((4, 0), 0), ((4, 1), 0), ((4, 2), 0), ((4, 2), 0), ((4, 3), 1)]

In [21]:
# Start-to-key path as {state: policy action}, taken from the last episode.
# Every entry except the final one is a state the agent acted from; the final
# entry is where the episode ended, so it is left out. Slicing (instead of
# testing `reward == 0`) keeps the path intact even if steps carry a nonzero
# reward. The stored action is the policy's proposal for that state, which may
# differ from the move actually taken when random_action explored.
F_P_Path_S2K = {
    state: policyS2K[state]
    for state, _reward in states_and_rewards_S2K[:-1]
}

In [22]:
# Render only the cells on the start-to-key path; all other cells print blank.
print("Path from Start to Key ")
print_policy(F_P_Path_S2K,gridS2K)

Path from Start to Key 
------------------------------------------
     |     |     |     |     |     |     |
------------------------------------------
     |     |     |     |     |     |     |
------------------------------------------
     |     |     |     |     |     |     |
------------------------------------------
  D  |     |     |     |     |     |     |
------------------------------------------
  R  |  R  |  R  |     |     |     |     |
------------------------------------------
     |     |     |     |     |     |     |


In [26]:
# TD(0) evaluation of policyK2G on the key-to-goal grid.
# Start from a fresh table built from this grid's own states so the estimates
# are not seeded with values learned on the S2K reward function, and so that
# re-running the cell does not keep accumulating.
V = {state: 0 for state in gridK2G.all_states()}
for it in range(100):
    states_and_rewards_K2G = play_gameK2G(gridK2G, policyK2G)
    # Pair each visited state with its successor; r is the reward received on
    # arriving at s2 (the reward stored alongside s itself is not needed).
    for (s, _r_in), (s2, r) in zip(states_and_rewards_K2G, states_and_rewards_K2G[1:]):
        # Updating values: move V[s] a fraction ALPHA towards r + GAMMA*V[s2]
        V[s] = V[s] + ALPHA*(r + GAMMA*V[s2] - V[s])

In [27]:
# Trajectory of the last K2G training episode, as (state, reward) tuples.
states_and_rewards_K2G

[((4, 3), 0),
 ((4, 4), 0),
 ((3, 4), 0),
 ((2, 4), 0),
 ((2, 5), 0),
 ((1, 5), 0),
 ((0, 5), 0),
 ((0, 6), 1)]

In [28]:
# Key-to-goal path as {state: policy action}, taken from the last episode.
# Every entry except the final one is a state the agent acted from; the final
# entry is where the episode ended, so it is left out. Slicing (instead of
# testing `reward == 0`) keeps the path intact even if steps carry a nonzero
# reward. The stored action is the policy's proposal for that state, which may
# differ from the move actually taken when random_action explored.
F_P_Path_K2G = {
    state: policyK2G[state]
    for state, _reward in states_and_rewards_K2G[:-1]
}

In [29]:
# Render only the cells on the key-to-goal path; all other cells print blank.
print("Path from Key To Goal")
print_policy(F_P_Path_K2G,gridK2G)

Path from Key To Goal
------------------------------------------
     |     |     |     |     |  R  |     |
------------------------------------------
     |     |     |     |     |  U  |     |
------------------------------------------
     |     |     |     |  R  |  U  |     |
------------------------------------------
     |     |     |     |  U  |     |     |
------------------------------------------
     |     |     |  R  |  U  |     |     |
------------------------------------------
     |     |     |     |     |     |     |


In [30]:
# Merge the S2K and K2G path dictionaries into one.
def Merge(dict1, dict2):
    """Return a new dict with the entries of dict1, then those of dict2.

    Neither input is modified. If a key occurs in both, the value from
    dict2 is the one kept.
    """
    combined = dict(dict1)
    combined.update(dict2)
    return combined

# Combine both legs into one {state: action} map. If a state lies on both
# legs, the key-to-goal action (second argument) is the one kept.
F_P_Path = Merge(F_P_Path_S2K, F_P_Path_K2G)

In [31]:
# Render the combined start -> key -> princess path; gridK2G is passed as the grid to draw on.
print("Final Path from Start to Princess")
print_policy(F_P_Path,gridK2G)

Final Path from Start to Princess
------------------------------------------
     |     |     |     |     |  R  |     |
------------------------------------------
     |     |     |     |     |  U  |     |
------------------------------------------
     |     |     |     |  R  |  U  |     |
------------------------------------------
  D  |     |     |     |  U  |     |     |
------------------------------------------
  R  |  R  |  R  |  R  |  U  |     |     |
------------------------------------------
     |     |     |     |     |     |     |
