In [1]:
import copy
import numpy as np

### Value Iteration Grid World

The agent should reach the top-left or bottom-right corner of a 4x4 grid (the two terminal states). Possible actions are left/right/up/down, and every move yields a reward of -1.

In [2]:
# 4x4 table of state values: every cell starts at -1, the two corner
# (terminal) cells start at 0.
val = [[-1] * 4 for _ in range(4)]
val[0][0] = val[3][3] = 0

# Discount factor applied to the successor state's value.
discount = 1
# Number of full sweeps over the grid.
num_episodes = 10

# Terminal cells, stored as (row, col) pairs.
final_states = {(0, 0), (3, 3)}

def get_neighbour(i,j):
    """Return the cells adjacent to (i, j) that lie inside the 4x4 grid.

    Candidates are considered in the order (i-1, j), (i+1, j), (i, j-1),
    (i, j+1); those with either coordinate outside 0..3 are dropped.
    """
    offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))
    shifted = ((i + di, j + dj) for di, dj in offsets)
    return [(x, y) for x, y in shifted if 0 <= x <= 3 and 0 <= y <= 3]

# Synchronous value iteration: each sweep reads only from `backup`, a snapshot
# of the values taken at the start of the sweep, and writes the new values
# into `val`.
for _ in range(num_episodes):
    backup = copy.deepcopy(val)
    for i in range(4):
        for j in range(4):
            if (i,j) in final_states:
                # Terminal cells keep their initial value.
                continue
            neighbours = get_neighbour(i, j)
            # Best successor value according to the snapshot. The previous
            # version compared against `backup` but stored the running maximum
            # from the partially updated `val`, so the two could disagree.
            best_value = max(backup[x][y] for x, y in neighbours)
            ans = [(x, y) for x, y in neighbours if backup[x][y] == best_value]
            # Break ties at random; all tied successors share the same value,
            # so the pick only decides which cell counts as "the next state".
            index = np.random.randint(0, len(ans))
            x, y = ans[index]  # This is next state
            reward = -1
            val[i][j] = reward + discount * backup[x][y]

In [3]:
# Print the value table, one grid row per line.
for row in val:
    print(row)

[0, -1, -2, -3]
[-1, -2, -3, -2]
[-2, -3, -2, -1]
[-3, -2, -1, 0]


### Q Learning

We want to populate the Q-table. Each grid cell stores a dictionary whose keys are the actions available in that cell and whose values are the corresponding Q-values.

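Update rule:
Take the action with the highest current Q-value (ties are broken at random), then
update $Q(s, a) = r + \max_{a'} Q(s', a')$, where $r = -1$ is the step reward and $s'$ is the state reached by taking $a$ in $s$.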

In [4]:
def get_valid_actions(i,j):
    """Return the names of the moves that keep (i, j) inside the 4x4 grid.

    Moves are checked in the order 'left', 'right', 'up', 'down'; a move is
    kept when both coordinates of the cell it leads to lie in 0..3.
    """
    moves = (
        ('left', (i, j-1)),
        ('right', (i, j+1)),
        ('up', (i-1, j)),
        ('down', (i+1, j)),
    )
    return [name for name, (x, y) in moves if 0 <= x <= 3 and 0 <= y <= 3]

In [5]:
# Q-table: one dict per grid cell mapping each valid action name to its
# Q-value, all starting at 0. Terminal cells are left empty here.
q = [
    [
        {} if (i, j) in final_states else {a: 0 for a in get_valid_actions(i, j)}
        for j in range(4)
    ]
    for i in range(4)
]

# The two terminal corners get a single placeholder action worth 0.
q[0][0] = {'None':0}
q[3][3] = {'None':0}

In [7]:
# Q-learning settings: number of full sweeps over the grid, and the
# terminal cells as (row, col) pairs.
num_plays = 80
final_states = {(0, 0), (3, 3)}

def pretty_print(arr):
    """Print each element of `arr` on its own line, followed by two blank lines."""
    # The trailing "\n" item plus print's own line ending reproduce the
    # separate print("\n") call that closes the listing.
    print(*arr, "\n", sep="\n")
    
def get_next_state(i,j,action):
    """Return the (row, col) reached from (i, j) by taking `action`.

    `action` must be one of 'left', 'right', 'up', 'down'; any other key
    raises KeyError. No bounds check is done here.
    """
    offsets = {'left': (0, -1), 'right': (0, 1), 'up': (-1, 0), 'down': (1, 0)}
    di, dj = offsets[action]
    return (i + di, j + dj)

# Q-learning sweeps: in every non-terminal cell, take a greedy action
# (ties broken at random) and back its Q-value up from the next state.
for _ in range(num_plays):
    # Snapshot of the table at the start of the sweep; next-state values are
    # read from it so updates made during this sweep do not feed into each other.
    backup_q = copy.deepcopy(q)
#     pretty_print(backup_q)  # uncomment to trace the table sweep by sweep
    for i in range(4):
        for j in range(4):
            if (i,j) in final_states:
                continue

            # Actions that currently share the highest Q-value in this cell.
            # The comprehension keeps its loop variables local, so the
            # module-level `val` grid from the value-iteration cell is no
            # longer overwritten with a scalar.
            dicc = q[i][j]
            best_val = max(dicc.values())
            actions_list = [a for a, q_val in dicc.items() if q_val == best_val]

            # Select action from list
            index = np.random.randint(0, len(actions_list))
            action = actions_list[index]

            # Find reward for action
            reward = -1
            # Update q table: reward plus the best Q-value of the next state
            x,y = get_next_state(i,j, action)
            val_ns = max(backup_q[x][y].values())
            dicc[action] = reward + val_ns

In [8]:
# Show the learned Q-table: each grid cell holds a dict mapping action name -> Q-value.
q

[[{'None': 0},
  {'left': -1, 'right': -2, 'down': -2},
  {'left': -2, 'right': -3, 'down': -3},
  {'left': -3, 'down': -3}],
 [{'right': -2, 'up': -1, 'down': -2},
  {'left': -2, 'right': -3, 'up': -2, 'down': -3},
  {'left': -3, 'right': -3, 'up': -3, 'down': -3},
  {'left': -3, 'up': -4, 'down': -2}],
 [{'right': -3, 'up': -2, 'down': -3},
  {'left': -3, 'right': -3, 'up': -3, 'down': -3},
  {'left': -3, 'right': -2, 'up': -3, 'down': -2},
  {'left': -2, 'up': -2, 'down': -1}],
 [{'right': -3, 'up': -3},
  {'left': -3, 'right': -2, 'up': -3},
  {'left': -2, 'right': -1, 'up': -2},
  {'None': 0}]]

In [9]:
# Greedy policy grid: for every cell, the action name(s) with the highest
# Q-value, joined with '/' when several actions tie.
ans = [[None for _ in range(4)] for _ in range(4)]

for i in range(4):
    for j in range(4):
        dicc = q[i][j]
        # `default=None` keeps an empty dict mapping to an empty string.
        best_val = max(dicc.values(), default=None)
        # The comprehension keeps its loop variables local, so the
        # module-level `val` grid is not overwritten with a scalar.
        ans[i][j] = '/'.join(a for a, q_val in dicc.items() if q_val == best_val)

In [10]:
# Show the greedy policy grid: best action name(s) per cell, tied actions joined with '/'.
ans

[['None', 'left', 'left', 'left/down'],
 ['up', 'left/up', 'left/right/up/down', 'down'],
 ['up', 'left/right/up/down', 'right/down', 'down'],
 ['right/up', 'right', 'right', 'None']]