# Gridworld with First Visit

In [1]:
# Find the value function of policy
import numpy as np

# display output
from random import uniform
import time
from IPython.display import display, clear_output

In [2]:
# Moves are [d_i, d_j] offsets that get added to an [i, j] state,
# listed clockwise starting from "up".
actions = [
    [-1, 0],  # up
    [0, 1],   # right
    [1, 0],   # down
    [0, -1],  # left
]
action_count = len(actions)  # total number of actions

gridSize = 5  # the grid is a square of gridSize by gridSize cells
state_count = gridSize ** 2  # total number of states

In [3]:
class Gridworld():
    """Deterministic square gridworld with two special "teleport" states.

    States are ``[i, j]`` index pairs. Any action taken in state A
    (``[0, 1]``) yields reward 10 and moves the agent to A' (``[4, 1]``);
    any action taken in state B (``[0, 3]``) yields reward 5 and moves the
    agent to B' (``[2, 3]``). A move that would leave the grid yields
    reward -1 and leaves the agent where it was. Every other move yields
    reward 0.
    """

    def __init__(self, gridSize):
        """Create a ``gridSize`` x ``gridSize`` grid.

        Args:
            gridSize: number of cells along each side of the square grid.
        """
        self.valueMap = np.zeros((gridSize, gridSize))
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        self.new_pos = [0, 0] # scratch: position computed by the last reward/p_transition call
        self.pos_check = [0, 0] # raw (possibly off-grid) target of the last p_transition call
        self.transition_prob = 1 # deterministic; not read by the methods of this class

    def initial_state(self):
        """Return the start state: the last entry of ``self.states``.

        For a grid of size ``n`` this is ``[n - 1, n - 1]``. The state is
        taken from this instance rather than from notebook-level globals,
        so it works for any instance and any grid size.
        """
        return self.states[-1]

    def reward(self, current_pos, action):
        """Return the reward for taking ``action`` in ``current_pos``.

        Args:
            current_pos: ``[i, j]`` position (list, tuple or array).
            action: ``[d_i, d_j]`` offset added to ``current_pos``.

        Returns:
            10 in state A, 5 in state B, -1 if the move would leave the
            grid, otherwise 0. A and B take precedence over the off-grid
            penalty.

        Side effect: ``self.new_pos`` is set to the raw target position.
        """
        # take action in current pos
        self.new_pos = np.array(current_pos) + np.array(action)

        # list() keeps the comparisons below valid for tuple / ndarray
        # positions (array == list would be elementwise and ambiguous)
        position = list(current_pos)

        # if in state A, transition to state A'
        if position == [0, 1]:
            return 10
        # if in state B, transition to state B'
        if position == [0, 3]:
            return 5
        # if new pos results in off the grid, return reward -1
        if -1 in self.new_pos or self.size in self.new_pos:
            return -1
        # normally, reward = 0
        return 0

    def p_transition(self, current_pos, action):
        """Return the next position after taking ``action`` in ``current_pos``.

        Example: state ``[0, 0]`` with action ``[1, 0]`` gives ``[1, 0]``.

        Args:
            current_pos: ``[i, j]`` position (list, tuple or array).
            action: ``[d_i, d_j]`` offset added to ``current_pos``.

        Returns:
            The next position. The container type varies: a NumPy array
            for an ordinary move, ``current_pos`` itself when the move
            would leave the grid, and a plain list for the A -> A' and
            B -> B' jumps. Wrap the result in ``list(...)`` if a uniform
            type is needed.

        Side effects: ``self.pos_check`` is set to the raw target position
        and ``self.new_pos`` to the returned position.
        """
        self.new_pos = np.array(current_pos) + np.array(action)
        # a real copy, so later changes to new_pos can never leak into it
        self.pos_check = self.new_pos.copy()

        # if taking an action crosses the border = agent stays in same position
        if -1 in self.new_pos or self.size in self.new_pos:
            self.new_pos = current_pos

        # list() keeps the comparisons below valid for tuple / ndarray positions
        position = list(current_pos)

        # if in state A, transition to state A'
        if position == [0, 1]:
            self.new_pos = [4, 1]

        # if in state B, transition to state B'
        if position == [0, 3]:
            self.new_pos = [2, 3]
        return self.new_pos

In [4]:
# create a grid object sized from the shared gridSize setting, so the grid
# always matches state_count (which is also derived from gridSize)
grid = Gridworld(gridSize)

In [6]:
# show the initial state: the last entry of grid.states, i.e. the
# bottom-right cell [4, 4] of the 5x5 grid
grid.initial_state()

[4, 4]

## First-visit MC Control 

In [7]:
# Build a random stochastic policy: draw integer weights in [0, 1000) for
# every (state, action) pair, then scale each state's row by its total so
# the row can be used as a probability distribution over actions.
raw_weights = np.random.randint(1000, size=(state_count, action_count))
row_totals = raw_weights.sum(axis=1, keepdims=True)
random_policy = raw_weights / row_totals
policy = random_policy

In [8]:
# random policy: one row per state, one column per action;
# each row was normalized by its own sum
policy

array([[0.15957447, 0.17021277, 0.11930091, 0.55091185],
       [0.21981308, 0.2235514 , 0.28635514, 0.27028037],
       [0.27961433, 0.16115702, 0.30440771, 0.25482094],
       [0.14693535, 0.39588581, 0.35138539, 0.10579345],
       [0.31801471, 0.24522059, 0.14632353, 0.29044118],
       [0.33765112, 0.42141623, 0.19905009, 0.04188256],
       [0.18684461, 0.1115348 , 0.25357483, 0.44804576],
       [0.41468927, 0.05875706, 0.39774011, 0.12881356],
       [0.23833333, 0.325     , 0.155     , 0.28166667],
       [0.60904449, 0.03792852, 0.34573304, 0.00729395],
       [0.1125    , 0.19411765, 0.38823529, 0.30514706],
       [0.17424564, 0.36464088, 0.16574586, 0.29536762],
       [0.02671756, 0.18129771, 0.59160305, 0.20038168],
       [0.08573718, 0.08092949, 0.44551282, 0.38782051],
       [0.16755725, 0.29885496, 0.21870229, 0.3148855 ],
       [0.31588367, 0.19105145, 0.10380313, 0.38926174],
       [0.28959576, 0.22597747, 0.03843605, 0.44599072],
       [0.27588235, 0.19823529,

### Create an Episode following policy

In [9]:
# number of steps to simulate in this episode
episode_length = 10

# set initial state
state = grid.initial_state()

# state_list starts with the initial state, so it ends up holding one more
# entry than action_list
state_list = [state]
action_list = []

# generate an episode
for step in range(episode_length):

    # pick an action index from the categorical distribution the policy holds
    # for this state; without a `size` argument np.random.choice returns a
    # scalar, so no array-to-int conversion is needed
    action_index = np.random.choice(action_count, p=policy[grid.states.index(state)])
    action = actions[action_index]

    # get the new state with the chosen action; cast the coordinates to plain
    # ints so every state is an ordinary [i, j] list, whatever container
    # p_transition returned
    new_state = [int(coord) for coord in grid.p_transition(state, action)]
    state = new_state

    # save state and action to list
    state_list.append(state)
    action_list.append(action)

In [10]:
# states visited in the episode, beginning with the initial state
state_list

[[4, 4],
 [4, 4],
 [4, 4],
 [4, 4],
 [4, 4],
 [3, 4],
 [3, 3],
 [3, 2],
 [2, 2],
 [2, 1],
 [2, 2]]

In [11]:
# actions taken in the episode, one per step
action_list

[[0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [-1, 0],
 [0, -1],
 [0, -1],
 [-1, 0],
 [0, -1],
 [0, 1]]

### Loop for each step of episode

In [16]:
# G: running return, starts at zero
# gamma: discount factor multiplied into G in the return update
G, gamma = 0, 0.99

In [13]:
# final state of the episode, i.e. the state reached after the last action
state_list[-1]

[2, 2]

In [14]:
# last action taken in the episode
action_list[-1]

[0, 1]

In [17]:
# return update for the final step: gamma*G + reward.
# state_list holds one more entry than action_list (it also contains the
# initial state), so the last action was taken from state_list[-2];
# state_list[-1] is the state that action led to.
gamma*G + grid.reward(state_list[-2], action_list[-1])

0.0