# Gridworld with First Visit

In [1]:
# Find the value function of policy
import numpy as np

# display output
from random import uniform
import time
from IPython.display import display, clear_output

In [2]:
actions = [[-1, 0], [0, 1], [1, 0], [0, -1]] #up, right, down, left = (clockwise from up) 
action_count = len(actions) # total number of actions
gridSize = 5 # create a square grid of gridSize by gridSize
state_count = gridSize*gridSize # total number of states

In [3]:
class Gridworld():
    def __init__(self, gridSize):
        self.valueMap = np.zeros((gridSize, gridSize))
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        self.new_pos = [0, 0] # initialize new position for p_transition
        self.pos_check = [0, 0] # a copy of new position
        self.transition_prob = 1 # deterministic
    
    def initial_state(self):
        # return initial state
        return grid.states[gridSize*gridSize-1]
       
    def reward(self, current_pos, action):
        # return the reward        
        
        # take action in current pos
        self.new_pos = np.array(current_pos) + np.array(action)

        # normally, reward = 0
        reward = 0

        # if new pos results in off the grid, return reward -1
        if -1 in self.new_pos or self.size in self.new_pos:
            reward = -1
        # if in state A, transition to state A'
        if current_pos == [0, 1]:
            reward = 10
        # if in state B, transition to state B'
        if current_pos == [0, 3]:
            reward = 5
        return reward
    
    def p_transition(self, current_pos, action):
        # return the transition probability
        # get next position: state: [0, 0], action: [0, 1], new_state = [0, 1]
        self.new_pos = np.array(current_pos) + np.array(action)
        self.pos_check = self.new_pos # make a copy of new pos before being overwritten below

        # if taking an action crosses the border = agent stays in same position
        if -1 in self.new_pos or self.size in self.new_pos: 
            self.new_pos = current_pos
            
        # if in state A, transition to state A'
        if current_pos == [0, 1]:
            self.new_pos = [4, 1]
            
        # if in state B, transition to state B'
        if current_pos == [0, 3]:
            self.new_pos = [2, 3]
        return self.new_pos

In [4]:
# create a grid object
grid = Gridworld(5)

In [5]:
# get initial state (bottom right)
grid.initial_state()

[4, 4]

## First-visit MC Control 

In [6]:
# Initiate a random policy
random_policy = np.random.randint(1000, size=(state_count, action_count))
random_policy = random_policy/random_policy.sum(axis=1)[:,None]
policy = random_policy

In [7]:
# random policy
policy

array([[0.1451049 , 0.04807692, 0.53146853, 0.27534965],
       [0.32613636, 0.22575758, 0.12878788, 0.31931818],
       [0.23265306, 0.16479592, 0.37959184, 0.22295918],
       [0.145469  , 0.25516693, 0.25596184, 0.34340223],
       [0.05632653, 0.25061224, 0.68571429, 0.00734694],
       [0.32350187, 0.25      , 0.41058052, 0.0159176 ],
       [0.25263852, 0.06134565, 0.32915567, 0.35686016],
       [0.12819203, 0.41317671, 0.23493361, 0.22369765],
       [0.35714286, 0.15648496, 0.05451128, 0.4318609 ],
       [0.48404516, 0.17967599, 0.32596956, 0.01030928],
       [0.47902098, 0.04079254, 0.1013986 , 0.37878788],
       [0.03928571, 0.56785714, 0.28785714, 0.105     ],
       [0.37091264, 0.19521718, 0.17520742, 0.25866276],
       [0.13180304, 0.21436505, 0.30766383, 0.34616809],
       [0.10808709, 0.38646967, 0.26749611, 0.23794712],
       [0.42572883, 0.3465988 , 0.15363258, 0.0740398 ],
       [0.38723404, 0.22978723, 0.01702128, 0.36595745],
       [0.27117327, 0.14141414,

### Create an Episode following policy

In [8]:
def generate_episode(steps):

    # set initial state
    state = grid.initial_state()

    # initialize state (with iniitial state), action list
    state_list = [state]
    action_list = []
    reward_list = []

    # generate an episode
    for i in range(200):

        # pick an action based on categorical distribution in policy
        action = int(np.random.choice(action_count, 1, p=policy[grid.states.index(state)]))
        action = actions[action]

        # get reward
        reward = grid.reward(state, action)
    #     print(state)
    #     print(action)
    #     print(reward)

        # get the new state with the chosen action
        new_state = list(grid.p_transition(state, action))
        state = new_state

        # save state and action to list
        state_list.append(state)
        action_list.append(action)

        # save reward to list
        reward_list.append(reward)
        
    return state_list, action_list, reward_list

### Initialize

In [9]:
# initialize q values for all state action pairs
Q_values = np.zeros((state_count, action_count))

### Loop

### Loop for each steo of episode 

In [10]:
# intialize parameters
G = 0
gamma = 0.99
epsilon = 0.2

In [11]:
# define average function
def Average(lst): 
    return sum(lst) / len(lst) 

In [12]:
for episode in range(500):
  
    # generate an episode of specified step count
    state_list, action_list, reward_list = generate_episode(200)
    
    # define variables for keeping track of time steps
    Terminal = len(action_list)
    t_list=[]
    for i in range(1,Terminal+1):
        t = Terminal - i
        t_list.append(t)
    
    # define lists
    returns_list = []
    visited_list = []

    # loop for each step of episode
    for t in t_list:

        # add to G
        G = gamma*G + reward_list[t]

        # combine state action pair
        visited = []
        visited.extend(state_list[t])
        visited.extend(action_list[t])

        # check if state action pair have been visited before
        if visited not in visited_list:

            # add state action pair to visited list
            visited_list.append(visited)

            # append G to returns
            returns_list.append(G)

            # find state and action index
            state_index = grid.states.index(state_list[t])
            action_index = actions.index(action_list[t])
    #         print("state_index: ", state_index)

            # write Q_values to the state-action pair
            Q_values[state_index][action_index] = Average(returns_list)
    #         print("average return: ", Average(returns_list))

            # choose best action at given state
            choose_action = np.argmax(Q_values[state_index])

            # overwrite policy
            for i in range(action_count):
                if choose_action == i:
                    policy[state_index][i] = 1 - epsilon + epsilon/action_count
                else:
                    policy[state_index][i] = epsilon/action_count

In [13]:
Q_values

array([[-44.84718765, -51.10529407, -45.934096  , -50.16334669],
       [-52.24112607, -47.38236652, -30.02940041,   1.30508548],
       [-33.14474249, -59.7146156 , -38.16241718, -46.58664535],
       [-69.09657523, -60.52054487, -56.2764614 , -60.62330343],
       [-59.75060799, -46.08096515, -61.52929634, -59.87735349],
       [-49.39363481, -27.54995697, -46.5143136 , -46.52596221],
       [-61.9800096 , -55.02894877, -54.1934294 , -48.69221377],
       [-45.84932055, -62.56751375, -38.85085286, -61.3793388 ],
       [-60.42276026, -62.13663194, -62.45739602, -60.81496602],
       [-60.23691834, -60.52994436, -62.04584429, -62.17108951],
       [-26.53198205, -49.84761535, -33.82424905, -32.3266496 ],
       [ -1.01260864, -63.14489186, -34.6419286 , -33.76160296],
       [-45.16290856, -56.53106816, -50.08998898, -63.27936744],
       [-62.16340003, -61.29547854, -61.26852713, -56.53633511],
       [-61.08685018, -59.68676363, -61.18616644, -61.29383676],
       [-17.56496536, -49

In [14]:
#up, right, down, left = (clockwise from up) 
policy

array([[0.85, 0.05, 0.05, 0.05],
       [0.05, 0.05, 0.05, 0.85],
       [0.85, 0.05, 0.05, 0.05],
       [0.05, 0.05, 0.85, 0.05],
       [0.05, 0.85, 0.05, 0.05],
       [0.05, 0.85, 0.05, 0.05],
       [0.05, 0.05, 0.05, 0.85],
       [0.05, 0.05, 0.85, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.05, 0.05, 0.05, 0.85],
       [0.05, 0.85, 0.05, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.05, 0.85, 0.05, 0.05],
       [0.05, 0.05, 0.85, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.05, 0.05, 0.85, 0.05],
       [0.05, 0.05, 0.85, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.05, 0.05, 0.85, 0.05],
       [0.05, 0.05, 0.05, 0.85]])

## Visualize 

In [15]:
# PRINT POLICY TABLE ################################################################################
# import pandas library
import pandas as pd
# define column and index
columns=range(grid.size)
index = range(grid.size)
# define dataframe to represent policy table
policy_table = pd.DataFrame(index = index, columns=columns)

# iterate through policy to make a table that represents action number
# as action name (eg. left, right, up, down)
for state in range(len(policy)):
    
    # find the best action at each state
    best_action = np.argmax(policy[state])

    # get action name
    if best_action == 0:
        action_name = 'up'
    elif best_action == 1:
        action_name = 'right'
    elif best_action == 2:
        action_name = 'down'
    else:
        action_name = 'left'

    # calculate the row and column coordinate of the current state number
    row = int(state/grid.size)
    column = round((state/grid.size - int(state/grid.size))*grid.size)
            
    # assign action name
    policy_table.loc[row][column] = action_name

print("Policy Table: ")
print(policy_table)
print()

Policy Table: 
       0     1      2     3      4
0     up  left     up  down  right
1  right  left   down    up     up
2     up    up     up  left  right
3     up    up  right  down     up
4   down  down     up  down   left



# Testing 