# Gridworld with First Visit

In [1]:
import numpy as np

# display output
from random import uniform
import random
import time
from IPython.display import display, clear_output

In [2]:
actions = [[-1, 0], [0, 1], [1, 0], [0, -1]] #up, right, down, left = (clockwise from up) 
action_count = len(actions) # total number of actions
gridSize = 5 # create a square grid of gridSize by gridSize
state_count = gridSize*gridSize # total number of states

In [3]:
class Gridworld():
    def __init__(self, gridSize):
        self.valueMap = np.zeros((gridSize, gridSize))
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        self.new_pos = [0, 0] # initialize new position for p_transition
        self.transition_prob = 1 # deterministic
    
    def initial_state(self):        # return initial state
        return grid.states[gridSize*gridSize-1]
   
    def transition_reward(self, current_pos, action): # return the transition probability

        # get next position: state: [0, 0], action: [0, 1], new_state = [0, 1]
        self.new_pos = np.array(current_pos) + np.array(action)

        # normally, reward = 0
        reward = 0

        # if new pos results in off the grid, return reward -1
        if -1 in self.new_pos or self.size in self.new_pos:
            reward = -1
        # if in state A, receive + 10
        if current_pos == [0, 1]:
            reward = 10
        # if in state B, receive + 5
        if current_pos == [0, 3]:
            reward = 5

        # if taking an action crosses the border; agent's new_pos is the same as the current pos
        if -1 in self.new_pos or self.size in self.new_pos: 
            self.new_pos = current_pos
            
        # if in state A, transition to state A'
        if current_pos == [0, 1]:
            self.new_pos = [4, 1]
            
        # if in state B, transition to state B'
        if current_pos == [0, 3]:
            self.new_pos = [2, 3]

        return self.new_pos, reward

## First-visit MC Control 

In [4]:
# create a grid object
grid = Gridworld(5)

In [5]:
# Initiate a random policy
random_policy = np.random.randint(1000, size=(state_count, action_count))
random_policy = random_policy/random_policy.sum(axis=1)[:,None]
policy = random_policy

In [6]:
# random policy
policy

array([[0.06602564, 0.05961538, 0.24230769, 0.63205128],
       [0.35405941, 0.05821782, 0.23485149, 0.35287129],
       [0.25276823, 0.35700649, 0.35089729, 0.03932799],
       [0.22983479, 0.24344023, 0.08260447, 0.44412051],
       [0.26377377, 0.2559727 , 0.44758654, 0.03266699],
       [0.14536741, 0.07827476, 0.28168264, 0.49467519],
       [0.20148248, 0.32850404, 0.21731806, 0.25269542],
       [0.28804554, 0.28387097, 0.1540797 , 0.2740038 ],
       [0.20271143, 0.21304067, 0.17366043, 0.41058748],
       [0.05172414, 0.42917055, 0.17800559, 0.34109972],
       [0.26844189, 0.29399064, 0.27491904, 0.16264843],
       [0.3783434 , 0.11949957, 0.3852459 , 0.11691113],
       [0.32282282, 0.1026026 , 0.44694695, 0.12762763],
       [0.0994832 , 0.53617571, 0.28036176, 0.08397933],
       [0.27583262, 0.27198975, 0.20239112, 0.24978651],
       [0.21509777, 0.42064575, 0.26512051, 0.09913597],
       [0.39483204, 0.18346253, 0.34677003, 0.0749354 ],
       [0.19420671, 0.14746544,

### Create an Episode following Policy

In [7]:
def generate_episode(steps):

    # set initial state
    state_vector = grid.initial_state()

    # initialize state (with iniitial state), action list and reward list
    state_list = [state_vector]
    action_list = []
    reward_list = []

    # generate an episode
    for i in range(steps):

        # pick an action based on categorical distribution in policy
        action_index = int(np.random.choice(action_count, 1, p=policy[grid.states.index(state_vector)])) 
        action_vector = actions[action_index] # convert the integer index (ie. 0) to action (ie. [-1, 0])

        # get new state and reward after taking action from current state
        new_state_vector, reward = grid.transition_reward(state_vector, action_vector)
        state_vector = list(new_state_vector)

        # save state, action chosen and reward to list
        state_list.append(state_vector)
        action_list.append(action_vector)
        reward_list.append(reward)
        
    return state_list, action_list, reward_list

### First Visit MC

In [8]:
# initialize q values for all state action pairs
Q_values = np.zeros((state_count, action_count))

In [9]:
# intialize parameters
gamma = 0.99
epsilon = 0.2

In [10]:
# define average function
def Average(lst): 
    return sum(lst) / len(lst) 

In [11]:
# iterate 500 times: each time, generating an episode of 200 steps
max_steps = 200

# define variables for keeping track of time steps
Terminal = max_steps
t_list=[]
for i in range(1,max_steps+1):
    t = Terminal - i
    t_list.append(t)

In [12]:
# iteration 500 times
for iteration in range(500):
  
    # generate an episode of specified step count
    state_list, action_list, reward_list = generate_episode(max_steps)
    
#     print("state_list: ", state_list)
#     print("action_list: ", action_list)
#     print("reward_list: ", reward_list)
    
    # intialize G
    G = 0
    
    # initiate returns and visited list to none
    returns_list = []
    visited_list = []

    # loop for each step of episode: T-1, T-2, T-3 ... 0 = 199, 198, 197 ... 0
    for t in t_list:

        # calculate G: starting with the last reward at index t (naturally accounts for pseudocode's "t-1")
        G = gamma*G + reward_list[t]
        # print(G)
        
        # combine state action pair, for example, state = [0,0], action = [0,1], state_action_pair = [0,0,0,1]
        state_action_pair = []
        state_action_pair.extend(state_list[t])
        state_action_pair.extend(action_list[t])
        # print(state_action_pair)

        # check if state action pair have been visited before (if not: continue, else: move to the next time step)
        if state_action_pair not in visited_list:

            # add state action pair to visited list
            visited_list.append(state_action_pair)

            # append G to returns
            returns_list.append(G)

            # find state and action index, for example, converting action [-1, 0] to 0, and same for state #
            state_index = grid.states.index(state_list[t])
            action_index = actions.index(action_list[t])
    #         print("state_index: ", state_index)

            # write Q_values to the state-action pair
            Q_values[state_index][action_index] = Average(returns_list)
    #         print("average return: ", Average(returns_list))

            # choose best action at given state
            choose_action = np.argmax(Q_values[state_index])

            # overwrite policy
            for a in range(action_count): # for action in actions [0, 1, 2, 3]
                if choose_action == a: # if the choose_action is the same as the current action
                    policy[state_index][a] = 1 - epsilon + epsilon/action_count 
                else: # if choose_action is not the same as the current action
                    policy[state_index][a] = epsilon/action_count

In [13]:
# total unique state action pairs at the end of one episode
print(len(visited_list))

26


In [14]:
np.set_printoptions(precision=2)
Q_values

array([[ -1.4 ,   3.57,  -0.4 ,  -0.66],
       [-11.3 ,  -8.21,  -5.18, -11.79],
       [-38.9 ,  -3.53,  -6.33,  -7.78],
       [ -1.08,   1.  ,   0.42,  -4.35],
       [  1.73,   4.33,   0.  ,   3.84],
       [ -3.27, -18.19,   0.  , -18.56],
       [-10.78,  -8.94, -17.97, -18.37],
       [ -7.37,   2.81, -16.05,  -7.57],
       [ -0.73,   1.45,  -1.34,   3.25],
       [  4.57,   2.07,   0.  ,   1.  ],
       [-18.74,  -0.33,  -2.86,  -0.25],
       [ -9.82,   1.37, -17.71,  -4.65],
       [-16.59,   1.91, -15.4 ,   1.23],
       [ -1.51,  -1.11,  -0.26,   2.32],
       [  0.  , -12.67,  -0.79,  -0.08],
       [-18.9 , -25.94,   0.  ,  -9.35],
       [-10.86, -17.4 , -10.58,  -8.61],
       [-17.03, -14.59,  -5.24,  -6.03],
       [ -0.47,  -9.9 ,  -8.99, -24.85],
       [  0.  ,  -0.25,  -8.95, -11.4 ],
       [-19.03, -19.42, -19.18, -20.22],
       [-11.41, -11.71, -11.89, -19.31],
       [-10.73, -27.16, -22.45, -21.37],
       [  4.7 ,   0.78,   4.78, -23.61],
       [ -8.93, 

In [15]:
#up, right, down, left = (clockwise from up) 
policy

array([[0.05, 0.85, 0.05, 0.05],
       [0.05, 0.05, 0.85, 0.05],
       [0.05, 0.85, 0.05, 0.05],
       [0.05, 0.85, 0.05, 0.05],
       [0.05, 0.85, 0.05, 0.05],
       [0.05, 0.05, 0.85, 0.05],
       [0.05, 0.85, 0.05, 0.05],
       [0.05, 0.85, 0.05, 0.05],
       [0.05, 0.05, 0.05, 0.85],
       [0.85, 0.05, 0.05, 0.05],
       [0.05, 0.05, 0.05, 0.85],
       [0.05, 0.85, 0.05, 0.05],
       [0.05, 0.85, 0.05, 0.05],
       [0.05, 0.05, 0.05, 0.85],
       [0.85, 0.05, 0.05, 0.05],
       [0.05, 0.05, 0.85, 0.05],
       [0.05, 0.05, 0.05, 0.85],
       [0.05, 0.05, 0.85, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.85, 0.05, 0.05, 0.05],
       [0.05, 0.05, 0.85, 0.05],
       [0.05, 0.05, 0.05, 0.85]])

## Visualize 

In [16]:
# PRINT POLICY TABLE ################################################################################
# import pandas library
import pandas as pd
# define column and index
columns=range(grid.size)
index = range(grid.size)
# define dataframe to represent policy table
policy_table = pd.DataFrame(index = index, columns=columns)

# iterate through policy to make a table that represents action number
# as action name (eg. left, right, up, down)
for state in range(len(policy)):
    
    # find the best action at each state
    best_action = np.argmax(policy[state])

    # get action name
    if best_action == 0:
        action_name = 'up'
    elif best_action == 1:
        action_name = 'right'
    elif best_action == 2:
        action_name = 'down'
    else:
        action_name = 'left'

    # calculate the row and column coordinate of the current state number
    row = int(state/grid.size)
    column = round((state/grid.size - int(state/grid.size))*grid.size)
            
    # assign action name
    policy_table.loc[row][column] = action_name

print("Policy Table: ")
print(policy_table)
print()

Policy Table: 
       0      1      2      3      4
0  right   down  right  right  right
1   down  right  right   left     up
2   left  right  right   left     up
3   down   left   down     up     up
4     up     up     up   down   left

