In [1]:
import pickle

import numpy as np
import pandas as pd

In [2]:
class Gridworld():
    """Deterministic square gridworld with two special teleporting states.

    Positions are ``[row, column]`` pairs.  Ordinary moves add the action
    vector to the current position and give reward 0; moves that would leave
    the grid keep the agent in place and give reward -1.  Any action taken
    from state A ``[0, 1]`` moves the agent to ``[4, 1]`` with reward 10, and
    any action taken from state B ``[0, 3]`` moves it to ``[2, 3]`` with
    reward 5.  The special target cells are hard-coded, so they only lie
    inside the grid when ``gridSize`` is at least 5.
    """

    # Special states: whatever the action, leaving the key position lands on
    # the paired target position and yields the paired reward.
    _SPECIAL_TRANSITIONS = {
        (0, 1): ([4, 1], 10),   # state A -> A'
        (0, 3): ([2, 3], 5),    # state B -> B'
    }

    def __init__(self, gridSize):
        """Create a ``gridSize`` x ``gridSize`` grid with a zeroed value map."""
        self.valueMap = np.zeros((gridSize, gridSize))
        # every [row, column] pair, enumerated row by row
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        self.new_pos = [0, 0]  # initialize new position for p_transition
        self.transition_prob = 1  # deterministic

    def initial_state(self):
        """Return the initial state: the last enumerated cell (bottom-right).

        Uses this instance's own state list rather than module-level globals,
        so it works for any instance and any grid size.
        """
        return self.states[-1]

    def transition_reward(self, current_pos, action):
        """Apply ``action`` at ``current_pos`` and return the outcome.

        Parameters:
            current_pos: ``[row, column]`` position as a list, tuple or
                numpy array of integers.
            action: ``[d_row, d_column]`` displacement, same kinds accepted.

        Returns:
            ``(new_pos, reward)`` where ``new_pos`` is always a fresh plain
            list (never an alias of ``current_pos``) and ``reward`` is
            10 / 5 when leaving state A / B, -1 when the move would leave
            the grid (the agent stays put), and 0 otherwise.  ``new_pos`` is
            also stored on ``self.new_pos``.
        """
        # Normalise to a tuple of ints so list, tuple and ndarray positions
        # all compare the same way (an ndarray == list comparison inside an
        # `if` raises ValueError).
        position = tuple(int(coord) for coord in current_pos)

        # Special states take precedence over everything else, including
        # actions that would otherwise cross the border.
        special = self._SPECIAL_TRANSITIONS.get(position)
        if special is not None:
            target, reward = special
            self.new_pos = list(target)
            return self.new_pos, reward

        # get next position: state: [0, 0], action: [0, 1], new_state = [0, 1]
        candidate = (np.array(position) + np.array(action)).tolist()

        # if taking the action crosses the border, the agent stays where it
        # is and is penalised
        if any(coord < 0 or coord >= self.size for coord in candidate):
            self.new_pos = list(position)
            return self.new_pos, -1

        # normally, reward = 0
        self.new_pos = candidate
        return self.new_pos, 0

In [3]:
# Build the 5 x 5 gridworld instance used by the cells below.
grid = Gridworld(gridSize=5)

In [4]:
# Load the saved Q-values from disk into q_values.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
with open('MC_Qvalues_0.1.pkl', 'rb') as file:
    q_values = pickle.load(file)

In [5]:
# Number of entries in the loaded q_values container (it is indexed by `run` below).
len(q_values)

20

In [6]:
# Index of the q_values entry to inspect; also used by the policy-table cell.
run = 0
# Display the Q-values stored for that run.
q_values[run]

array([[ -5.19535238,  -4.57143619,  -3.3967326 ,  -2.4701995 ],
       [ -3.90827205, -36.82110202,  -4.43694344, -21.31491017],
       [-16.95657059, -31.45208528, -18.38863922, -29.10518038],
       [ -2.52699927,  -1.18331977,  -1.67602648,  -0.61713134],
       [-20.85596116,  -5.14109566,   2.07023041, -16.85114546],
       [ -6.06385245, -21.78217684, -10.71999121, -20.41932592],
       [  6.63333333,  -1.95212333,   0.        ,  -6.78672378],
       [-31.28784792,  -0.0933968 , -26.52182377, -40.65084391],
       [ -0.52726906, -14.45896016,  -3.61104835,   0.21354628],
       [  0.81737909,  -1.        ,  -1.33977826,   3.61463694],
       [ -9.38571071,  -9.19274093, -21.47501332,  -7.83513613],
       [ -2.18771302,  -2.98142154,  -2.56609424, -26.25963018],
       [-41.3005006 ,  -8.32717664,  -8.84279387,  -2.69641109],
       [  0.44118459,  -2.283798  ,  -2.37181332,  -3.60239549],
       [ -1.9701995 ,  -1.97349967,  -5.24473063,  -8.9915337 ],
       [-11.55625544,  -3

In [7]:
# PRINT POLICY TABLE ################################################################################
# Action index -> action name (0: up, 1: right, 2: down, 3: left).
ACTION_NAMES = ['up', 'right', 'down', 'left']

# define column and index
columns = range(grid.size)
index = range(grid.size)
# define dataframe to represent policy table
policy_table = pd.DataFrame(index=index, columns=columns)

# iterate through the Q-values of the selected run and label every grid cell
# with the name of its highest-valued action (eg. left, right, up, down)
for state, action_values in enumerate(q_values[run]):

    # find the best action at this state and look up its name
    best_action = int(np.argmax(action_values))
    action_name = ACTION_NAMES[best_action]

    # state numbers map to the grid row by row, so integer division and
    # remainder give the (row, column) coordinate exactly, with no float
    # round-off to correct
    row, column = divmod(state, grid.size)

    # single positional write: a chained `.loc[row][column] = ...` assigns
    # into an intermediate row object and is not guaranteed to update
    # policy_table itself
    policy_table.iat[row, column] = action_name

print("Policy Table: ")
print(policy_table)
print()

Policy Table: 
       0     1      2     3      4
0   left    up     up  left   down
1     up    up  right  left   left
2   left    up   left    up     up
3  right  down     up  left  right
4     up  down   left    up   left

