In [1]:
import numpy as np

# display output
from random import uniform
import random
import time
from IPython.display import display, clear_output

In [2]:
# Available moves, each a [row delta, column delta] pair, listed clockwise from "up".
actions = [
    [-1, 0],  # up
    [0, 1],   # right
    [1, 0],   # down
    [0, -1],  # left
]
action_count = len(actions)  # number of distinct moves

gridSize = 5  # side length of the square grid
state_count = gridSize ** 2  # one state per cell of the grid

In [3]:
class Gridworld():
    """Deterministic square gridworld with two special teleporting cells.

    States are [row, col] lists. Any action taken from cell A ([0, 1]) yields
    reward 10 and moves the agent to A' ([4, 1]); any action taken from cell B
    ([0, 3]) yields reward 5 and moves the agent to B' ([2, 3]). A move that
    would leave the grid keeps the agent in place with reward -1. Every other
    move gives reward 0.

    Note: the A'/B' targets are hard-coded and only lie inside the grid when
    gridSize >= 5.
    """

    def __init__(self, gridSize):
        """Create a gridSize x gridSize world.

        Args:
            gridSize: side length of the square grid.
        """
        self.valueMap = np.zeros((gridSize, gridSize))
        # all states in row-major order, so state [i, j] sits at index i*gridSize + j
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        self.new_pos = [0, 0] # initialize new position for p_transition
        self.transition_prob = 1 # deterministic

    def initial_state(self):
        """Return the starting state: the last state in row-major order
        (the bottom-right cell, e.g. [4, 4] on a 5x5 grid).

        Uses this instance's own state list and size rather than module-level
        globals, so it works for any instance regardless of what else is
        defined in the notebook.
        """
        return self.states[self.size*self.size - 1]

    def transition_reward(self, current_pos, action):
        """Apply `action` in `current_pos` and return the outcome.

        Args:
            current_pos: current [row, col] position (list, tuple or 1-D array).
            action: [row delta, col delta] move.

        Returns:
            (new_pos, reward). new_pos is also stored on self.new_pos. It is a
            numpy array for an ordinary move, the `current_pos` object itself
            for a blocked move, and a plain list for the A/B teleports.
        """
        # compare as a plain list so ndarray/tuple positions are accepted too
        position = list(current_pos)

        # special cells override everything else, whatever the action is
        if position == [0, 1]:      # state A -> A'
            self.new_pos = [4, 1]
            return self.new_pos, 10
        if position == [0, 3]:      # state B -> B'
            self.new_pos = [2, 3]
            return self.new_pos, 5

        # get next position: state: [0, 0], action: [0, 1], new_state = [0, 1]
        candidate = np.array(current_pos) + np.array(action)

        # a move that leaves the grid is cancelled and penalised; a range test
        # (instead of matching exactly -1 / size) also catches multi-cell moves
        if np.any(candidate < 0) or np.any(candidate >= self.size):
            self.new_pos = current_pos
            return self.new_pos, -1

        # ordinary move inside the grid
        self.new_pos = candidate
        return self.new_pos, 0

In [4]:
# Build the 5x5 environment
grid = Gridworld(5)

# Action-value table, all zeros: one row per grid cell, one column per action
Q_values = np.zeros((grid.size ** 2, len(actions)))

# Hyperparameters used by the action selection and the Q update
epsilon = 0.2   # exploration parameter read by choose_action
lr = 0.1        # step size applied to each Q update
gamma = 0.99    # discount multiplier on the next state's Q value

In [5]:
def choose_action(state):
    """Pick an action index for `state` with an epsilon-greedy rule.

    With probability (n-1)*epsilon/n (n = number of actions) a non-greedy
    action is returned, drawn uniformly from the actions other than the
    current argmax; otherwise the argmax of Q_values[state] is returned.

    Reads the module-level `actions`, `epsilon` and `Q_values`.

    Args:
        state: row index of the state in Q_values.

    Returns:
        The chosen action index as a plain int.
    """
    n_actions = len(actions)

    # find best action based on Q values
    best_action_index = int(np.argmax(Q_values[state]))

    # probability of deliberately picking something other than the best action
    explore_prob = (n_actions - 1) * (epsilon / n_actions)

    if np.random.random() < explore_prob:
        # explore: sample directly among the non-greedy actions (no retry loop,
        # and no hard-coded action count)
        other_actions = [a for a in range(n_actions) if a != best_action_index]
        return int(np.random.choice(other_actions))

    # exploit
    return best_action_index

In [6]:
# Train for 500 episodes of at most 200 steps each.
# Note: next_action_index below is only used inside the update target; the
# action actually taken on the following step comes from a fresh
# choose_action call at the top of the next iteration.
for episode in range(500):

    # every episode starts from the state returned by grid.initial_state()
    state = grid.initial_state()

    for step in range(200):

        # position of the current state in grid.states (= its row in Q_values)
        state_index = grid.states.index(state)

        # epsilon-greedy choice, then look up the matching [row, col] move
        action_index = choose_action(state_index)
        action_vector = actions[action_index]

        # take the move; find the resulting state's index and sample an action there
        next_state_vector, reward = grid.transition_reward(state, action_vector)
        next_state_index = grid.states.index(list(next_state_vector))
        next_action_index = choose_action(next_state_index)

        # move Q(s, a) a fraction lr towards reward + gamma * Q(s', a')
        td_target = reward + gamma * Q_values[next_state_index][next_action_index]
        Q_values[state_index][action_index] += lr * (td_target - Q_values[state_index][action_index])

        # continue from the new state
        state = list(next_state_vector)

print('training finished', Q_values, sep='\n')

training finished
[[ 2.56624924e-01  2.23710608e+01  0.00000000e+00  1.89866306e+00]
 [ 5.82310328e+01  1.78448708e+01  1.63911635e+01  1.99546012e+01]
 [ 4.64254626e+01  1.02904352e+02  6.34955760e+01  3.32206281e+01]
 [ 1.10339277e+02  1.10217527e+02  1.10919336e+02  1.10397525e+02]
 [ 4.44718671e+01  2.27792250e+00  1.01998593e+02  2.15294750e+01]
 [ 9.90287289e+00  1.66715970e+00  9.76276353e-01 -9.51547077e-02]
 [ 2.86960141e+01  9.67722616e+01  2.66265532e+01  4.64866410e+00]
 [ 9.31411436e+01  1.08370863e+02  9.74102300e+01  7.91681531e+01]
 [ 1.09680905e+02  1.05050468e+02  1.05000188e+02  1.01450993e+02]
 [ 7.20504282e+01  1.02489592e+02  9.86204230e+01  1.09196971e+02]
 [ 1.52789481e+00  5.89275653e+01  7.75354808e+00  5.09868116e+00]
 [ 5.79797149e+01  1.05097046e+02  6.37558181e+01  2.93817074e+01]
 [ 1.00820183e+02  1.07250628e+02  6.71579553e+01  9.50645459e+01]
 [ 1.07799180e+02  1.03498382e+02  1.01028113e+02  1.02869130e+02]
 [ 1.01684047e+02  9.66543243e+01  8.9834661

In [7]:
# FIND ARGMAX POLICY

import pandas as pd

# define column and index
columns = range(grid.size)
index = range(grid.size)
# define dataframe to represent policy table (rows/columns mirror the grid)
policy_table = pd.DataFrame(index=index, columns=columns)

# action names, in the same order as the columns of Q_values
action_names = ['up', 'right', 'down', 'left']

# iterate through Q matrix to find best action
# as action name (eg. left, right, up, down)
for state in range(grid.size*grid.size):

    # find the best action at each state
    best_action = np.argmax(Q_values[state])

    # get action name
    action_name = action_names[best_action]

    # states are numbered row by row, so integer division/remainder recovers
    # the grid coordinates exactly (no float rounding involved)
    row, column = divmod(state, grid.size)

    # assign action name with a single .loc call; chained indexing
    # (.loc[row][column] = ...) may write to a temporary copy instead
    policy_table.loc[row, column] = action_name

print("Policy Table: ")
print(policy_table)
print()

Policy Table: 
       0      1      2     3     4
0  right     up  right  down  down
1     up  right  right    up  left
2  right  right  right    up  left
3  right     up   left    up  left
4  right     up   left    up  left

