# Gridworld

In [1]:
# Find the value function of policy
import numpy as np

# display output
from random import uniform
import time
from IPython.display import display, clear_output

In [2]:
actions = [[-1, 0], [0, 1], [1, 0], [0, -1]] #up, right, down, left = (clockwise from up) 
action_count = len(actions) # total number of actions
gridSize = 5 # create a square grid of gridSize by gridSize
state_count = gridSize*gridSize # total number of states

In [3]:
class Gridworld():
    def __init__(self, gridSize):
        self.valueMap = np.zeros((gridSize, gridSize))
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        self.new_pos = [0, 0] # initialize new position for p_transition
        self.transition_prob = 1 # deterministic
    
    def initial_state(self):        # return initial state
        return grid.states[gridSize*gridSize-1] # ? self.states
   
    def transition_reward(self, current_pos, action): # return the transition probability

        # get next position: state: [0, 0], action: [0, 1], new_state = [0, 1]
        self.new_pos = np.array(current_pos) + np.array(action)

        # normally, reward = 0
        reward = 0

        # if new pos results in off the grid, return reward -1
        if -1 in self.new_pos or self.size in self.new_pos:
            reward = -1
        # if in state A, receive + 10
        if current_pos == [0, 1]:
            reward = 10
        # if in state B, receive + 5
        if current_pos == [0, 3]:
            reward = 5

        # if taking an action crosses the border; agent's new_pos is the same as the current pos
        if -1 in self.new_pos or self.size in self.new_pos: 
            self.new_pos = current_pos
            
        # if in state A, transition to state A'
        if current_pos == [0, 1]:
            self.new_pos = [4, 1]
            
        # if in state B, transition to state B'
        if current_pos == [0, 3]:
            self.new_pos = [2, 3]

        return self.new_pos, reward

In [4]:
# create a grid object
grid = Gridworld(5)

In [5]:
# get initial state (bottom right)
grid.initial_state()

[4, 4]

Initialize a random policy

In [6]:
# Initiate a random policy
random_policy = np.random.randint(1000, size=(state_count, action_count))
random_policy = random_policy/random_policy.sum(axis=1)[:,None]
policy = random_policy

In [7]:
# random policy
policy

array([[0.28571429, 0.34325397, 0.20337302, 0.16765873],
       [0.71367521, 0.04188034, 0.21623932, 0.02820513],
       [0.52989935, 0.22439313, 0.15748964, 0.08821788],
       [0.43598972, 0.33470437, 0.18200514, 0.04730077],
       [0.29850142, 0.30376671, 0.3799109 , 0.01782098],
       [0.32075472, 0.03054807, 0.56693621, 0.08176101],
       [0.37383635, 0.23272905, 0.27143557, 0.12199902],
       [0.14667941, 0.27806976, 0.23076923, 0.34448161],
       [0.30411449, 0.1842576 , 0.16744186, 0.34418605],
       [0.25082978, 0.26126126, 0.46277857, 0.02513039],
       [0.33826248, 0.18558226, 0.12791128, 0.34824399],
       [0.37379868, 0.30197269, 0.19018715, 0.13404148],
       [0.41270509, 0.31131679, 0.16323096, 0.11274716],
       [0.44218316, 0.34273821, 0.11979648, 0.09528215],
       [0.06121045, 0.42365887, 0.50343879, 0.01169188],
       [0.37399747, 0.06922752, 0.26973407, 0.28704095],
       [0.08014862, 0.38163482, 0.16613588, 0.37208068],
       [0.0212766 , 0.22727273,

### Create an Episode following policy

In [8]:
def generate_episode(steps):

    # set initial state
    state = grid.initial_state()

    # initialize state (with iniitial state), action list and reward list
    state_list = [state]
    action_list = []
    reward_list = []

    # generate an episode
    for i in range(steps):

        # pick an action based on categorical distribution in policy
        action = int(np.random.choice(action_count, 1, p=policy[grid.states.index(state)])) # get index in int
        action = actions[action] # convert the integer index (ie. 0) to action (ie. [-1, 0])

        # get reward (integer value)
        reward = grid.reward(state, action)

        # get the new state with the chosen action
        new_state = list(grid.p_transition(state, action)) # (ie. [1,0])
        state = new_state # set the next state as the current state

        # save state, action chosen and reward to list
        state_list.append(state)
        action_list.append(action)
        reward_list.append(reward)
        
    return state_list, action_list, reward_list

### SARSA(Lamda)

In [9]:
# initialize q values for all state action pairs
Q_values = np.random.randint(0,1000,size = (state_count, action_count))
Q_values

array([[  4,  88, 727,  28],
       [568, 664, 725, 219],
       [451, 519, 517, 494],
       [378, 854, 118, 563],
       [682, 610,  58, 184],
       [285, 232, 476,  58],
       [231, 132, 267,  18],
       [499, 751, 621, 732],
       [964, 649, 746, 329],
       [581, 128, 639, 451],
       [617, 131,  76, 556],
       [988,  56,  36, 853],
       [762, 284, 314, 380],
       [507, 256, 962, 938],
       [575, 407, 330, 936],
       [506, 574, 880, 246],
       [888, 161, 577, 324],
       [658, 810, 352, 636],
       [334, 182, 822, 520],
       [140, 130, 459,  97],
       [839,  99, 176, 376],
       [983, 562, 287, 829],
       [880, 590, 614, 949],
       [238, 586, 967, 201],
       [769, 543, 989,  66]])

In [10]:
# intialize parameters
gamma = 0.99
epsilon = 0.2
lamda = 0.9
alpha = 0.1

In [11]:
# iterate 500 times: each time, generating an episode of 200 steps
max_steps = 200

# define variables for keeping track of time steps
Terminal = max_steps
t_list=[]
for i in range(1,max_steps+1):
    t = Terminal - i
    t_list.append(t)

In [12]:
def choose_action(state, epsilon):
    
    # choose an action type: explore or exploit
    #action_type = int(np.random.choice(2, 1, p=[epsilon,1-epsilon]))
    action_type = int(np.random.choice(2, 1, p=[(len(actions)-1)*(epsilon/len(actions)),1-(len(actions)-1)*(epsilon/len(actions))]))

    # find best action based on Q values
    best_action_index = np.argmax(Q_values[state])

    # choose an action based on exploit or explore
    if action_type == 0:
        # explore
        # print("explore")
        
        # pick a random action
        random_action_index = np.random.choice(range(4))
        
        # while random action is the same as the best action, pick a new action
        while random_action_index == best_action_index:
            random_action_index = np.random.choice(range(4))
        
        action_index = random_action_index
    else:
        # exploit
        # print("exploit")
        action_index = best_action_index
        
    return action_index

In [13]:
#action_type = int(np.random.choice(2, 1, p=[epsilon,1-epsilon]))
#action_type

In [14]:
# iteration 500 times
for iteration in range(500):
    
    # initialize delta
    delta = 0
    
    # initialize S,A (? should i choose an Action using epsilon-greedy here or just select an Action?)
    state_vector = grid.initial_state()
    state_index = grid.states.index(state_vector)
    
    # initialize  eligibility traces for all state action pairs of all states to 0
    z_values = np.zeros((state_count, action_count))
    
    action_index = choose_action(state_index, epsilon)
    action_vector = actions[action_index]
    
    # iteration 200 steps of the episode
    for i in range(max_steps):
        
        # Take action A, oberserve R, S'
        next_state_vector, reward = grid.transition_reward(state_vector, action_vector)
        next_state_index = grid.states.index(list(next_state_vector))
        
        # Choose A' from S' using policy derived from Q (eg. epsilon-greedy)
        next_action_index = choose_action(next_state_index, epsilon)
        next_action_vector = actions[next_action_index]
        
        # update the action-value form of the TD error
        delta = reward + gamma*Q_values[next_state_index][next_action_index] - Q_values[state_index][action_index]
        
        # accumulate traces (? big S and big A?)
        z_values[state_index][action_index] +=1
        
        # update Q value
        Q_values[state_index][action_index] = Q_values[state_index][action_index] + alpha*delta*z_values[state_index][action_index]
        
        # update z value
        z_values[state_index][action_index] = gamma*lamda*z_values[state_index][action_index]
        
        # update state and action vector
        state_vector = list(next_state_vector)
        state_index = grid.states.index(state_vector)
        action_vector = list(next_action_vector)
        action_index = next_action_index

In [15]:
np.set_printoptions(precision=2)
Q_values

array([[ 7, 17,  9,  8],
       [11, 11, 10, 10],
       [10,  8,  7, 14],
       [ 8,  8,  8,  8],
       [ 6,  6,  7,  8],
       [11,  7,  4,  7],
       [20,  6,  4,  6],
       [12,  6,  6,  7],
       [ 7,  6,  3,  7],
       [ 5,  5,  4,  6],
       [ 7,  5,  3,  5],
       [17,  6,  1,  3],
       [ 9,  4,  4,  5],
       [ 4,  4,  3,  5],
       [ 4,  4,  4,  3],
       [ 4,  4,  0,  3],
       [ 7,  3,  0,  2],
       [ 7,  3,  3,  4],
       [ 3,  3,  3,  4],
       [ 3,  2,  2,  3],
       [ 1,  1,  0,  0],
       [ 7,  0,  4,  0],
       [ 2,  2,  1,  2],
       [ 3,  3,  2,  3],
       [ 2,  0,  0,  2]])

In [16]:
# PRINT POLICY TABLE ################################################################################
# import pandas library
import pandas as pd
# define column and index
columns=range(grid.size)
index = range(grid.size)
# define dataframe to represent policy table
policy_table = pd.DataFrame(index = index, columns=columns)

# iterate through policy to make a table that represents action number
# as action name (eg. left, right, up, down)
for state in range(len(Q_values)):
    
    # find the best action at each state
    best_action = np.argmax(Q_values[state])

    # get action name
    if best_action == 0:
        action_name = 'up'
    elif best_action == 1:
        action_name = 'right'
    elif best_action == 2:
        action_name = 'down'
    else:
        action_name = 'left'

    # calculate the row and column coordinate of the current state number
    row = int(state/grid.size)
    column = round((state/grid.size - int(state/grid.size))*grid.size)
            
    # assign action name
    policy_table.loc[row][column] = action_name

print("Policy Table: ")
print(policy_table)
print()

Policy Table: 
       0   1     2     3     4
0  right  up  left    up  left
1     up  up    up    up  left
2     up  up    up  left    up
3     up  up    up  left    up
4     up  up    up    up    up

