# Gridworld with First Visit

In [1]:
# Find the value function of policy
import numpy as np

# display output
from random import uniform
import time
from IPython.display import display, clear_output

In [2]:
actions = [[-1, 0], [0, 1], [1, 0], [0, -1]] #up, right, down, left = (clockwise from up) 
action_count = len(actions) # total number of actions
gridSize = 5 # create a square grid of gridSize by gridSize
state_count = gridSize*gridSize # total number of states

In [3]:
class Gridworld():
    def __init__(self, gridSize):
        self.valueMap = np.zeros((gridSize, gridSize))
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        self.new_pos = [0, 0] # initialize new position for p_transition
        self.pos_check = [0, 0] # a copy of new position
        self.transition_prob = 1 # deterministic
    
    def initial_state(self):
        # return initial state
        return grid.states[gridSize*gridSize-1]
       
    def reward(self, current_pos, action):
        # return the reward        
        
        # take action in current pos
        self.new_pos = np.array(current_pos) + np.array(action)

        # normally, reward = 0
        reward = 0

        # if new pos results in off the grid, return reward -1
        if -1 in self.new_pos or self.size in self.new_pos:
            reward = -1
        # if in state A, transition to state A'
        if current_pos == [0, 1]:
            reward = 10
        # if in state B, transition to state B'
        if current_pos == [0, 3]:
            reward = 5
        return reward
    
    def p_transition(self, current_pos, action):
        # return the transition probability
        # get next position: state: [0, 0], action: [0, 1], new_state = [0, 1]
        self.new_pos = np.array(current_pos) + np.array(action)
        self.pos_check = self.new_pos # make a copy of new pos before being overwritten below

        # if taking an action crosses the border = agent stays in same position
        if -1 in self.new_pos or self.size in self.new_pos: 
            self.new_pos = current_pos
            
        # if in state A, transition to state A'
        if current_pos == [0, 1]:
            self.new_pos = [4, 1]
            
        # if in state B, transition to state B'
        if current_pos == [0, 3]:
            self.new_pos = [2, 3]
        return self.new_pos

In [4]:
# create a grid object
grid = Gridworld(5)

In [5]:
# get initial state (bottom right)
grid.initial_state()

[4, 4]

## First-visit MC Control 

In [6]:
# Initiate a random policy
random_policy = np.random.randint(1000, size=(state_count, action_count))
random_policy = random_policy/random_policy.sum(axis=1)[:,None]
policy = random_policy

In [7]:
# random policy
policy

array([[0.36597938, 0.00979381, 0.14742268, 0.47680412],
       [0.68743914, 0.1781889 , 0.05160662, 0.08276534],
       [0.04357067, 0.61742827, 0.01912859, 0.31987248],
       [0.03361345, 0.61038961, 0.05958747, 0.29640947],
       [0.37012987, 0.3284632 , 0.10119048, 0.20021645],
       [0.21594684, 0.00598007, 0.37873754, 0.39933555],
       [0.33679928, 0.3164557 , 0.16048825, 0.18625678],
       [0.17863954, 0.21614749, 0.50603942, 0.09917355],
       [0.44804011, 0.17639015, 0.19325433, 0.18231541],
       [0.28622715, 0.24640992, 0.15861619, 0.30874674],
       [0.04765507, 0.0204236 , 0.57942511, 0.35249622],
       [0.38526316, 0.24421053, 0.09010526, 0.28042105],
       [0.08879781, 0.40846995, 0.31215847, 0.19057377],
       [0.13599182, 0.37116564, 0.0398773 , 0.45296524],
       [0.11905713, 0.25209748, 0.37235318, 0.25649221],
       [0.22997416, 0.34409991, 0.30878553, 0.1171404 ],
       [0.41422959, 0.22697512, 0.20689655, 0.15189873],
       [0.09981025, 0.17229602,

### Create an Episode following policy

In [8]:
# set initial state
state = grid.initial_state()

# initialize state (with iniitial state), action list
state_list = [state]
action_list = []
reward_list = []

# generate an episode
for i in range(5):
    
    # pick an action based on categorical distribution in policy
    action = int(np.random.choice(action_count, 1, p=policy[grid.states.index(state)]))
    action = actions[action]
    
    reward = grid.reward(state, action)
#     print(state)
#     print(action)
#     print(reward)
    
    # get the new state with the chosen action
    new_state = list(grid.p_transition(state, action))
    state = new_state
   
    # save state and action to list
    state_list.append(state)
    action_list.append(action)
    
    # save reward to list
    reward_list.append(reward)

In [9]:
reward_list

[0, 0, -1, 0, 0]

In [10]:
state_list

[[4, 4], [4, 3], [4, 4], [4, 4], [3, 4], [2, 4]]

In [11]:
action_list

[[0, -1], [0, 1], [1, 0], [-1, 0], [-1, 0]]

### Initialize

In [78]:
# initialize q values for all state action pairs
Q_values = np.zeros((state_count, action_count))

### Loop

### Loop for each steo of episode 

In [79]:
# intialize parameters
G = 0
gamma = 0.99
epsilon = 0.2
Terminal = len(action_list)

# define lists
returns_list = []
visited_list = []

In [80]:
t_list=[]
for i in range(1,Terminal+1):
    t = Terminal - i
    t_list.append(t)

In [81]:
t_list

[4, 3, 2, 1, 0]

In [82]:
# define average function
def Average(lst): 
    return sum(lst) / len(lst) 

In [83]:
for t in t_list:
    
    print("t: ", t)
    
    # add to G
    G = gamma*G + reward_list[t]
    
    # combine state action pair
    visited = []
    visited.extend(state_list[t])
    visited.extend(action_list[t])
    
    # check if state action pair have been visited before
    if visited in visited_list:
        print("visited")
        print(visited)
        
    else:
        print("NOT visited")
        print(visited)
        
        # add state action pair to visited list
        visited_list.append(visited)
        
        # append G to returns
        returns_list.append(G)
        
        # find state and action index
        state_index = grid.states.index(state_list[t])
        action_index = actions.index(action_list[t])
        print("state_index: ", state_index)

        # write Q_values to the state-action pair
        Q_values[state_index][action_index] = Average(returns_list)
        print("average return: ", Average(returns_list))
    
        # choose best action at given state
        choose_action = np.argmax(Q_values[state_index])
        
        # overwrite policy
        for i in range(action_count):
            if choose_action == i:
                policy[state_index][i] = 1 - epsilon + epsilon/action_count
            else:
                policy[state_index][i] = epsilon/action_count

t:  4
NOT visited
[3, 4, -1, 0]
state_index:  19
average return:  0.0
t:  3
NOT visited
[4, 4, -1, 0]
state_index:  24
average return:  0.0
t:  2
NOT visited
[4, 4, 1, 0]
state_index:  24
average return:  -0.3333333333333333
t:  1
NOT visited
[4, 3, 0, 1]
state_index:  23
average return:  -0.4975
t:  0
NOT visited
[4, 4, 0, -1]
state_index:  24
average return:  -0.59402


In [84]:
Q_values

array([[ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.

In [85]:
policy

array([[0.36597938, 0.00979381, 0.14742268, 0.47680412],
       [0.68743914, 0.1781889 , 0.05160662, 0.08276534],
       [0.04357067, 0.61742827, 0.01912859, 0.31987248],
       [0.03361345, 0.61038961, 0.05958747, 0.29640947],
       [0.37012987, 0.3284632 , 0.10119048, 0.20021645],
       [0.21594684, 0.00598007, 0.37873754, 0.39933555],
       [0.33679928, 0.3164557 , 0.16048825, 0.18625678],
       [0.17863954, 0.21614749, 0.50603942, 0.09917355],
       [0.44804011, 0.17639015, 0.19325433, 0.18231541],
       [0.28622715, 0.24640992, 0.15861619, 0.30874674],
       [0.04765507, 0.0204236 , 0.57942511, 0.35249622],
       [0.38526316, 0.24421053, 0.09010526, 0.28042105],
       [0.08879781, 0.40846995, 0.31215847, 0.19057377],
       [0.13599182, 0.37116564, 0.0398773 , 0.45296524],
       [0.125     , 0.625     , 0.125     , 0.125     ],
       [0.22997416, 0.34409991, 0.30878553, 0.1171404 ],
       [0.41422959, 0.22697512, 0.20689655, 0.15189873],
       [0.09981025, 0.17229602,

# Testing 

In [None]:
state_list[-1]

In [None]:
action_list[-1]

In [None]:
G = gamma*G + reward_list
G

In [None]:
# combine state action pair
visited = []
visited.extend(state_list[-1])
visited.extend(action_list[-1])

In [None]:
visited

In [None]:
# check if state action pair have been visited before
if visited in visited_list:
    print("yes")
    
else:
    print("no")
    # add state action pair to visited list
    visited_list.append(visited)
    

In [None]:
visited_list

In [None]:
# append G to returns
returns_list.append(G)

In [None]:
returns_list

In [None]:
# define average function
def Average(lst): 
    return sum(lst) / len(lst) 

# find state and action index
state_index = grid.states.index(state_list[-1])
action_index = actions.index(action_list[-1])

# write Q_values to the state-action pair
Q_values[state_index][action_index] = Average(returns_list)

In [None]:
Q_values

In [None]:
# get state_number
state_number = grid.states.index(state_list[-1])

In [None]:
state_number

In [None]:
choose_action = np.argmax(Q_values[state_number])

In [None]:
choose_action

In [None]:
epsilon = 0.5

In [None]:
# overwrite policy
for i in range(action_count):
    if choose_action == i:
        policy[state_number][i] = 1 - epsilon + epsilon/action_count
    else:
        policy[state_number][i] = epsilon/action_count

In [None]:
policy[state_number]