# Note on Gridworld Action Space Restriction
There is a potential problem: staying in a location is (technically) not an allowable action, yet an agent could effectively stay put by repeatedly choosing an invalid action (if invalid actions are selectable in the first place). The set of actions therefore needs to be pruned further, depending on the state, but I do not want to do this manually.

The way to do this is to count the interior, non-corner edge, and corner states and account for each in the allowable set of actions. For a square grid with side $length$, the area is ${length}^2$ states, the perimeter contains $4*(length-1)$ states, of which $4*(length-2)$ are non-corner edge states and 4 are corners, leaving $(length-2)^2$ interior states. Therefore, the number of allowable actions (dependent on state) is: $(length-2)^2*4+4*(length-2)*3+4*2 = 4*{length}^2-4*length$, so for $length=5$ we have $9*4+12*3+8=36+36+8=80$ actions.

## SARSA Implementation
To enable graphical output, set the `render_label` variable to 't' on line 121 of the code cell below.

In [None]:
import numpy as np
# Scratch check of np.flipud: it reverses the order of the rows, which is how
# the reward table below gets re-oriented.
a = np.array([[1, 2], [3, 0]])
a_flipped = np.flipud(a)
print(a_flipped)

# 7x7 reward table: a border of -1, zeros inside, and two positive cells
# (10 and 5) in the first interior row.
b = np.full((7, 7), -1)
b[1:-1, 1:-1] = 0
b[1, 2] = 10
b[1, 4] = 5

c = np.flipud(b)
print(c)
print(b[1, 2])

In [2]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import matplotlib.ticker as plticker

class gridworld(gym.Env):
    """Square gridworld bundled with a tabular, epsilon-greedy learning agent.

    The instance holds the agent position (``location_x``, ``location_y``),
    the true reward table (``my_reward``), the learned reward model, the
    Q-table and the reward logs. Actions are encoded as integers:
    1 = y+1, 2 = y-1, 3 = x-1, 4 = x+1.

    Methods mutate the instance in place and return ``None`` (except
    ``render_init``); the calling loop is expected to invoke
    ``get_allowed_actions``, ``my_policy`` and ``step`` in that order.
    """

    metadata = {'render.modes': ['human']}

    def __init__(self):
        """Create the world with a random start position and a random Q-table."""
        self.gridnum = int(7)  # side length of the grid, penalty border included
        # Start positions are drawn from the interior (indices 1..gridnum-2).
        self.location_x = np.random.randint(int(1), self.gridnum - 1)
        self.location_y = np.random.randint(int(1), self.gridnum - 1)
        self.action = int(0)  # no initial action until computed
        self.previous_action = self.action
        self.previous_x = self.location_x
        self.previous_y = self.location_y
        self.previous_previous_x = self.previous_x
        self.previous_previous_y = self.previous_y
        self.allowed_actions = np.array([int(0)])
        self.actionSet = np.matrix([1, 2, 3, 4])
        self.episode_length = int(100)  # time steps per episode
        self.num_episodes = int(10000)  # number of episodes
        self.my_alpha = 0.1  # learning rate used in the Q update
        self.my_gamma = 0.9  # discount factor used in the Q update
        self.my_epsilon = 0.1  # exploration probability in my_policy
        # True reward table, written with the top row first: -1 on the border,
        # 10 and 5 on two interior cells, 0 elsewhere.
        self.my_reward = np.array([
            [-1, -1, -1, -1, -1, -1, -1],
            [-1, 0, 10, 0, 5, 0, -1],
            [-1, 0, 0, 0, 0, 0, -1],
            [-1, 0, 0, 0, 0, 0, -1],
            [-1, 0, 0, 0, 0, 0, -1],
            [-1, 0, 0, 0, 0, 0, -1],
            [-1, -1, -1, -1, -1, -1, -1],
        ])
        # Flip vertically; the table is read as my_reward[y, x], so the
        # literal's top row ends up at the largest y index.
        self.my_reward = np.flipud(self.my_reward)
        # Reward model filled in from observations, indexed [x, y].
        self.my_reward_model = np.zeros([self.gridnum, self.gridnum])
        # Q-table indexed [x, y, action - 1], randomly initialised.
        self.my_q_function = np.random.rand(self.gridnum, self.gridnum, int(4))
        # Per-step reward log (random placeholders until overwritten).
        self.my_reward_log = np.random.rand(
            1, self.episode_length * self.num_episodes)
        # Per-episode reward totals (random placeholders until overwritten).
        self.my_episodic_cumulative_reward_log = np.random.rand(
            1, self.num_episodes)

    def get_allowed_actions(self):
        """Set ``allowed_actions`` to the moves that stay on the grid.

        The result is a ``(1, k)`` array of action codes in ascending order:
        every move is allowed unless the agent sits on the edge that the move
        would cross.
        """
        last_index = self.gridnum - 1
        candidates = []
        if self.location_y != last_index:
            candidates.append(1)  # y + 1
        if self.location_y != 0:
            candidates.append(2)  # y - 1
        if self.location_x != 0:
            candidates.append(3)  # x - 1
        if self.location_x != last_index:
            candidates.append(4)  # x + 1
        self.allowed_actions = np.array(candidates)[np.newaxis]

    def my_policy(self):
        """Pick ``self.action`` epsilon-greedily among ``allowed_actions``.

        The greedy choice maximises the Q-table entry of the current cell;
        with probability ``my_epsilon`` it is replaced by a uniformly random
        allowed action.
        """
        # NOTE(review): step() changes only one coordinate per call, so after
        # a plain step this condition (both coordinates differ) is false and
        # previous_action is left untouched. Confirm whether `or` (or an
        # unconditional assignment) was intended before relying on
        # previous_action in the Q update.
        if (self.previous_x != self.location_x
                and self.previous_y != self.location_y):
            self.previous_action = self.action
        q_of_allowed = self.my_q_function[
            self.location_x, self.location_y, self.allowed_actions - 1]
        self.action = self.allowed_actions[0, np.argmax(q_of_allowed)]
        if np.random.rand() <= self.my_epsilon:
            random_index = np.random.randint(0, self.allowed_actions.shape[1])
            self.action = self.allowed_actions[0, random_index]

    def render(self, render_label, fig, ax):
        """Draw the agent as a blue circle on ``fig`` when ``render_label`` is 't'.

        ``ax`` is accepted for interface compatibility; the axes actually
        drawn on are ``fig.gca()``.
        """
        if render_label != 't':
            return
        ax = fig.gca()
        ax.clear()
        # Axes.clear() resets the view limits; restore them so the marker,
        # which lives in grid coordinates, stays inside the visible area.
        ax.set_xlim(0, self.gridnum)
        ax.set_ylim(0, self.gridnum)
        ax.grid(which='major', axis='both', linestyle='-')
        agent_marker = plt.Circle(
            (self.location_x + 0.5, self.location_y + 0.5), 0.5, color='blue')
        ax.add_artist(agent_marker)
        fig.canvas.draw()

    def render_init(self, render_label):
        """Create the figure used by ``render``.

        Returns ``(fig, ax)`` when ``render_label`` is 't', otherwise ``None``.
        """
        if render_label != 't':
            return None
        fig, ax = plt.subplots()
        intervals = float(1 / self.gridnum)  # dimension of grid affects size
        loc = plticker.MultipleLocator(base=intervals)
        ax.xaxis.set_major_locator(loc)
        ax.set_xlim(0, self.gridnum)
        ax.yaxis.set_major_locator(loc)
        ax.set_ylim(0, self.gridnum)
        return fig, ax

    def reset(self):
        """Move the agent to a random interior cell and sync the previous position."""
        self.location_x = np.random.randint(int(1), self.gridnum - 1)
        self.location_y = np.random.randint(int(1), self.gridnum - 1)
        self.previous_x = self.location_x
        self.previous_y = self.location_y

    def step(self):
        """Apply ``self.action`` to the position, with a 10% random transition.

        The position history is shifted first. With probability 0.1 the
        executed move is drawn uniformly from ``allowed_actions`` instead of
        the chosen one; ``self.action`` is restored to the chosen action
        afterwards.
        """
        self.previous_previous_x = self.previous_x
        self.previous_previous_y = self.previous_y
        self.previous_x = self.location_x
        self.previous_y = self.location_y
        desired_action = self.action
        if np.random.rand() <= 0.1:
            # The lower bound must be 0: randint's upper bound is exclusive,
            # so starting at 1 would make the first allowed action unreachable.
            random_index = np.random.randint(0, self.allowed_actions.shape[1])
            self.action = self.allowed_actions[0, random_index]
        if self.action == 1:
            self.location_y += 1
        elif self.action == 2:
            self.location_y += -1
        elif self.action == 3:
            self.location_x += -1
        elif self.action == 4:
            self.location_x += 1
        self.action = desired_action

    def update_reward_model(self):
        """Copy the true reward of the current cell into the reward model."""
        self.my_reward_model[self.location_x, self.location_y] = \
            self.my_reward[self.location_y, self.location_x]

    def update_my_q_function(self):
        """Apply one temporal-difference update to the previous cell's Q entry.

        The entry ``[previous_x, previous_y, previous_action - 1]`` is moved
        towards ``reward_model[current] + my_gamma * Q[current, action - 1]``
        by a fraction ``my_alpha``.
        """
        # NOTE(review): while previous_action is still 0 the index
        # previous_action - 1 is -1, i.e. the last action slot. Also,
        # self.action here is whatever my_policy last chose before step();
        # confirm this matches the intended SARSA (S, A, R, S', A') pairing.
        previous_entry = (
            self.previous_x, self.previous_y, self.previous_action - 1)
        td_target = (
            self.my_reward_model[self.location_x, self.location_y]
            + self.my_gamma * self.my_q_function[
                self.location_x, self.location_y, self.action - 1])
        self.my_q_function[previous_entry] += self.my_alpha * (
            td_target - self.my_q_function[previous_entry])
# Training driver: runs every episode back to back in one flat loop of
# episode_length * num_episodes time steps, then plots the reward per episode.
fig1, ax1 = plt.subplots()
world = gridworld()
render_label = 'f'  # set to 't' to animate the agent (much slower)
if render_label == 't':
    fig, ax = world.render_init(render_label)

total_steps = world.episode_length * world.num_episodes
# Report progress roughly every 10% of the episodes.
progress_every = max(world.num_episodes // 10, 1)
k = 0  # index of the episode currently being run

# Iterate over every step (no "- 1") so the final episode's total is recorded.
for i in range(total_steps):
    world.update_reward_model()
    world.my_reward_log[0, i] = world.my_reward[world.location_y, world.location_x]
    # The update targets the previous state, so it must run before the
    # position is reverted or teleported below.
    world.update_my_q_function()
    if world.my_reward[world.location_y, world.location_x] < 0:
        # Negative-reward cell: undo the move and roll the history back one step.
        world.location_x = world.previous_x
        world.previous_x = world.previous_previous_x
        world.location_y = world.previous_y
        world.previous_y = world.previous_previous_y
    elif world.my_reward_log[0, i] > 5:
        # Reward above 5: jump to the fixed cell (x=2, y=1).
        world.location_x = 1 + 1
        world.location_y = 0 + 1
    elif world.my_reward_log[0, i] > 0:
        # Any other positive reward: jump to the fixed cell (x=4, y=3).
        world.location_x = 3 + 1
        world.location_y = 2 + 1
    if render_label == 't':
        world.render(render_label, fig, ax)
    world.get_allowed_actions()
    world.my_policy()
    world.step()
    if (i + 1) % world.episode_length == 0:
        # End of episode k: store the sum of its rewards.
        episode_start = k * world.episode_length
        world.my_episodic_cumulative_reward_log[0, k] = \
            np.sum(world.my_reward_log[0, episode_start:(i + 1)])
        if (k + 1) % progress_every == 0:
            print(np.floor(100 * (k + 1) / world.num_episodes))  # percent done
        k += 1
        world.reset()

# Every slot of the episodic log is now filled, so plot all of it, and label
# ax1 explicitly rather than whichever figure happens to be current.
ax1.plot(world.my_episodic_cumulative_reward_log[0, :])
ax1.set_xlabel('episode number')
ax1.set_ylabel('total episodic reward')

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7fd1d49271d0>