 # GridWorld Simulation

In [1]:
import numpy as np
from collections import defaultdict

In [2]:
class Agent:
    """Random-walking agent that keeps per-cell reward statistics.

    Attributes:
        state: current cell as an ``(x, y)`` tuple.
        pi: probability of picking each entry of ``actions``.
        actions: the four unit moves, as offsets added to the state.
        accum_rewards: total reward credited to each cell.
        cell_freqs: number of times each cell has been credited.
    """

    def __init__(self, x, y):
        self.state = (x, y)

        # One probability per action; all four moves are equally likely.
        self.pi = [.25, .25, .25, .25]
        self.actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]

        # Counters keyed by cell; missing cells start at 0.
        self.accum_rewards = defaultdict(int)
        self.cell_freqs = defaultdict(int)

    def choose_action(self):
        """Sample an action according to ``pi``.

        Returns:
            A ``(action, probability)`` pair for the sampled action.
        """
        idx = np.random.choice(len(self.actions), p=self.pi)
        return self.actions[idx], self.pi[idx]

    def update_credit(self, s_temp, stplus1, r):
        """Credit reward ``r`` to a cell and count the visit.

        Args:
            s_temp: the cell the move attempted to reach.
            stplus1: the state that resulted from the move.
            r: reward obtained for the move.

        Returns:
            The cell that received the credit.
        """
        # An unchanged state means the move did not take effect (the
        # attempt left the grid), so the current cell gets the credit;
        # otherwise the attempted cell does.
        credit_cell = self.state if stplus1 == self.state else s_temp

        self.accum_rewards[credit_cell] += r
        self.cell_freqs[credit_cell] += 1

        return credit_cell

    def update_state(self, stplus1):
        """Move the agent to ``stplus1``."""
        self.state = stplus1

In [3]:
class Gridworld:
    """Square grid with two special cells that pay a reward and teleport.

    Cells are ``(i, j)`` tuples and ``(0, 0)`` is the top-left corner.
    ``A`` and ``B`` are each stored as ``[source, destination, reward]``:
    a move onto ``source`` earns ``reward`` and puts the agent on
    ``destination``. A move that would leave the grid earns -1 and leaves
    the agent where it was. Every other move earns 0.
    """

    def __init__(self, n):
        """Create an ``n`` x ``n`` grid."""
        self.n = n

        # [source cell, destination cell, reward]
        self.A = [(0, 1), (4, 1), 10]
        self.B = [(0, 3), (2, 3), 5]

    def _off_grid(self, cell):
        """Return True when either coordinate of ``cell`` is outside the grid."""
        last = self.n - 1
        return cell[0] < 0 or cell[0] > last or cell[1] < 0 or cell[1] > last

    def compute_reward(self, s_temp):
        """Return the reward for attempting to move onto ``s_temp``.

        Args:
            s_temp: the attempted cell, which may lie outside the grid.

        Returns:
            The reward of A or B when ``s_temp`` is that special cell,
            -1 when ``s_temp`` is outside the grid, and 0 otherwise.
        """
        # Special cells are checked before the bounds, as in next_state.
        if s_temp == self.A[0]:
            return self.A[2]
        if s_temp == self.B[0]:
            return self.B[2]
        if self._off_grid(s_temp):
            return -1
        return 0

    def next_state(self, st, a):
        """Apply action ``a`` in state ``st``.

        Args:
            st: current cell ``(i, j)``.
            a: move offset ``(di, dj)``.

        Returns:
            ``(s_temp, stplus1)`` where ``s_temp`` is the attempted cell
            and ``stplus1`` is the state the agent actually ends up in.
        """
        s_temp = (st[0] + a[0], st[1] + a[1])  # element-wise addition of tuples

        if s_temp == self.A[0]:
            # A sends the agent to A's own destination, not B's.
            stplus1 = self.A[1]
        elif s_temp == self.B[0]:
            stplus1 = self.B[1]
        elif self._off_grid(s_temp):
            # The move is cancelled and the agent stays put.
            stplus1 = st
        else:
            stplus1 = s_temp

        return s_temp, stplus1

    def run(self, agent, num_steps):
        """Let ``agent`` act in the grid for ``num_steps`` steps.

        At each step the agent picks an action, the reward is computed
        from the attempted cell, the agent records the credit, and then
        its state is updated.
        """
        for _ in range(num_steps):
            a, _prob = agent.choose_action()
            s_temp, stplus1 = self.next_state(agent.state, a)
            r = self.compute_reward(s_temp)
            # Credit must be recorded before the state changes, because
            # update_credit compares stplus1 with the agent's current state.
            agent.update_credit(s_temp, stplus1, r)
            agent.update_state(stplus1)
   

# Simulation

In [4]:
# Grid side length and number of steps to simulate.
n = 5
steps = 10000

# Start the agent at cell (2, 2) and run it on the n x n grid.
agent = Agent(x=2, y=2)
grid_world = Gridworld(n=n)
grid_world.run(agent=agent, num_steps=steps)

In [5]:
# Per-cell statistics gathered by the agent, laid out on the n x n grid:
#   r -- accumulated reward, f -- number of credits, g -- mean reward per credit.
r = np.zeros(shape=(n, n))
f = np.zeros(shape=(n, n))
for s in agent.cell_freqs:
    r[s] = agent.accum_rewards[s]
    f[s] = agent.cell_freqs[s]

# Cells that were never credited keep f == 0; clamping the divisor to 1
# makes their mean come out as 0 instead of dividing by zero.
g = r / np.maximum(1, f)

# The parenthesised single-argument form runs on both Python 2 and Python 3.
print(np.round(g, decimals=2))


[[ -0.45  10.    -0.23   5.    -0.52]
 [ -0.24   0.     0.     0.    -0.25]
 [ -0.23   0.     0.     0.    -0.25]
 [ -0.28   0.     0.     0.    -0.23]
 [ -0.51  -0.25  -0.29  -0.23  -0.51]]
