In [1]:
import random
import numpy as np

# Environment: Grid World

In [2]:
class GridWorld():
    """Square grid world with a fixed start cell and one terminal cell.

    The state is the pair ``(x, y)``. ``x`` is changed by the up/down moves
    and ``y`` by the left/right moves, so ``(x, y)`` reads as (row, column)
    when used to index a nested list such as ``table[x][y]``.

    The agent starts at ``(0, 0)`` and the episode is finished once it
    stands on ``(size - 1, size - 1)``. Moves that would leave the grid
    keep the agent on the border cell.

    Actions: 0 = left, 1 = right, 2 = up, 3 = down.
    """

    def __init__(self, size=4): # Set initial point
        """Create the grid and place the agent on the start cell.

        Args:
            size: Number of cells along each side. Defaults to 4, which
                gives the terminal cell (3, 3).

        Raises:
            ValueError: If ``size`` is smaller than 1.
        """
        if size < 1:
            raise ValueError("size must be at least 1, got %r" % (size,))
        self.size = size
        self.reset()

    def step(self, action): # Move according to the selected action & Get reward
        """Apply one action and report the outcome.

        An action other than 0-3 leaves the position unchanged but still
        counts as a step (reward -1).

        Returns:
            A tuple ``((x, y), reward, done)``: the new state, the step
            reward (always -1), and whether the terminal cell was reached.
        """
        if action == 0:
            self.move_left()
        elif action == 1:
            self.move_right()
        elif action == 2:
            self.move_up()
        elif action == 3:
            self.move_down()

        reward = -1 # Get reward -1 for each step
        done = self.is_done() # True once the terminal corner is reached

        return (self.x, self.y), reward, done # Return (state, reward, whether the game is finished or not)

    def move_left(self):
        """Decrease y by one, staying on the grid."""
        self.y = max(self.y - 1, 0)

    def move_right(self):
        """Increase y by one, staying on the grid."""
        self.y = min(self.y + 1, self.size - 1)

    def move_up(self):
        """Decrease x by one, staying on the grid."""
        self.x = max(self.x - 1, 0)

    def move_down(self):
        """Increase x by one, staying on the grid."""
        self.x = min(self.x + 1, self.size - 1)

    def is_done(self): # True when the agent is on the terminal corner
        """Return True when the agent is on ``(size - 1, size - 1)``."""
        last = self.size - 1
        return self.x == last and self.y == last

    def get_state(self): # Return current position
        """Return the current position as ``(x, y)``."""
        return (self.x, self.y)

    def reset(self):
        """Move the agent back to ``(0, 0)`` and return that state."""
        self.x = 0
        self.y = 0
        return (self.x, self.y)

# Agent: Random policy

In [3]:
class Agent():
    """Agent that follows a uniformly random policy over four actions."""

    def __init__(self):
        # The agent keeps no state.
        pass

    def random_action(self): # Uniformly random policy
        """Draw one number in [0, 1) and map each quarter to an action 0-3."""
        draw = random.random()
        # Upper bounds of the first three quarters; the last quarter is the fallback.
        for choice, upper_bound in enumerate((1/4, 1/2, 3/4)):
            if draw < upper_bound:
                return choice
        return 3

# Run Simulation

In [4]:
def main(num_episodes=50000, gamma=1, alpha=0.001, seed=None):
    """Estimate state values of the random policy with one-step TD updates.

    Runs ``num_episodes`` episodes on a ``GridWorld`` with an ``Agent``
    choosing random actions. After every step the value of the state that
    was just left is moved towards ``reward + gamma * V(next_state)`` by a
    fraction ``alpha``. The resulting 4x4 table is printed with two
    decimals and returned.

    Args:
        num_episodes: Number of episodes to simulate.
        gamma: Discount factor applied to the next state's value.
        alpha: Constant step size of the update.
        seed: If not None, passed to ``random.seed`` before the simulation
            so that the agent's action sequence is repeatable.

    Returns:
        The value table as a list of 4 lists of 4 floats, indexed
        ``value_f[x][y]``.
    """
    if seed is not None:
        random.seed(seed)

    env = GridWorld() # make GridWorld environment
    agent = Agent() # Make an agent
    value_f = [[0.0] * 4 for _ in range(4)] # Initial value functions

    for _ in range(num_episodes):
        # Start every episode from the initial position.
        x_current, y_current = env.reset()
        done = False
        while not done:
            action = agent.random_action() # agent selects an action
            (x_next, y_next), reward, done = env.step(action) # get next state and reward

            # update value function at every step
            td_target = reward + gamma * value_f[x_next][y_next]
            value_f[x_current][y_current] = value_f[x_current][y_current] \
                + alpha * (td_target - value_f[x_current][y_current])

            x_current, y_current = x_next, y_next

    # print data; the context manager restores the previous print options
    # afterwards, so other cells are not affected by this formatting
    with np.printoptions(precision=2, suppress=True):
        print(np.array(value_f))

    return value_f
        
        
# Entry point: run the simulation only when this file/cell is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
            
            

[[-57.9  -56.01 -53.   -50.45]
 [-55.98 -53.17 -48.75 -44.26]
 [-52.84 -48.49 -40.11 -29.62]
 [-50.25 -44.06 -29.5    0.  ]]
