In [1]:
# import common packages
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Environment description
# Cells are addressed as (row, column). 'U' lowers the row index and 'D' raises
# it, 'L' lowers the column index and 'R' raises it (see the entries of `probs`).

# Reward attached to a cell. Only (0,3) and (1,3) are listed here.
# NOTE(review): init_grid_world_penalized builds its own local reward table, so
# this module-level dict is not read by any code visible in this notebook.
rewards={(0,3):1,(1,3):-1}

# Actions the agent may choose in each cell. Cells without an entry are what
# GridWorld.is_terminal reports as terminal: (0,3) and (1,3) have no entry, and
# (1,1) appears in neither `rewards` nor `actions`.
# NOTE(review): (2,3) only lists 'L' although `probs` defines ((2,3),'U') -> (1,3)
# and the commented-out policy below uses 'U' there - confirm this is intended.
actions={
    (2,0):['U','R'],
    (1,0):['U','D'],
    (0,0):['R','D'],
    (2,1):['R','L'],
    (0,1):['R','L'],
    (2,2):['U','R','L'],
    (1,2):['U','D','R'],
    (0,2):['R','L','D'],
    (2,3):['L'],
}

# Transition model: (cell, action) -> {next cell: probability}.
# An entry whose only outcome is the cell itself means the move leaves the agent
# where it is. Every entry is deterministic except ((1,2),'U'), which reaches
# (0,2) or (1,3) with probability 0.5 each.
probs = {
    ((2, 0), 'U'): {(1, 0): 1.0},
    ((2, 0), 'D'): {(2, 0): 1.0},
    ((2, 0), 'L'): {(2, 0): 1.0},
    ((2, 0), 'R'): {(2, 1): 1.0},
    ((1, 0), 'U'): {(0, 0): 1.0},
    ((1, 0), 'D'): {(2, 0): 1.0},
    ((1, 0), 'L'): {(1, 0): 1.0},
    ((1, 0), 'R'): {(1, 0): 1.0},
    ((0, 0), 'U'): {(0, 0): 1.0},
    ((0, 0), 'D'): {(1, 0): 1.0},
    ((0, 0), 'L'): {(0, 0): 1.0},
    ((0, 0), 'R'): {(0, 1): 1.0},
    ((0, 1), 'U'): {(0, 1): 1.0},
    ((0, 1), 'D'): {(0, 1): 1.0},
    ((0, 1), 'L'): {(0, 0): 1.0},
    ((0, 1), 'R'): {(0, 2): 1.0},
    ((0, 2), 'U'): {(0, 2): 1.0},
    ((0, 2), 'D'): {(1, 2): 1.0},
    ((0, 2), 'L'): {(0, 1): 1.0},
    ((0, 2), 'R'): {(0, 3): 1.0},
    ((2, 1), 'U'): {(2, 1): 1.0},
    ((2, 1), 'D'): {(2, 1): 1.0},
    ((2, 1), 'L'): {(2, 0): 1.0},
    ((2, 1), 'R'): {(2, 2): 1.0},
    ((2, 2), 'U'): {(1, 2): 1.0},
    ((2, 2), 'D'): {(2, 2): 1.0},
    ((2, 2), 'L'): {(2, 1): 1.0},
    ((2, 2), 'R'): {(2, 3): 1.0},
    ((2, 3), 'U'): {(1, 3): 1.0},
    ((2, 3), 'D'): {(2, 3): 1.0},
    ((2, 3), 'L'): {(2, 2): 1.0},
    ((2, 3), 'R'): {(2, 3): 1.0},
    # the only stochastic transition in the table
    ((1, 2), 'U'): {(0, 2): 0.5, (1, 3): 0.5},
    ((1, 2), 'D'): {(2, 2): 1.0},
    ((1, 2), 'L'): {(1, 2): 1.0},
    ((1, 2), 'R'): {(1, 3): 1.0},
  }

'''
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U'
  }
'''

"\npolicy = {\n    (2, 0): 'U',\n    (1, 0): 'U',\n    (0, 0): 'R',\n    (0, 1): 'R',\n    (0, 2): 'R',\n    (1, 2): 'R',\n    (2, 1): 'R',\n    (2, 2): 'R',\n    (2, 3): 'U'\n  }\n"

In [3]:
# Define the GridWorld class
class GridWorld():
    """Grid world whose agent position is tracked as a (row, column) pair.

    ``__init__`` only stores the grid size and the start position. Rewards,
    legal actions and transition probabilities must be attached with
    ``set_rewards_actions`` before ``move``, ``undo_move``, ``is_terminal`` or
    ``game_over`` are used.
    """

    # Displacement (d_row, d_col) that reverses each action: 'U' lowers the row
    # index, so undoing it raises the index again, and so on.
    _UNDO_DELTA = {'U': (1, 0), 'D': (-1, 0), 'L': (0, 1), 'R': (0, -1)}

    def __init__(self, rows, columns, start_position):
        """Store the grid dimensions and place the agent at ``start_position``."""
        self.rows = rows
        self.columns = columns
        self.i = start_position[0]
        self.j = start_position[1]

    def set_rewards_actions(self, rewards, actions, probs):
        """Attach the environment tables.

        rewards: dict mapping a position to the reward ``move`` returns when the
                 agent lands there (positions that are missing yield 0).
        actions: dict mapping a position to the list of actions allowed there;
                 positions without an entry are treated as terminal.
        probs:   dict mapping (position, action) to {next position: probability}.
        """
        self.rewards = rewards
        self.actions = actions
        self.probs = probs
        # every position that has either legal actions or a reward
        self.all_states = set(self.actions.keys()) | set(self.rewards.keys())

    def set_state(self, s):
        """Move the agent to position ``s`` = (row, column)."""
        self.i = s[0]
        self.j = s[1]

    def current_state(self):
        """Return the agent position as a (row, column) tuple."""
        return self.i,self.j

    def undo_move(self, action):
        """Step back to the position from which ``action`` leads to the current one.

        The step is only taken when ``action`` is legal in the position being
        returned to; otherwise the agent stays where it is. Legality is looked up
        in the position the move started from, so undoing a move that ended in a
        position without an ``actions`` entry (a terminal one) works and does not
        raise ``KeyError``.

        Only the plain one-cell displacement is reversed; a transition that
        ``probs`` sent somewhere else cannot be reconstructed from the action
        alone and is left untouched.
        """
        delta = self._UNDO_DELTA.get(action)
        if delta is not None:
            previous = (self.i + delta[0], self.j + delta[1])
            if action in self.actions.get(previous, ()):
                self.i, self.j = previous
        # should never happen
        assert (self.current_state() in self.all_states)

    def move(self, action):
        """Apply ``action``, sample the next position and return its reward.

        The next position is drawn with ``np.random.choice`` from the
        distribution stored in ``probs`` for (current position, action).
        Raises ``KeyError`` when ``probs`` has no such entry.
        """
        outcomes = self.probs[((self.i, self.j), action)]
        next_states = list(outcomes.keys())
        next_probs = list(outcomes.values())
        picked = np.random.choice(len(next_states), p=next_probs)
        self.i = next_states[picked][0]
        self.j = next_states[picked][1]
        return self.rewards.get((self.i,self.j),0)

    def is_terminal (self, s):
        """Return True when position ``s`` has no entry in ``actions``."""
        return s not in self.actions

    def game_over(self):
        """Return True while the agent stands on a position that has legal actions.

        NOTE: despite its name this is True while the episode can continue and
        False once a terminal position is reached; play_game loops on it with
        exactly that meaning, so the semantics are kept.
        """
        return (self.i,self.j) in self.actions

In [83]:
# Presumably a convergence threshold - NOTE(review): no code visible in this
# notebook reads it; confirm whether it is still needed.
SMALL_ENOUGH = 1e-3

def print_values(V,g):
    """Print the value table ``V`` row by row for the grid ``g``.

    V: dict mapping (row, column) to a number; missing cells are shown as 0.
    g: object exposing ``rows`` and ``columns``.

    Non-negative values get a leading space so they line up with negative ones.
    """
    for row in range(g.rows):
        print("---------------------------")
        cells = []
        for col in range(g.columns):
            value = V.get((row, col), 0)
            template = " %.2f|" if value >= 0 else "%.2f|"
            cells.append(template % value)
        print("".join(cells))

def print_policy(P,g):
    """Print the policy ``P`` row by row for the grid ``g``.

    P: dict mapping (row, column) to an action label; cells without an entry
       are printed as a blank.
    g: object exposing ``rows`` and ``columns``.
    """
    for row in range(g.rows):
        print("---------------------------")
        row_text = "".join(
            " %s |" % P.get((row, col), ' ') for col in range(g.columns)
        )
        print(row_text)

# All four action labels. main() pairs each of them with every state when it
# pre-populates Q, including actions that `actions` does not allow in a state.
ACTION_SPACE = ('U', 'D', 'L', 'R')

def init_grid_world_penalized(step_cost, start):
    """Create the 3x4 grid world with a per-step reward of ``step_cost``.

    step_cost: reward stored for each of the nine cells that have legal actions.
    start:     (row, column) at which the agent is placed.

    The cells (0,3) and (1,3) keep their fixed rewards of 1 and -1. Legal
    actions and transition probabilities come from the module-level ``actions``
    and ``probs`` tables. Returns the configured GridWorld.
    """
    penalized_cells = (
        (2,0), (1,0), (0,0),
        (2,1), (0,1),
        (2,2), (1,2), (0,2),
        (2,3),
    )
    cell_rewards = dict.fromkeys(penalized_cells, step_cost)
    cell_rewards[(0,3)] = 1
    cell_rewards[(1,3)] = -1

    world = GridWorld(3, 4, start)
    world.set_rewards_actions(cell_rewards, actions, probs)
    return world

def play_game(step_cost, policy, max_iteration):
    """Play one episode from a random start and record what happened.

    step_cost:     per-step reward forwarded to init_grid_world_penalized.
    policy:        dict mapping each non-terminal (row, column) to the action
                   taken there.
    max_iteration: maximum number of moves before the episode is cut off.

    Returns ``(states_actions, rewards)`` where

    * ``states_actions[k]`` is the (state, action) pair of step k, followed by a
      final ``(last state, '')`` entry for the state the episode stopped in;
    * ``rewards[0]`` is 0 and ``rewards[k + 1]`` is the reward obtained by
      taking ``states_actions[k]``.

    The start is drawn uniformly from the cells that have legal actions, so an
    episode never begins on a terminal cell or on a cell outside the state set
    (both would yield an episode with no moves).
    """
    # The position passed here is only a placeholder; the real start is chosen
    # once the grid (and with it the set of non-terminal cells) exists.
    g = init_grid_world_penalized(step_cost, (0, 0))
    start_states = sorted(g.actions)
    g.set_state(start_states[np.random.randint(len(start_states))])

    cur_game_states_actions = []
    cur_game_rewards = [0]

    # play one episode
    t = 0
    while t < max_iteration and not g.is_terminal(g.current_state()):
        s = g.current_state()
        a = policy[s]
        cur_game_states_actions.append((s, a))
        cur_game_rewards.append(g.move(a))
        t += 1

    # closing entry: the state in which the episode stopped, with no action
    cur_game_states_actions.append((g.current_state(), ''))

    return cur_game_states_actions, cur_game_rewards


def main(step_cost, tot_iteration, gamma=0.9, max_steps=20):
    """Estimate Q with first-visit Monte Carlo and print the greedy policy.

    step_cost:     per-step reward forwarded to the grid world.
    tot_iteration: number of episodes to play.
    gamma:         discount factor applied to future rewards.
    max_steps:     maximum number of moves per episode (passed to play_game).

    Each episode follows a fresh policy that picks one legal action per state
    uniformly at random. Walking the episode backwards, the discounted return G
    that followed each (state, action) pair is folded into a running mean
    Q[state, action], but only at the pair's first occurrence in the episode.
    After every update the greedy action of that state is refreshed in
    ``best_policy``. Prints the episode count and the resulting policy; returns
    None.
    """
    g = init_grid_world_penalized(step_cost, (2,0))

    # Q[s, a] = [running mean of the returns, number of returns averaged].
    # The count starts at 0 so the first real return is not diluted by a
    # made-up zero-valued sample.
    Q = {}
    for s in g.all_states:
        for a in ACTION_SPACE:
            Q[s, a] = [0, 0]

    it = 0
    best_policy = {}
    for _ in range(tot_iteration):

        # draw a random deterministic policy for this episode
        policy = {}
        for state, v in actions.items():
            policy[state] = np.random.choice(v)

        # play one game based on that policy
        cur_game_states_actions, cur_game_rewards = play_game(step_cost, policy, max_steps)

        # Step index at which each (state, action) pair first occurs. The final
        # entry of the episode carries no action and is therefore left out.
        first_visit = {}
        for step, pair in enumerate(cur_game_states_actions[:-1]):
            first_visit.setdefault(pair, step)

        # Walk the episode backwards. cur_game_rewards[step + 1] is the reward
        # produced by the pair at index `step`, so the loop stops at step 0 and
        # never wraps around to index -1.
        G = 0
        for step in range(len(cur_game_states_actions) - 2, -1, -1):
            G = cur_game_rewards[step + 1] + gamma * G
            pair = cur_game_states_actions[step]
            state, action = pair
            if g.is_terminal(state) or first_visit[pair] != step:
                continue

            mean, count = Q[pair]
            count += 1
            Q[pair] = [mean + (G - mean) / count, count]

            # greedy action among the legal ones; on ties the first one wins
            best_policy[state] = max(
                g.actions[state], key=lambda act: Q[state, act][0]
            )

        it += 1
    print ("iter: ", it)
    print_policy(best_policy,g)

    print("\n\n")
        

In [88]:
# Run 10000 episodes with a per-step reward of -0.05.
# NOTE(review): no random seed is set in the cells visible here, so the printed
# policy may differ from run to run.
main(-0.05, 10000)

iter:  10000
---------------------------
 R | R | R |   |
---------------------------
 U |   | D |   |
---------------------------
 U | L | R | L |



