In [42]:
import numpy as np
import sys,os
curr_path = os.path.abspath('')
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)
from SimpleGrid import DrunkenWalkEnv

In [43]:
def all_seed(env,seed = 1):
    ## fixed random seed
    import numpy as np
    import random
    import os
    env.seed(seed) 
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed) 

In [44]:
env = DrunkenWalkEnv(map_name = "walkInThePark")
all_seed(env, seed = 1) # random seed = 1

Value iteration

In [45]:

def value_iteration(env, theta=0.005, discount_factor=0.9):
    Q = np.zeros((env.nS, env.nA)) # initilize Q table
    count = 0
    while True:
        delta = 0.0
        Q_tmp = np.zeros((env.nS, env.nA))
        for state in range(env.nS):
            for a in range(env.nA):
                accum = 0.0
                reward_total = 0.0
                for prob, next_state, reward, done in env.P[state][a]:
                    accum += prob* np.max(Q[next_state, :])
                    reward_total += prob * reward
                Q_tmp[state, a] = reward_total + discount_factor * accum
                delta = max(delta, abs(Q_tmp[state, a] - Q[state, a]))
        Q = Q_tmp
        
        count += 1
        # Even if the algorithm does not converge, it exits the loop after 100 iterations
        if delta < theta or count > 100: 
            break 
    return Q

In [46]:
Q = value_iteration(env)
print(Q)

[[3.26467823e+21 5.62729296e+21 5.77922877e+21 3.28638335e+21]
 [4.18843433e+21 1.11513194e+22 1.18912103e+22 6.44946206e+21]
 [8.67612526e+21 2.33922356e+22 2.45019482e+22 1.32817060e+22]
 [1.79833197e+22 4.90720763e+22 5.03581793e+22 2.72849895e+22]
 [3.71877755e+22 1.02732970e+23 1.03193762e+23 5.58855418e+22]
 [7.60768450e+22 2.09702086e+23 2.10795349e+23 1.14177647e+23]
 [1.56410759e+23 4.30646719e+23 3.85575043e+23 2.27500330e+23]
 [3.04932301e+23 7.67812539e+23 4.43646806e+23 3.77521058e+23]
 [6.22577683e+21 1.14691507e+22 1.11012751e+22 4.16672922e+21]
 [7.84736370e+21 2.32519074e+22 2.33197597e+22 8.00589197e+21]
 [1.61660114e+22 4.90075844e+22 4.90770639e+22 1.65928240e+22]
 [3.39842304e+22 1.03314869e+23 1.03095989e+23 3.44047343e+22]
 [7.14331565e+22 2.17063264e+23 2.11176496e+23 7.05494825e+22]
 [1.34864828e+23 4.42981419e+23 4.25882293e+23 1.32286125e+23]
 [3.06066437e+23 9.24423623e+23 8.29997764e+23 2.88227320e+23]
 [6.17388763e+23 1.71647286e+24 9.43248361e+23 4.517007

In [47]:
policy = np.zeros([env.nS, env.nA]) # initialize a policy table
for state in range(env.nS):
    best_action = np.argmax(Q[state, :]) # select policy according Q table given by value iternation
    policy[state, best_action] = 1

policy = [int(np.argwhere(policy[i]==1)) for i in range(env.nS) ]
print(policy)

[2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1]


test?

In [48]:
num_episode = 1000 # test 1000 times
def test(env,policy):
    
    rewards = []  # rewards for all rounds
    success = []  # whether the round is successful to the end
    for i_ep in range(num_episode):
        ep_reward = 0  # the reward for each episode
        state = env.reset()  # reset environment -> restart a new episode, state=0
        while True:
            action = policy[state]  # choose an action
            next_state, reward, done, _ = env.step(action)  # interact with the environment
            state = next_state  # update state
            ep_reward += reward
            if done:
                break
        if state==47: # reach the end
            success.append(1)
        else:
            success.append(0)
        rewards.append(ep_reward)
    acc_suc = np.array(success).sum()/num_episode
    print("The successful rate of the test：", acc_suc)
    

In [49]:
test(env,policy)

The successful rate of the test： 0.901
