In [1]:
import numpy as np
import sys,os
curr_path = os.path.abspath('')
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)
from SimpleGrid import DrunkenWalkEnv

In [2]:
def all_seed(env,seed = 1):
    ## fixed random seed
    import numpy as np
    import random
    import os
    env.seed(seed) 
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed) 

In [3]:
env = DrunkenWalkEnv(map_name = "walkInThePark")
all_seed(env, seed = 1) # random seed = 1

  deprecation(


Value iteration

In [4]:

def value_iteration(env, theta=0.005, discount_factor=0.9):
    Q = np.zeros((env.nS, env.nA)) # initilize Q table
    count = 0
    while True:
        delta = 0.0
        Q_tmp = np.zeros((env.nS, env.nA))
        for state in range(env.nS):
            for a in range(env.nA):
                accum = 0.0
                reward_total = 0.0
                for prob, next_state, reward, done in env.P[state][a]:
                    accum += prob* np.max(Q[next_state, :])
                    reward_total += prob * reward
                Q_tmp[state, a] = reward_total + discount_factor * accum
                delta = max(delta, abs(Q_tmp[state, a] - Q[state, a]))
        Q = Q_tmp
        
        count += 1
        # Even if the algorithm does not converge, it exits the loop after 100 iterations
        if delta < theta or count > 100: 
            break 
    return Q

In [5]:
Q = value_iteration(env)
print(Q)

[[1.23719387e+20 1.82636701e+20 1.23719387e+20 1.93254119e+20]
 [2.61716488e+20 3.67074008e+20 1.70773081e+20 4.07982220e+20]
 [5.72302298e+20 8.18297648e+20 3.68942124e+20 8.88139558e+20]
 [1.25285719e+21 1.83061666e+21 8.09487798e+20 1.93498694e+21]
 [2.74346159e+21 4.09889186e+21 1.77745541e+21 4.21584138e+21]
 [5.97711269e+21 9.18524478e+21 3.87243020e+21 8.92890949e+21]
 [1.26283608e+22 1.94070718e+22 8.29912881e+21 1.88916901e+22]
 [2.42792050e+22 4.10314670e+22 1.51206638e+22 2.05776617e+22]
 [1.67805357e+20 3.82913244e+20 2.48131345e+20 3.65746831e+20]
 [3.54587441e+20 7.46554902e+20 3.43970023e+20 7.91550449e+20]
 [7.83532505e+20 1.66680989e+21 7.42624293e+20 1.77008196e+21]
 [1.73415136e+21 3.73992802e+21 1.66430945e+21 3.96941085e+21]
 [3.83998702e+21 8.39323247e+21 3.73561674e+21 8.90911318e+21]
 [6.00915460e+21 1.99985690e+22 5.89220509e+21 2.00123096e+22]
 [1.79685960e+22 4.22483439e+22 1.82249313e+22 3.99186912e+22]
 [3.27215062e+22 8.93253115e+22 3.32368879e+22 4.224070

In [6]:
policy = np.zeros([env.nS, env.nA]) # initialize a policy table
for state in range(env.nS):
    best_action = np.argmax(Q[state, :]) # select policy according Q table given by value iternation
    policy[state, best_action] = 1

policy = [int(np.argwhere(policy[i]==1)) for i in range(env.nS) ]
print(policy)

[3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 1, 3, 3, 1, 1, 1, 1, 1, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 1]


test?

In [7]:
num_episode = 1000 # test 1000 times
def test(env,policy):
    
    rewards = []  # rewards for all rounds
    success = []  # whether the round is successful to the end
    for i_ep in range(num_episode):
        ep_reward = 0  # the reward for each episode
        state = env.reset()  # reset environment -> restart a new episode, state=0
        while True:
            action = policy[state]  # choose an action
            next_state, reward, done, _ = env.step(action)  # interact with the environment
            state = next_state  # update state
            ep_reward += reward
            if done:
                break
        if state==47: # reach the end
            success.append(1)
        else:
            success.append(0)
        rewards.append(ep_reward)
    acc_suc = np.array(success).sum()/num_episode
    print("The successful rate of the test：", acc_suc)
    

In [8]:
test(env,policy)

The successful rate of the test： 0.603
