In [1]:
import numpy as np
import sys,os
import torch
import random
curr_path = os.path.abspath('')
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)
from rl.envs.simple_grid import DrunkenWalkEnv

## 1. Set Environment

In [2]:
## fix the random seed
def all_seed(env,seed = 1):
    env.seed(seed) 
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed) 

In [4]:
env = DrunkenWalkEnv(map_name="theAlley")
all_seed(env, seed = 1)  ## random seed is 1

## 2. Value Iteration Algorithm

In [27]:
def value_iteration(env, theta=0.005, discount_factor=0.7):
    Q = np.zeros((env.nS, env.nA)) # initialize value table Q
    count = 0
    while True:
        delta = 0.0
        Q_tmp = np.zeros((env.nS, env.nA))
        for state in range(env.nS):
            for a in range(env.nA):
                accum = 0.0
                reward_total = 0.0
                for prob, next_state, reward, done in env.P[state][a]:
                    accum += prob* np.max(Q[next_state, :])
                    reward_total += prob * reward # reward is also an expection, which indicates how much reward you will receive when you arrive at some state.
                Q_tmp[state, a] = reward_total + discount_factor * accum
                delta = max(delta, abs(Q_tmp[state, a] - Q[state, a]))
        Q = Q_tmp
        count += 1
        if delta < theta or count > 1000:
            break 
    return Q

In [24]:
print(env.nS,env.nA)

13 4


In [28]:
Q = value_iteration(env)
print(Q.shape)
print(Q)

(13, 4)
[[1.39599386e+143 1.57049310e+143 2.79198773e+143 1.57049310e+143]
 [1.74499233e+143 3.35911024e+143 6.28197239e+143 3.35911024e+143]
 [3.92623274e+143 7.55799803e+143 1.41344379e+144 7.55799803e+143]
 [8.83402367e+143 1.70054956e+144 3.18024852e+144 1.70054956e+144]
 [1.98765533e+144 3.57777959e+144 7.15555918e+144 3.57777959e+144]
 [4.47222449e+144 8.60903213e+144 1.61000081e+145 8.60903213e+144]
 [1.00625051e+145 1.93703223e+145 3.62250183e+145 1.93703223e+145]
 [2.26406365e+145 4.35832252e+145 8.15062912e+145 4.35832252e+145]
 [5.09414320e+145 9.16945777e+145 1.83389155e+146 9.16945777e+145]
 [1.14618222e+146 2.20640077e+146 4.12625599e+146 2.20640077e+146]
 [2.57891000e+146 4.96440174e+146 9.28407599e+146 4.96440174e+146]
 [5.80254749e+146 1.11699039e+147 2.08891710e+147 1.11699039e+147]
 [3.65560492e+147 4.56950615e+147 4.70006347e+147 4.56950615e+147]]


In [29]:
policy = np.zeros([env.nS, env.nA]) 
for state in range(env.nS):
    best_action = np.argmax(Q[state, :]) 
    policy[state, best_action] = 1

policy = [int(np.argwhere(policy[i]==1)) for i in range(env.nS) ]
print(policy)

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


## 3. Test

In [48]:
num_episode = 2000 
def test(env,policy):
    
    rewards = []  
    success = []  
    for i_ep in range(num_episode):
        ep_reward = 0  
        state = env.reset() 
        while True:
            action = policy[state]
            next_state, reward, done, _ = env.step(action)
            state = next_state
            ep_reward += reward
            if done:
                break
        if state==12: 
            success.append(1)
        else:
            success.append(0)
        rewards.append(ep_reward)
    acc_suc = np.array(success).sum()/num_episode
    print("accuracy is", acc_suc)

In [51]:
test(env, policy)

accuracy is 0.6575
