In [1]:
# gym related import statements.
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) # errors only
import time
import numpy as np

### **Value Iteration Algorithm Implementation**

__For environment `Taxi-v3` we have a model of state transition and reward probabilities available to us i.e. P[s][a] information is present for applying model-based learning approaches.__  

__First, we discuss _Value Iteration Algorithm_ which randomly assigns values to `V(s)` and iteratively updates `Q(s,a)` and `V(s)` values until the convergence of the problem.__

In [2]:
def execute_eps(env, policy, gamma=1.0, render=False):
    obs = env.reset()
    total_reward = 0
    step_ind = 0
    while True:
        if render:
            env.render()
        obs, reward, done, _ = env.step(int(policy[obs]))
        total_reward += (gamma ** step_ind * reward)
        step_ind +=1
        if done:
            break
    return total_reward

In [3]:
def eval_policy(env, policy, gamma=1.0,  render=False, n=100):
    score_values = [execute_eps(env, policy, gamma, render) for _ in range(n)]
    return np.mean(score_values)

In [4]:
def determine_policy(env, v, gamma=1.0):
    policy = np.zeros(env.nS)
    for s in range(env.nS):
        q_sa = np.zeros(env.action_space.n)
        for a in range(env.action_space.n):
            for next_sr in env.P[s][a]:
                p, s_, r, _ = next_sr
                q_sa[a] += (p * (r + gamma * v[s_]))
        policy[s] = np.argmax(q_sa)
    return policy

In [5]:
def value_iteration(env, gamma=1.0):
    value = np.zeros(env.nS)
    max_iterations = 10000
    eps = 1e-10
    for i in range(max_iterations):
        prev_v = np.copy(value)
        q_sa = np.zeros((env.nS, env.nA))
        for s in range(env.nS):
            for a in range(env.nA):
                for p, s_, r, _ in env.P[s][a]:
                    q_sa[s][a] += p * (r + gamma * prev_v[s_])
        value = np.max(q_sa, axis=1)
        if (np.sum(np.fabs(prev_v - value)) <= eps):
            print('Problem converged at iteration %d.' % (i + 1))
            break
    return value

In [10]:
gamma = 1.00 # not caring about future rewards
env = gym.make('FrozenLake8x8-v0')
optimal_value_func = value_iteration(env, gamma)
start_time = time.time()
policy = determine_policy(env, optimal_value_func, gamma)
policy_score = eval_policy(env, policy, gamma, False, n=1000)
end_time = time.time()
print("Best Policy Score = %0.2f and Time taken = %4.4f seconds" % (np.mean(policy_score),
end_time - start_time))

Problem converged at iteration 1599.
Best Policy Score = 0.90 and Time taken = 1.3020 seconds


In [11]:
gamma = 0.99 # caring about future rewards
env = gym.make('FrozenLake8x8-v0')
optimal_value_func = value_iteration(env, gamma)
start_time = time.time()
policy = determine_policy(env, optimal_value_func, gamma)
policy_score = eval_policy(env, policy, gamma, False, n=1000)
end_time = time.time()
print("Best Policy Score = %0.2f and Time taken = %4.4f seconds" % (np.mean(policy_score),
end_time - start_time))

Problem converged at iteration 750.
Best Policy Score = 0.40 and Time taken = 1.3463 seconds


In [9]:
gamma = 1.00 # not caring about future rewards
env = gym.make("Taxi-v3")
optimal_value_func = value_iteration(env, gamma)
start_time = time.time()
policy = determine_policy(env, optimal_value_func, gamma)
policy_score = eval_policy(env, policy, gamma, False, n=1000)
end_time = time.time()
print("Best Policy Score = %0.2f and Time taken = %4.4f seconds" % (np.mean(policy_score),
end_time - start_time))

Best Policy Score = -195.38 and Time taken = 2.7493 seconds


In [8]:
gamma = 0.99 # caring about future rewards
env = gym.make("Taxi-v3")
optimal_value_func = value_iteration(env, gamma)
start_time = time.time()
policy = determine_policy(env, optimal_value_func, gamma)
policy_score = eval_policy(env, policy, gamma, False, n=1000)
end_time = time.time()
print("Best Policy Score = %0.2f and Time taken = %4.4f seconds" % (np.mean(policy_score),
end_time - start_time))

Problem converged at iteration 3125.
Best Policy Score = 6.38 and Time taken = 0.2815 seconds
