[View in Colaboratory](https://colab.research.google.com/github/Yashwardhankaul/move37/blob/master/valueIteration.ipynb)

In [2]:
import numpy
import time 
!pip install gym

Collecting gym
[?25l  Downloading https://files.pythonhosted.org/packages/9b/50/ed4a03d2be47ffd043be2ee514f329ce45d98a30fe2d1b9c61dea5a9d861/gym-0.10.5.tar.gz (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 6.9MB/s 
Collecting pyglet>=1.2.0 (from gym)
[?25l  Downloading https://files.pythonhosted.org/packages/1c/fc/dad5eaaab68f0c21e2f906a94ddb98175662cc5a654eee404d59554ce0fa/pyglet-1.3.2-py2.py3-none-any.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 14.2MB/s 
Building wheels for collected packages: gym
  Running setup.py bdist_wheel for gym ... [?25l- \ | / done
[?25h  Stored in directory: /root/.cache/pip/wheels/cb/14/71/f4ab006b1e6ff75c2b54985c2f98d0644fffe9c1dddc670925
Successfully built gym
Installing collected packages: pyglet, gym
Successfully installed gym-0.10.5 pyglet-1.3.2


In [0]:
import gym

In [0]:
def execute(env, policy, episodeLength=100, render=False):
  totalReward = 0
  start = env.reset()
  for t in range(episodeLength):
    if render:
      env.render()
    action = policy[start]
    start, reward, done, _= env.step(action)
    totalReward += reward
    if done:
      break
  return totalReward

In [0]:
def evaluatePolicy(env, policy, n_episodes=100):
  totalReward = 0.0
  for _ in range(n_episodes):
    totalReward += execute(env, policy)
  return totalReward/ n_episodes

In [0]:
def gen_random_policy():
  return numpy.random.choice(4, size=((16)))

In [10]:
env = gym.make('FrozenLake-v0')
n_policies = 1000
startTime = time.time()
policy_set = [gen_random_policy() for _ in range(n_policies)]
policy_score= [evaluatePolicy(env, p) for p in policy_set]
endTime = time.time()
print("Best Score = %0.2f. Time taken = %4.4f seconds"%(numpy.max(policy_score),endTime-startTime))

Best Score = 0.70. Time taken = 12.2057 seconds


Now using Value Iteration

In [0]:
def execute(env,policy,gamma=1.0,render=False):
  start=env.reset()
  totalReward = 0
  stepIndex = 0
  while True:
    if render:
      env.render()
    start,reward,done,_=env.step(int(policy[start]))
    totalReward += (gamma**stepIndex*reward)
    stepIndex += 1
    if done:
      break
    return totalReward

In [0]:
def evaluatePolicy(env, policy, gamma=1.0, n=100):
  scores=[execute(env,policy,gamma=gamma,render=False) for _ in range(n)]
  return numpy.mean(scores)

In [0]:
def calculatePolicy(v, gamma = 1.0):
  policy = numpy.zeros(env.env.nS)
  for s in range(env.env.nS):
    q_sa = numpy.zeros(env.action_space.n)
    for a in range(env.action_space.n):
      for next_sr in env.env.P[s][a]:
        p,s_,r,_ = next_sr
        q_sa[a] += (p * (r + gamma * v[s_]))
    policy[s] = numpy.argmax(q_sa)
  return policy

In [0]:
def valueIteration(env, gamma = 1.0):
  value = numpy.zeros(env.env.nS)
  max_iterations=10000
  eps = 1e-20
  for i in range(max_iterations):
    prev_v = numpy.copy(value)
    for s in range(env.env.nS):
      q_sa = [sum([p*(r + prev_v[s_]) for p,s_,r,_ in env.env.P[s][a]]) for a in range(env.env.nA)]
      value[s] = max(q_sa)
    if (numpy.sum(numpy.fabs(prev_v - value)) <= eps):
      print("Value-iteration converged at # %d"%(i + 1))
      break
  return value

In [24]:
gamma = 1.0
env= gym.make("FrozenLake-v0")
optimalValue = valueIteration(env,gamma)
startTime = time.time()
policy = calculatePolicy(optimalValue, gamma)
policy_score = evaluatePolicy(env,policy,gamma,n=1000)
endTime= time.time()
print(policy_score)
print("Best score = %0.2f. Time taken = %4.4f seconds"%(numpy.mean(policy_score),endTime-startTime))

Value-iteration converged at # 1373
0.0
Best score = 0.00. Time taken = 0.0219 seconds
