In [0]:
# Implements policy evaluation, policy iteration, and value iteration,
# based on David Silver's RL Lecture Series, Part 3.

import numpy as np

In [0]:
def calc_value_function(env,state,V,discount):
  """Return the one-step-lookahead value of every action in `state`.

  For each action index in `range(env.nA)` the expected return is the sum,
  over the transitions listed in `env.P[state][action]`, of
  `prob * (reward + discount * V[next_state])`.

  Args:
    env: object exposing `nA` (number of actions) and `P`, where
      `env.P[state][action]` iterates as
      `(prob, next_state, reward, terminated)` tuples.
    state: index of the state to look ahead from.
    V: current state-value estimates, indexable by state.
    discount: discount factor applied to the successor state's value.

  Returns:
    A NumPy array of length `env.nA` holding one value per action.
  """
  q_values = np.zeros(env.nA)

  for a in range(env.nA):
    # Bellman expectation backup over the possible outcomes of action `a`.
    # The `terminated` flag is part of each tuple but is not used here.
    q_values[a] = sum(
      p*(r + discount*V[s_next])
      for p, s_next, r, _terminated in env.P[state][a]
    )

  return q_values


In [0]:
def policy_evaluation(policy,env,discount,cf = 1e-9,max_iterations = 1e9):
  """Iteratively compute the state-value function of `policy`.

  Starting from all-zero values, each sweep applies the Bellman expectation
  backup to every state, updating `V` in place (later states in the same
  sweep see the already-updated values of earlier states). Sweeping stops
  once the largest change in a full sweep drops below `cf`, or after
  `max_iterations` sweeps.

  Args:
    policy: indexable by state; `policy[state]` iterates as the probability
      of taking each action in that state.
    env: object exposing `nS` (number of states) and `P`, where
      `env.P[state][action]` iterates as
      `(prob, next_state, reward, terminated)` tuples.
    discount: discount factor applied to successor state values.
    cf: convergence threshold on the largest per-sweep value change.
    max_iterations: upper bound on the number of sweeps (cast to int).

  Returns:
    A NumPy array of length `env.nS` with the estimated state values. The
    latest estimate is returned even if the sweep budget runs out before
    convergence.
  """
  V = np.zeros(env.nS)

  for _ in range(int(max_iterations)):

    # Largest absolute value change seen during this sweep.
    delta = 0

    for state in range(env.nS):

      v = 0

      for action,action_prob in enumerate(policy[state]):
        for state_prob,next_state,reward,terminated in env.P[state][action]:
          v += action_prob*state_prob*(reward + discount*V[next_state])

      delta = max(delta,abs(V[state]-v))
      V[state] = v

    # Convergence must be judged over the whole sweep: checking after each
    # individual state would stop as soon as a single state happened to be
    # unchanged, leaving the remaining states unevaluated.
    if delta < cf:
      print("Policy evaluated")
      break

  return V

In [0]:
def policy_iteration(env,discount,max_iteraions = 1e9,max_iterations = None):
  """Find a greedy policy by alternating evaluation and improvement.

  Starts from the uniform random policy. Each round evaluates the current
  policy with `policy_evaluation`, then makes every state greedy with
  respect to the one-step lookahead from `calc_value_function`. The loop
  stops as soon as a round leaves every state's action distribution
  unchanged, or after the iteration budget is used up.

  Args:
    env: object exposing `nS`, `nA` and the transition table `P` consumed
      by `policy_evaluation` and `calc_value_function`.
    discount: discount factor passed through to both helpers.
    max_iteraions: iteration budget. The misspelled name is the original
      parameter and is kept so existing callers keep working.
    max_iterations: correctly spelled alias; when given it takes precedence
      over `max_iteraions`.

  Returns:
    `(policy, V)`: `policy` is an `(env.nS, env.nA)` array with one-hot
    rows, and `V` is the value function from the most recent evaluation.
    When the loop ends because the policy is stable, `V` belongs to the
    returned policy; if the budget runs out first, `V` belongs to the policy
    from before the final improvement step. With a budget of zero the
    uniform policy and all-zero values are returned.
  """
  if max_iterations is None:
    max_iterations = max_iteraions

  policy = np.ones((env.nS,env.nA)) / env.nA
  V = np.zeros(env.nS)

  for _ in range(int(max_iterations)):

    V = policy_evaluation(policy,env,discount)

    stable_policy = True

    for state in range(env.nS):

      action_values = calc_value_function(env,state,V,discount)
      greedy_row = np.eye(env.nA)[np.argmax(action_values)]

      # Compare whole rows rather than argmax indices: a uniform row would
      # otherwise look like "action 0" and could be declared stable while V
      # still describes the uniform policy.
      if not np.array_equal(policy[state],greedy_row):
        stable_policy = False

      policy[state] = greedy_row

    # The stability check has to run once per round, inside the loop, so
    # that iteration stops as soon as improvement changes nothing.
    if stable_policy:
      break

  return policy, V

In [0]:
def value_iteration(env,discount,cf = 1e-9,max_iterations = 1e9):
  """Compute state values by value iteration and extract a greedy policy.

  Each sweep replaces every state's value with the largest one-step
  lookahead action value from `calc_value_function`, updating `V` in place.
  Sweeping stops once the largest change in a full sweep drops below `cf`,
  or after `max_iterations` sweeps. A deterministic policy is then built by
  picking, in every state, the action with the highest lookahead value.

  Args:
    env: object exposing `nS`, `nA` and the transition table `P` consumed
      by `calc_value_function`.
    discount: discount factor passed through to `calc_value_function`.
    cf: convergence threshold on the largest per-sweep value change.
    max_iterations: upper bound on the number of sweeps; cast to int
      because the default is written as a float and `range` needs an
      integer.

  Returns:
    `(policy, V)`: `policy` is an `(env.nS, env.nA)` array with one-hot
    rows, and `V` is a length-`env.nS` array of state values.
  """
  V = np.zeros(env.nS)

  for _ in range(int(max_iterations)):

    # Largest absolute value change seen during this sweep.
    delta = 0

    for state in range(env.nS):

      action_values = calc_value_function(env,state,V,discount)
      best_action_value = np.max(action_values)

      delta = max(delta, abs(V[state] - best_action_value))
      V[state] = best_action_value

    if delta < cf:
      break

  policy = np.zeros((env.nS,env.nA))

  for state in range(env.nS):

    action_values = calc_value_function(env,state,V,discount)

    # argmax gives the index of the best action; np.max would give its
    # value, which cannot be used to index the policy row.
    best_action = np.argmax(action_values)
    policy[state][best_action] = 1.0

  return policy,V
