In [1]:
import numpy as np
import gym
import time
from lake_envs import *

In [2]:
# Create the environment by its registered id (presumably registered by the
# `lake_envs` star-import above -- verify against that module).
env = gym.make("Deterministic-4x4-FrozenLake-v0")
# Print NumPy floats with three digits of precision in the cell outputs below.
np.set_printoptions(precision=3)

  result = entry_point.load(False)


In [3]:
def policy_evaluation(P, nS, nA, policy, gamma=0.9, tol=1e-3):
	"""Evaluate the value function of a fixed deterministic policy.

	Sweeps over the states in index order, replacing each state's value with
	the expected one-step return of the action the policy selects there.
	Updates are made in place, so states later in a sweep already see the
	values refreshed earlier in the same sweep. Sweeping stops once the
	largest per-state change within a sweep drops below ``tol``.

	Parameters
	----------
	P: nested container
		P[s][a] is an iterable of (probability, next_state, reward, terminal)
		tuples. The terminal flag is not used by this function.
	nS: int
		Number of states.
	nA: int
		Number of actions. Not used here; kept so the signature matches the
		other solvers in this file.
	policy: np.array[nS]
		The policy to evaluate. Maps states to actions.
	gamma: float
		Discount factor applied to the value of the next state.
	tol: float
		Terminate policy evaluation when
			max |value_function(s) - prev_value_function(s)| < tol
		Must be strictly positive.

	Returns
	-------
	value_function: np.ndarray[nS]
		The value function of the given policy, where value_function[s] is
		the value of state s

	Raises
	------
	ValueError
		If ``tol`` is not strictly positive (this also rejects NaN).
	"""
	# A non-positive tolerance can never satisfy `delta < tol`, because delta is
	# a maximum of absolute values; fail fast instead of looping forever.
	if not tol > 0:
		raise ValueError("tol must be strictly positive, got %r" % (tol,))

	value_function = np.zeros(nS)

	while True:
		# Largest absolute change seen in this sweep.
		delta = 0
		for state in range(nS):
			previous_value = value_function[state]

			# Expected one-step return of following the policy from `state`.
			backed_up_value = 0
			for prob, next_state, reward, _ in P[state][policy[state]]:
				backed_up_value += prob * (reward + (gamma * value_function[next_state]))
			value_function[state] = backed_up_value

			delta = max(delta, abs(previous_value - value_function[state]))

		# Terminate policy evaluation when delta < tol
		if delta < tol:
			break

	return value_function

In [4]:
# Shared arguments for the calls in the cells below.
gamma=0.9  # discount factor
tol=1e-3   # convergence tolerance passed to policy_evaluation
# Transition table: P[s][a] is iterated as (prob, next_state, reward, flag)
# tuples by the functions in this notebook.
P = env.P
nS = env.nS  # number of states
nA = env.nA  # number of actions

In [5]:
# Starting point for the checks below: all-zero state values and a policy that
# selects action 0 in every state.
value_function = np.zeros(nS)
policy = np.zeros(nS, dtype=int)

In [6]:
# Evaluate the all-zeros policy; the returned value array is the cell output.
policy_evaluation(P, nS, nA, policy, gamma, tol)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [7]:
def policy_improvement(P, nS, nA, value_from_policy, policy, gamma=0.9):
	"""Return the policy that is greedy with respect to a value function.

	For every state, each action is scored by its expected one-step return
	under ``value_from_policy`` and the highest-scoring action is selected.
	Ties go to the lowest action index, as chosen by ``np.argmax``.

	Parameters
	----------
	P: nested container
		P[s][a] is an iterable of (probability, next_state, reward, terminal)
		tuples. The terminal flag is not used by this function.
	nS: int
		Number of states.
	nA: int
		Number of actions.
	value_from_policy: np.ndarray
		The value calculated from the policy
	policy: np.array
		The previous policy. Not read by this function.
	gamma: float
		Discount factor applied to the value of the next state.

	Returns
	-------
	new_policy: np.ndarray[nS]
		An array of integers. Each integer is the greedy action for that
		state given the environment dynamics and the given value function.
	"""

	def expected_return(state, action):
		# One-step lookahead: accumulate prob * (reward + discounted value).
		total = 0
		for prob, successor, reward, _ in P[state][action]:
			total += prob * (reward + (gamma * value_from_policy[successor]))
		return total

	greedy_policy = np.zeros(nS, dtype='int')
	for state in range(nS):
		scores = [expected_return(state, action) for action in range(nA)]
		greedy_policy[state] = np.argmax(scores)

	return greedy_policy

In [8]:
# Greedy one-step improvement against the all-zero value_function; the
# returned policy array is the cell output.
policy_improvement(P, nS, nA, value_function, policy, gamma)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0])

In [9]:
def policy_iteration(P, nS, nA, gamma=0.9, tol=10e-3):
	"""Runs policy iteration.

	Starts from the policy that picks action 0 everywhere, then alternates
	policy_evaluation() and policy_improvement() until an improvement step
	returns the same policy it was given. The number of
	evaluation/improvement rounds performed is printed before returning.

	Parameters
	----------
	P, nS, nA, gamma:
		Forwarded unchanged to policy_evaluation() and policy_improvement().
	tol: float
		tol parameter used in policy_evaluation()
		NOTE(review): the default 10e-3 equals 1e-2, whereas
		policy_evaluation() itself defaults to 1e-3 -- confirm it is intended.

	Returns:
	----------
	value_function: np.ndarray[nS]
		Values from the last policy_evaluation() call.
	policy: np.ndarray[nS]
		The policy returned by the last policy_improvement() call.
	"""

	policy = np.zeros(nS, dtype=int)
	rounds = 0
	stable = False

	while not stable:
		rounds += 1
		value_function = policy_evaluation(P, nS, nA, policy, gamma, tol)
		improved = policy_improvement(P, nS, nA, value_function, policy, gamma)

		# Converged once the greedy step leaves every state's action unchanged.
		stable = np.array_equal(policy, improved)
		policy = improved

	print(rounds)
	return value_function, policy

In [10]:
# Run policy iteration on the environment. The function prints its round
# count, and the cell displays the returned (value_function, policy) pair.
policy_iteration(env.P, env.nS, env.nA, gamma=0.9, tol=1e-3)

7


(array([0.59 , 0.656, 0.729, 0.656, 0.656, 0.   , 0.81 , 0.   , 0.729,
        0.81 , 0.9  , 0.   , 0.   , 0.9  , 1.   , 0.   ]),
 array([1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]))