In [1]:
import gym

In [2]:
# Register a 4x4 FrozenLake variant under a custom id; the environment is
# configured with is_slippery=False and episodes are capped at 100 steps.
gym.envs.register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    max_episode_steps=100,
    reward_threshold=0.74,
    kwargs=dict(map_name='4x4', is_slippery=False),
)

#### Value Iteration

In [61]:
# initializing an arbitrary value function
import copy
import numpy as np
import json

no_states  = 16   # number of states in the environment
no_actions = 4    # number of actions available in each state

# Every state starts with a value estimate of zero.
Vpi_s = dict.fromkeys(range(no_states), 0)

In [62]:
# Start from the uniform random policy: in each of the 16 states, all four
# actions are chosen with probability 0.25.
Policy = {state: [0.25] * 4 for state in range(16)}

In [1]:
def truncated_policy_evaluation(discount, threshold, env_model):
    '''
    Runs in-place value-iteration sweeps over the global ``Vpi_s`` until
    the values stop changing.

    Every sweep replaces each state's value with the best one-step lookahead

        V(s) <- max_a  sum over (p, s', r) of  p * (r + discount * V(s'))

    where the sum runs over *all* transition tuples listed for (s, a), so
    stochastic models are handled as well as deterministic ones. Values
    updated earlier in a sweep are used by states visited later in the
    same sweep. Sweeps repeat until the largest change of any state's value
    within one sweep is below ``threshold``.

    Parameters:
        discount  : discount factor applied to the next state's value
        threshold : stop once the largest per-sweep value change is below this
        env_model : env_model[state][action] is a sequence of transition
                    tuples whose first three fields are
                    (probability, next_state, reward); any further fields
                    are ignored. Actions are indexed 0..len(env_model[state])-1.

    Returns:
        The number of sweeps that were performed (at least 1).
    '''
    global Vpi_s

    iterations = 0          # no. of sweeps performed so far

    while True:
        iterations += 1
        max_diff = 0        # largest value change seen in this sweep

        for state in Vpi_s:
            old_value = Vpi_s[state]

            # q(s, a) for every action: expectation over all possible outcomes
            action_values = []
            for action in range(len(env_model[state])):
                expected_return = sum(
                    outcome[0] * (outcome[2] + discount * Vpi_s[outcome[1]])
                    for outcome in env_model[state][action]
                )
                action_values.append(expected_return)

            Vpi_s[state] = max(action_values)
            max_diff = max(max_diff, abs(old_value - Vpi_s[state]))

        if max_diff < threshold:
            return iterations
            
def policy_improvement(discount, env_model):
    '''
    Performs one sweep of greedy policy improvement on the global ``Policy``
    using the current global ``Vpi_s``.

    For every state the action values

        q(s, a) = sum over (p, s', r) of  p * (r + discount * V(s'))

    are computed over *all* transition tuples listed for (s, a). The state's
    probabilities are then overwritten in place: the probability mass is
    split evenly between the actions whose q-value equals the maximum, and
    every other action gets probability 0.

    Parameters:
        discount  : discount factor applied to the next state's value
        env_model : env_model[state][action] is a sequence of transition
                    tuples whose first three fields are
                    (probability, next_state, reward); any further fields
                    are ignored. Actions are indexed 0..len(env_model[state])-1.

    Returns:
        True if no state's action probabilities changed during this sweep
        (the policy is stable), False otherwise.
    '''
    global Vpi_s, Policy

    converged = True

    for state in Vpi_s:
        previous_probs = list(Policy[state])   # snapshot for the stability check

        # q(s, a) for every action: expectation over all possible outcomes
        action_values = [
            sum(
                outcome[0] * (outcome[2] + discount * Vpi_s[outcome[1]])
                for outcome in env_model[state][action]
            )
            for action in range(len(env_model[state]))
        ]

        # all actions tied for the best q-value share the probability equally
        best_value = max(action_values)
        tied_count = sum(1 for q in action_values if q == best_value)
        share = 1 / tied_count

        # slice assignment keeps the same list object inside Policy
        Policy[state][:] = [share if q == best_value else 0 for q in action_values]

        if Policy[state] != previous_probs:
            converged = False

    return converged

In [64]:
# Value Iteration Loop
# The environment is created in this cell so the notebook runs top to bottom
# on a fresh kernel (no earlier cell defines `env`). `unwrapped` reaches the
# underlying environment regardless of how many wrappers gym.make applies.
env              = gym.make('FrozenLakeNotSlippery-v0')
transition_model = env.unwrapped.P   # P[state][action] -> list of transition tuples

p_iterations = 0        # no of evaluation + improvement rounds
eval_iter    = 0        # total no of evaluation sweeps across all rounds
discount     = 0.9
threshold    = 0.0001

# Alternate evaluation and greedy improvement until an improvement sweep
# leaves the policy unchanged.
while True:
    eval_steps   = truncated_policy_evaluation(discount, threshold, transition_model)
    converged    = policy_improvement(discount, transition_model)
    p_iterations += 1
    eval_iter    += eval_steps

    if converged:
        break
    

In [65]:
# Pretty-print the learned policy and state values as JSON ...
for heading, table in (
    ('Final Policy: ', Policy),
    ('Final State-Value Function: ', Vpi_s),
):
    print(heading, json.dumps(table, indent=3))

# ... followed by the counters collected by the value-iteration loop.
for heading, count in (
    ('Total number of value iteration steps: ', p_iterations),
    ('Total number of policy evaluation steps: ', eval_iter),
):
    print(heading, count)

Final Policy:  {
   "0": [
      0,
      0.5,
      0.5,
      0
   ],
   "1": [
      0,
      0,
      1.0,
      0
   ],
   "2": [
      0,
      1.0,
      0,
      0
   ],
   "3": [
      1.0,
      0,
      0,
      0
   ],
   "4": [
      0,
      1.0,
      0,
      0
   ],
   "5": [
      0.25,
      0.25,
      0.25,
      0.25
   ],
   "6": [
      0,
      1.0,
      0,
      0
   ],
   "7": [
      0.25,
      0.25,
      0.25,
      0.25
   ],
   "8": [
      0,
      0,
      1.0,
      0
   ],
   "9": [
      0,
      0.5,
      0.5,
      0
   ],
   "10": [
      0,
      1.0,
      0,
      0
   ],
   "11": [
      0.25,
      0.25,
      0.25,
      0.25
   ],
   "12": [
      0.25,
      0.25,
      0.25,
      0.25
   ],
   "13": [
      0,
      0,
      1.0,
      0
   ],
   "14": [
      0,
      0,
      1.0,
      0
   ],
   "15": [
      0.25,
      0.25,
      0.25,
      0.25
   ]
}
Final State-Value Function:  {
   "0": 0.5904900000000002,
   "1": 0.65610