In [1]:
# import required libraries
import gymnasium as gym
import numpy as np

# Seed passed to env.reset(seed=...) further down in the notebook
SEED = 106

In [2]:
# Build the 4x4 Frozen Lake environment; with the 'ansi' render mode,
# env.render() yields text that the notebook prints after resetting
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=True,
    render_mode="ansi",
)

In [3]:
# Reset the environment with the fixed seed, then print its text rendering
env.reset(seed=SEED)
print(env.render())


[41mS[0mFFF
FHFH
FFFH
HFFG



In [4]:
def value_iteration(env, num_of_iterations = 100, gamma = 1.0, threshold = 1e-40):
    """
    Run synchronous value iteration and return the resulting state-value table.

    Every sweep first snapshots the current table, then overwrites each state's
    value with the largest expected one-step return over its actions. The
    expectation is taken over the transition tuples found in
    ``env.unwrapped.P[state][action]``; successor values are always read from
    the snapshot, never from values already updated in the same sweep.

    The number of states and actions is printed once at the start, and a
    message is printed if the loop stops early.

    :param env: environment exposing ``observation_space.n``, ``action_space.n``
        and a transition table ``unwrapped.P`` whose entries are 4-tuples
        ``(prob, next_state, reward, _)`` (the fourth element is ignored here)
    :param num_of_iterations: maximum number of sweeps over all states
    :param gamma: discount factor applied to the successor state's value
    :param threshold: stop early once the summed absolute change of the table
        over one sweep is less than or equal to this value
    :return: numpy array with one value per state
    """
    # Get no. of state and actions in an environment
    num_of_states = env.observation_space.n
    num_of_actions = env.action_space.n

    print('Number of states: ', num_of_states)
    print('Number of actions: ', num_of_actions)

    # The transition table does not change between sweeps, so look it up once
    transitions = env.unwrapped.P

    # Initialize value table with zero for each state
    value_table = np.zeros(num_of_states)

    for i in range(num_of_iterations):
        # Snapshot of the values from the previous sweep; all backups in this
        # sweep read from it while value_table is overwritten in place
        previous_values = np.copy(value_table)

        for state in range(num_of_states):
            q_values = []

            for action in range(num_of_actions):
                # Expected return of taking `action` in `state`
                q_value = 0
                for prob, next_state, reward, _ in transitions[state][action]:
                    q_value += prob * (reward + gamma * previous_values[next_state])

                q_values.append(q_value)

            # Greedy backup: keep the best action's expected return
            value_table[state] = max(q_values)

        # Check for convergence: total absolute change over this sweep
        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            print("Execution halted in iteration {} ".format(i))
            break

    return value_table


In [5]:
# Print Value Table
optimal_value_table = value_iteration(env, num_of_iterations = 100)
print(optimal_value_table)

Number of states:  16
Number of actions:  4
[0.74419029 0.71786905 0.69921264 0.68954284 0.74998193 0.
 0.47290225 0.         0.7611395  0.7768436  0.72358054 0.
 0.         0.84920568 0.9239777  0.        ]


In [6]:
def extract_policy(env, value_table, gamma = 1.0):
    """
    Derive a greedy policy from a state-value table.

    For every state, the expected one-step return of each action is computed
    from the transition tuples in ``env.unwrapped.P[state][action]`` and the
    index of the best-scoring action is stored (the first one on ties, as
    chosen by ``np.argmax``).

    :param env: environment exposing ``observation_space.n``, ``action_space.n``
        and a transition table ``unwrapped.P`` whose entries are 4-tuples
        ``(prob, next_state, reward, _)`` (the fourth element is ignored here)
    :param value_table: indexable collection holding one value per state
    :param gamma: discount factor applied to the successor state's value
    :return: numpy array with one action index per state; the array comes from
        ``np.zeros`` with its default dtype, so the indices are stored as floats
    """
    state_count = env.observation_space.n
    action_count = env.action_space.n

    def expected_return(state, action):
        # Probability-weighted sum of reward plus discounted successor value
        total = 0
        for prob, next_state, reward, _ in env.unwrapped.P[state][action]:
            total += prob * (reward + gamma * value_table[next_state])
        return total

    policy = np.zeros(state_count)

    for state in range(state_count):
        action_scores = np.array(
            [expected_return(state, action) for action in range(action_count)]
        )
        policy[state] = np.argmax(action_scores)

    return policy

In [7]:
# Derive the greedy policy from the computed value table and print it
# (one action index per state; see the Gymnasium FrozenLake docs for the index-to-direction mapping)
optimal_policy = extract_policy(env, optimal_value_table)
print(optimal_policy)

[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]
