In [1]:
import gymnasium as gym
import numpy as np    
import time

# Helper Functions
### 1. Value Iteration

In [2]:
def value_iteration(env,n_states,n_actions,gamma=0.95,threshold=1e-4):
    """Estimate the optimal state-value function with in-place value iteration.

    Parameters
    ----------
    env : object
        Environment whose ``env.unwrapped.P[state][action]`` is an iterable of
        ``(prob, next_state, reward, flag)`` tuples. The fourth field is ignored.
    n_states : int
        Number of states; states are indexed ``0 .. n_states-1``.
    n_actions : int
        Number of actions; actions are indexed ``0 .. n_actions-1``.
    gamma : float, optional
        Discount factor applied to the value of the successor state.
    threshold : float, optional
        Sweeps stop once the largest per-state change within a sweep is no
        longer greater than this value. Defaults to the previously hard-coded 1e-4.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_states`` holding the estimated state values.

    Side effects
    ------------
    Prints the number of sweeps and the elapsed time.
    """
    # perf_counter is the clock intended for measuring short elapsed intervals.
    start=time.perf_counter()
    P=env.unwrapped.P #dynamics of the environment
    Values=np.zeros(n_states) # numpy array to store Value function

    # Start from +inf so that at least one sweep always runs, whatever the threshold.
    delta=float('inf')
    ctr=0
    while (delta>threshold):
        ctr+=1
        delta=0

        for state in range(n_states):
            old_v=Values[state]
            max_v=-float('inf')
            for action in range(n_actions):
                # Expected one-step return of taking `action` in `state`.
                v=0
                for prob,next_state,reward,_ in  P[state][action]:
                    v+=prob*(reward+gamma*Values[next_state])
                max_v = max(max_v, v)

            # In-place update: later states in this sweep already see the new value.
            Values[state]=max_v
            delta=max(abs(Values[state]-old_v),delta)
    end=time.perf_counter()
    print(f"Value Iteration took {ctr} iterations to converge")
    print(f"Value Iteration took {end-start} seconds to converge")

    return Values
        

### 2. Policy Iteration

In [3]:
def policy_iteration(env,n_states,n_actions,gamma=0.95,threshold=1e-4):
    """Find a greedy-stable policy by alternating policy evaluation and improvement.

    Parameters
    ----------
    env : object
        Environment whose ``env.unwrapped.P[state][action]`` is an iterable of
        ``(prob, next_state, reward, flag)`` tuples. The fourth field is ignored.
    n_states : int
        Number of states; states are indexed ``0 .. n_states-1``.
    n_actions : int
        Number of actions; actions are indexed ``0 .. n_actions-1``.
    gamma : float, optional
        Discount factor applied to the value of the successor state.
    threshold : float, optional
        Policy evaluation stops once the largest per-state change within a
        sweep is no longer greater than this value. Defaults to the previously
        hard-coded 1e-4.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n_states``; entry ``s`` is the action chosen in state ``s``.

    Side effects
    ------------
    Prints the number of evaluate/improve rounds and the elapsed time.
    """
    # perf_counter is the clock intended for measuring short elapsed intervals.
    start=time.perf_counter()

    P=env.unwrapped.P #dynamics of the environment
    Values=np.zeros(n_states) # numpy array to store Value function
    Policy=np.zeros(n_states,dtype=int) # initial policy: action 0 everywhere

    ctr=0

    while True:
        ctr+=1

        # Policy evaluation step: iterate the Bellman expectation backup for the
        # current policy. Values carries over between rounds (warm start).
        delta=float('inf')
        while(delta>threshold):

            # Synchronous sweep: every backup reads the previous sweep's Values.
            Values_new=np.zeros(n_states)
            delta=0
            for state in range(n_states):
                action=Policy[state]
                for prob,next_state,reward,_ in  P[state][action]:
                    Values_new[state]+=prob*(reward+gamma*Values[next_state])

                delta=max(delta,abs(Values_new[state]-Values[state]))
            Values[:] = Values_new

        # Policy improvement step: make the policy greedy w.r.t. the evaluated Values.
        stable=True
        for state in range(n_states):

            old_action=Policy[state]
            max_v=-float('inf')
            for action in range(n_actions):
                v=0
                for prob,next_state,reward,_ in  P[state][action]:
                    v+=prob*(reward+gamma*Values[next_state])
                # Strict '>' keeps the lowest-indexed action among exact ties.
                if (v>max_v):
                    Policy[state]=action
                    max_v=v

            if(old_action!=Policy[state]): stable=False

        # Stop once a full improvement pass changes no action.
        if (stable):
            break
    end=time.perf_counter()
    print(f"Policy Iteration took {ctr} iterations to converge")
    print(f"Policy Iteration took {end-start} seconds to converge")

    return Policy

### 3. Function to Derive Policy Given the Value Function


In [4]:
def get_policy(env,Values,n_states,n_actions,gamma=0.95):
    """Return the policy that is greedy with respect to a state-value estimate.

    For each state the one-step lookahead value of every action is computed from
    ``env.unwrapped.P[state][action]`` (tuples of ``(prob, next_state, reward, flag)``;
    the flag is ignored) and the index of the largest one is selected. Ties
    resolve to the lowest action index, as ``argmax`` does.

    Returns an integer numpy array of length ``n_states``.
    """
    transitions = env.unwrapped.P  # dynamics of the environment
    greedy = np.zeros(n_states, dtype=int)

    for s in range(n_states):
        # One-step lookahead values for every action available in state s.
        q_row = np.zeros(n_actions)
        for a in range(n_actions):
            for p, s_next, r, _ in transitions[s][a]:
                q_row[a] += p * (r + gamma * Values[s_next])
        greedy[s] = np.argmax(q_row)

    return greedy


### 4. Function to test policy by running it 1000 times on the environment

In [5]:
def test(env,policy):
    n_episodes = 1000
    avg_length=0
    avg_reward=0
    for episode in range(n_episodes):
        state = env.reset()[0]
        done = False
        length=0
        reward=0
        while not done:
            length+=1
            action = int(policy[state])
            state, reward, done, _, _ = env.step(action)
        avg_length+=length   
        avg_reward+=reward
    
    avg_length/=n_episodes
    avg_reward/=n_episodes
    print(f"Average episode length :{avg_length}")
    print(f"Average reward per episode :{avg_reward}")


### 5. Function to print policy

In [6]:
def action_to_symbol(action):
    """Translate an action index into an arrow glyph.

    0 -> '←', 1 -> '↓', 2 -> '→', 3 -> '↑'. Any other index raises ``KeyError``.
    """
    # Arrows listed in action-index order; enumerate supplies the 0..3 keys.
    arrow_for = dict(enumerate(('←', '↓', '→', '↑')))
    return arrow_for[action]

def print_policy(policy, n_rows, n_cols):
    """Print a policy as an ``n_rows`` x ``n_cols`` grid of arrows.

    ``policy`` is indexed row-major: the cell at row ``r``, column ``c`` shows the
    action stored at ``policy[r * n_cols + c]``. The grid is framed above and
    below by a dashed line two characters wide per column.
    """
    border = "-" * (2 * n_cols)
    print(border)
    for r in range(n_rows):
        offset = r * n_cols
        # Each cell is its arrow followed by one space (so rows end with a space).
        cells = [action_to_symbol(policy[offset + c]) + ' ' for c in range(n_cols)]
        print(''.join(cells))
    print(border)

### 1️⃣ Original Frozen Lake Environment


In [7]:
# Build the stock FrozenLake-v1 environment without rendering.
env = gym.make("FrozenLake-v1", render_mode=None)
# Solve with value iteration, then read off the greedy policy.
# The sizes (16 states, 4 actions) are passed by hand and must match env.unwrapped.P;
# the policy is printed as a 4x4 grid.
Values=value_iteration(env,16,4)
Policy1=get_policy(env,Values,16,4)
print_policy(Policy1, 4, 4)

# NOTE(review): no seed is passed to the environment, so the averages printed by
# test() differ from run to run.
test(env,Policy1)

# Solve the same environment with policy iteration for comparison.
Policy2 = policy_iteration(env,16,4)
print_policy(Policy2, 4, 4)

test(env,Policy2)

Value Iteration took 57 iterations to converge
Value Iteration took 0.004751682281494141 seconds to converge
--------
← ↑ ← ↑ 
← ← ← ← 
↑ ↓ ← ← 
← → ↓ ← 
--------
Average episode length :45.378
Average reward per episode :0.793
Policy Iteration took 6 iterations to converge
Policy Iteration took 0.004015684127807617 seconds to converge
--------
← ↑ ← ↑ 
← ← ← ← 
↑ ↓ ← ← 
← → ↓ ← 
--------
Average episode length :44.28
Average reward per episode :0.782


In [8]:
# NOTE(review): the star import hides which names come from Custom. custom1_prob and
# custom2_prob, used by the cells below, are presumably supplied by it — confirm and
# consider importing them explicitly.
from gymnasium.envs.registration import register
from Custom import *

# Register the custom environment under the id that the gym.make(...) calls below use.
# The entry point string refers to CustomFrozenLakeEnv in the Custom module.
register(
    id='CustomFrozenLake-v0',
    entry_point='Custom:CustomFrozenLakeEnv' 
)

### 2️⃣ Custom Frozen Lake Environment


In [9]:
# Build the registered custom environment, passing custom1_prob as its P keyword argument.
# NOTE(review): custom1_prob is presumably a transition table for a 16-state, 4-action
# map (matching the sizes passed below) — confirm in Custom.py.
Custom_env = gym.make('CustomFrozenLake-v0',P=custom1_prob)

# Value iteration, followed by greedy policy extraction, a 4x4 printout and a rollout test.
Values=value_iteration(Custom_env, 16, 4)
Policy1=get_policy(Custom_env,Values, 16, 4)
print_policy(Policy1, 4, 4)
test(Custom_env,Policy1)

# Policy iteration on the same environment for comparison.
Policy2=policy_iteration(Custom_env,16,4)
print_policy(Policy2, 4, 4)
test(Custom_env,Policy2)

Value Iteration took 57 iterations to converge
Value Iteration took 0.0047681331634521484 seconds to converge
--------
← ↑ ← ↑ 
← ← ← ← 
↑ ↓ ← ← 
← → ↓ ← 
--------
Average episode length :43.306
Average reward per episode :0.772
Policy Iteration took 6 iterations to converge
Policy Iteration took 0.004185199737548828 seconds to converge
--------
← ↑ ← ↑ 
← ← ← ← 
↑ ↓ ← ← 
← → ↓ ← 
--------
Average episode length :44.184
Average reward per episode :0.769


### 3️⃣ Custom Frozen Lake Extended Environment


In [10]:
# Rebinds Custom_env to a new instance built with custom2_prob as its P keyword argument.
# NOTE(review): custom2_prob is presumably a transition table for a 64-state, 4-action
# map (matching the sizes passed below) — confirm in Custom.py.
Custom_env = gym.make('CustomFrozenLake-v0',P=custom2_prob)

# Value iteration, followed by greedy policy extraction, an 8x8 printout and a rollout test.
Values = value_iteration(Custom_env, 64, 4)
Policy1 = get_policy(Custom_env,Values, 64, 4)
print_policy(Policy1, 8, 8)
test(Custom_env,Policy1)

# Policy iteration on the same environment for comparison.
Policy2 = policy_iteration(Custom_env,64,4)
print_policy(Policy2, 8, 8)
test(Custom_env ,Policy2) 

Value Iteration took 60 iterations to converge
Value Iteration took 0.0206146240234375 seconds to converge
----------------
← ← ← ← ← ↑ → ← 
↓ ↓ ← ↑ ← ← → ↓ 
↓ ↓ ← ← → ↓ ↓ ↑ 
↓ ↓ ↓ ↓ ↓ ↑ ← ← 
↓ ↓ ↓ ↓ ← ← → ↓ 
↓ ↓ → ↑ ↓ ↓ ← ← 
↓ → ← ← → ↓ ↓ ↓ 
↓ → ↓ ↓ → → → ← 
----------------
Average episode length :65.259
Average reward per episode :0.957
Policy Iteration took 12 iterations to converge
Policy Iteration took 0.03955841064453125 seconds to converge
----------------
← ← ← ← ← ↑ → ← 
↓ ↓ ← ↑ ← ← → ↓ 
↓ ↓ ← ← → ↓ ↓ ↑ 
↓ ↓ ↓ ↓ ↓ ↑ ← ← 
↓ ↓ ↓ ↓ ← ← → ↓ 
↓ ↓ → ↑ ↓ ↓ ← ← 
↓ → ← ← → ↓ ↓ ↓ 
↓ → ↓ ↓ → → → ← 
----------------
Average episode length :64.061
Average reward per episode :0.952
