In [1]:
import gymnasium as gym
import numpy as np    


# Helper Functions
### 1. Value Iteration

In [2]:

def value_iteration(env,n_states,n_actions,gamma=0.95):
    """Compute a state-value array by repeated in-place Bellman optimality sweeps.

    Parameters
    ----------
    env : object
        Must expose ``env.unwrapped.P`` such that ``P[state][action]`` is an
        iterable of 4-tuples ``(prob, next_state, reward, _)``. The fourth
        element is ignored here (presumably a terminal flag -- confirm
        against the environment).
    n_states : int
        Number of states; states are the integers ``0 .. n_states - 1``.
    n_actions : int
        Number of actions; actions are the integers ``0 .. n_actions - 1``.
    gamma : float, optional
        Discount factor applied to the value of the successor state.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_states`` holding the value of each state.

    Notes
    -----
    Sweeps stop as soon as the largest per-state change within one sweep is
    no longer above ``1e-8``. The number of sweeps is printed.
    """
    transitions = env.unwrapped.P  # dynamics of the environment
    Values = np.zeros(n_states)
    tolerance = 1e-8

    def backup(state, action):
        # Expected one-step return of `action` in `state` under the current
        # estimates; reads `Values` live, so updates made earlier in the same
        # sweep are already visible.
        total = 0
        for prob, successor, reward, _ in transitions[state][action]:
            total += prob * (reward + gamma * Values[successor])
        return total

    sweeps = 0
    while True:
        sweeps += 1
        largest_change = 0

        for state in range(n_states):
            previous = Values[state]
            best = -float('inf')
            for action in range(n_actions):
                best = max(best, backup(state, action))

            Values[state] = best
            largest_change = max(abs(Values[state] - previous), largest_change)

        # Stop once no state moved by more than the tolerance in this sweep.
        if not largest_change > tolerance:
            break

    print(f"Value Iteration took {sweeps} iterations to converge")
    return Values
        

### 2. Policy Iteration

In [13]:
def policy_iteration(env,n_states,n_actions,gamma=0.95):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Parameters
    ----------
    env : object
        Must expose ``env.unwrapped.P`` such that ``P[state][action]`` is an
        iterable of 4-tuples ``(prob, next_state, reward, _)``; the fourth
        element is ignored here.
    n_states : int
        Number of states; states are the integers ``0 .. n_states - 1``.
    n_actions : int
        Number of actions; actions are the integers ``0 .. n_actions - 1``.
    gamma : float, optional
        Discount factor applied to the value of the successor state.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n_states`` with the chosen action per state.

    Notes
    -----
    The policy starts as action 0 everywhere. The printed count is the total
    number of policy-evaluation sweeps summed over all improvement rounds.
    """
    transitions = env.unwrapped.P  # dynamics of the environment
    Values = np.zeros(n_states)
    Policy = np.zeros(n_states, dtype=int)

    tolerance = 1e-8
    sweeps = 0

    def lookahead(state, action):
        # Expected one-step return of `action` in `state` under `Values`.
        total = 0
        for prob, successor, reward, _ in transitions[state][action]:
            total += prob * (reward + gamma * Values[successor])
        return total

    policy_changed = True
    while policy_changed:

        # Policy evaluation: synchronous sweeps. Each sweep reads only the
        # previous sweep's values and writes into a fresh array, which is
        # copied back in place afterwards.
        while True:
            sweeps += 1
            refreshed = np.zeros(n_states)
            biggest_change = 0
            for state in range(n_states):
                refreshed[state] = lookahead(state, Policy[state])
                biggest_change = max(biggest_change, abs(refreshed[state] - Values[state]))
            Values[:] = refreshed
            if not biggest_change > tolerance:
                break

        # Policy improvement: pick the first action with the strictly largest
        # one-step lookahead value in every state.
        policy_changed = False
        for state in range(n_states):
            incumbent = Policy[state]
            chosen, best = incumbent, -float('inf')
            for action in range(n_actions):
                candidate = lookahead(state, action)
                if candidate > best:
                    chosen, best = action, candidate

            Policy[state] = chosen
            if chosen != incumbent:
                policy_changed = True

    print(f"Policy Iteration took {sweeps} iterations to converge")
    return Policy
            

### 3. Function to Derive Policy Given the Value Function


In [4]:
def get_policy(env,Values,n_states,n_actions,gamma=0.95):
    """Derive the greedy policy implied by a state-value array.

    Parameters
    ----------
    env : object
        Must expose ``env.unwrapped.P`` such that ``P[state][action]`` is an
        iterable of 4-tuples ``(prob, next_state, reward, _)``; the fourth
        element is ignored here.
    Values : indexable
        Value of each state, indexed by state number.
    n_states : int
        Number of states; states are the integers ``0 .. n_states - 1``.
    n_actions : int
        Number of actions; actions are the integers ``0 .. n_actions - 1``.
    gamma : float, optional
        Discount factor applied to the value of the successor state.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n_states``; entry ``s`` is the action with
        the largest one-step lookahead value in state ``s`` (the lowest
        action index wins ties, as ``np.argmax`` does).
    """
    transitions = env.unwrapped.P  # dynamics of the environment

    # q_table[s, a] = expected one-step return of taking action a in state s.
    q_table = np.zeros((n_states, n_actions))
    for state, row in enumerate(q_table):
        # `row` is a view into q_table, so the accumulation lands in the table.
        for action in range(n_actions):
            for prob, successor, reward, _ in transitions[state][action]:
                row[action] += prob * (reward + gamma * Values[successor])

    return np.array([np.argmax(row) for row in q_table], dtype=int)


### 4. Function to test policy by running it 1000 times on the environment

In [5]:
def test(env,policy):
    n_episodes = 1000
    avg_length=0
    avg_reward=0
    for episode in range(n_episodes):
        state = env.reset()
        
        if isinstance(state, tuple):  # Gym returns (obs, info)
            state = state[0]
        done = False
        length=0
        reward=0
        while not done:
            length+=1
            action = int(policy[state])
            state, reward, done, _, _ = env.step(action)
        avg_length+=length   
        avg_reward+=reward
    
    avg_length/=n_episodes
    avg_reward/=n_episodes
    print(f"Average episode length :{avg_length}")
    print(f"Average reward per episode :{avg_reward}")


### 1️⃣ Original Frozen Lake Environment


In [14]:
# Build the stock FrozenLake-v1 environment and read the problem size from
# its spaces instead of hard-coding 16 states / 4 actions.
env = gym.make("FrozenLake-v1", render_mode=None)
n_states = int(env.observation_space.n)
n_actions = int(env.action_space.n)

# Value iteration, then extract the greedy policy from the converged values.
Values=value_iteration(env,n_states,n_actions)
Policy1=get_policy(env,Values,n_states,n_actions)
print(Policy1)

test(env,Policy1)

# Policy iteration on the same environment, for comparison.
Policy2=policy_iteration(env,n_states,n_actions)
print(Policy2)

test(env,Policy2)

Value Iteration took 138 iterations to converge
[0 3 0 3 0 0 0 0 3 1 0 0 0 2 1 0]
Average episode length :43.016
Average reward per episode :0.781
Policy Iteration took 533 iterations to converge
[0 3 0 3 0 0 0 0 3 1 0 0 0 2 1 0]
Average episode length :43.628
Average reward per episode :0.792


### 2️⃣ Custom Frozen Lake Environment


In [None]:
import Custom

Custom_env=Custom.CustomFrozenLakeEnv(Custom.P)

# The original call was `train(...)`, a name that is not defined anywhere in
# this notebook and would raise NameError on a fresh kernel; the value-function
# solver defined above is `value_iteration`.
# NOTE(review): assumes the custom environment exposes `unwrapped.P` and has
# 16 states / 4 actions like the stock map -- confirm against Custom.py.
Values=value_iteration(Custom_env,16,4)
Policy=get_policy(Custom_env,Values,16,4)

test(Custom_env,Policy)
