In [1]:
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import numpy as np
import matplotlib.pyplot as plt
import time

## Helper Functions.


### 1. Function to test a policy

In [2]:
def test(env,policy):
    n_episodes = 1000
    avg_length=0
    avg_reward=0
    n_states=env.observation_space.n
    n_actions=env.action_space.n
    for episode in range(n_episodes):
        state = env.reset()[0]
        done = False
        while not done:
            action = policy[state]
            state, reward, done, _, _ = env.step(action)
            avg_length+=1   
            avg_reward+=reward
    
    avg_length/=n_episodes
    avg_reward/=n_episodes
    print(f"Average episode length :{avg_length}")
    print(f"Average reward per episode :{avg_reward}")


### 2. Function to get policy given the action-value function

In [3]:
def get_policy(Q):
    n_states=Q.shape[0]
    policy=np.zeros(n_states,dtype=int)
    for state in range(n_states):
        policy[state] = np.argmax(Q[state])

    return policy


### 3. Function to take greedy action

In [4]:
def greedy_action(Q_s):
    max_val = Q_s.max()
    candidates = np.flatnonzero(Q_s == max_val)
    return np.random.choice(candidates)

# Algorithms

## 1. Monte Carlo (On-policy first-visit for epsilon soft policies)

In [5]:
def MC(env, max_episodes, gamma=0.95):
    epsilon   = 0.1
    n_states  = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states, n_actions))
    N = np.zeros((n_states, n_actions))

    for ep in range(max_episodes):
        t = 0
        first_visit = np.full((n_states, n_actions), -1)             
        episode  = []
        state = env.reset()[0]
        done = False

        while not done:
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = greedy_action(Q[state])

            next_state, reward, done, _, _ = env.step(action)

            if first_visit[state][action] == -1:
                first_visit[state][action] = t

            episode.append((state, action, reward))
            state = next_state
            t += 1

        G = 0
        T = len(episode)
        for t in range(T-1, -1, -1):
            state, action, reward = episode[t]
            G = gamma * G + reward

            if first_visit[state, action] == t:
                N[state, action] += 1
                Q[state, action] += (G - Q[state, action]) / N[state, action]

    return Q


## 2. Sarsa

In [6]:
def Sarsa(env, max_episodes,gamma=0.95):
    alpha_init=0.5
    alpha_end=0.01
    epsilon_init=0.5
    epsilon_end=0.001
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states, n_actions))

    for ep in range(max_episodes):
        
        decay = (max_episodes-ep)/max_episodes
        epsilon = (epsilon_init- epsilon_end)*decay + epsilon_end
        alpha = (alpha_init- alpha_end)*decay + alpha_end
        state = env.reset()[0]
        done = False
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = greedy_action(Q[state])
        
        while not done:
            next_state, reward, done, _, _ = env.step(action)

            if np.random.rand() < epsilon:
                next_action = env.action_space.sample()
            else:
                next_action = greedy_action(Q[next_state])
                
            Q[state, action] += alpha * (reward + gamma*Q[next_state, next_action] - Q[state, action])
            state = next_state
            action = next_action

    return Q


## 3. Q-Learning

In [7]:
def Q_learning(env,max_episodes,gamma=0.95):
    alpha_init=0.5
    alpha_end=0.01
    epsilon_init=0.5
    epsilon_end=0.01
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states,n_actions))
    
    
    for ep in range(max_episodes):
        
        decay = (max_episodes-ep)/max_episodes
        epsilon = (epsilon_init- epsilon_end)*decay + epsilon_end
        alpha = (alpha_init- alpha_end)*decay + alpha_end
        state = env.reset()[0]
        done = False
        while not done:
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = greedy_action(Q[state])
                
            next_state, reward, done, _, _ = env.step(action)
            Q[state,action] += alpha*(reward+gamma*np.max(Q[next_state])-Q[state,action])
            state = next_state
            
    return Q

## 4. Double-Q-Learning

In [8]:
def Double_Q_learning(env, max_episodes, gamma=0.95):
    alpha_init = 0.7
    alpha_end = 0.01
    epsilon_init = 0.7
    epsilon_end = 0.01
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    Q1 = np.zeros((n_states, n_actions))
    Q2 = np.zeros((n_states, n_actions))

    for ep in range(max_episodes):
        decay = (max_episodes - ep) / max_episodes
        epsilon = (epsilon_init - epsilon_end) * decay + epsilon_end
        alpha = (alpha_init - alpha_end) * decay + alpha_end

        state = env.reset()[0]
        done = False

        while not done:
            Q_sum = Q1[state] + Q2[state]
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = greedy_action(Q_sum)

            next_state, reward, done, _, _ = env.step(action)

            if np.random.rand() < 0.5:           
                Q1[state,action] += alpha*(reward+gamma*np.max(Q1[next_state])-Q1[state,action])
            else:
                Q2[state,action] += alpha*(reward+gamma*np.max(Q2[next_state])-Q2[state,action])

            state = next_state

    return Q1 + Q2 


## 5. Expected Sarsa

In [34]:
def Expected_Sarsa(env, max_episodes, gamma=0.95):
    
    epsilon_init=0.5
    epsilon_end=0.01
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states,n_actions))
    
    
    for ep in range(max_episodes):
        decay = (max_episodes-ep)/max_episodes
        epsilon = (epsilon_init- epsilon_end)*decay + epsilon_end
        alpha= 0.5
        state = env.reset()[0]
        done = False
        while not done:
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = greedy_action(Q[state])
                
            next_state, reward, done, _, _ = env.step(action)
            if done:
                expected_val = 0
            else:
                expected_val = gamma * ((1 - epsilon) * np.max(Q[next_state]) + (epsilon / n_actions) * np.sum(Q[next_state]))
                
            Q[state,action] += alpha*(reward+expected_val-Q[state,action])
            state = next_state
            
    return Q
    

## 6. Policy Iteration

In [10]:
def policy_iteration(env,gamma=0.95):
    start=time.time()
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    P=env.unwrapped.P #dynamics of the environment
    Values=np.zeros(n_states) # numpy array to store Value function
    Policy=np.zeros(n_states,dtype=int)

    threshold=1e-4  
    ctr=0
   
    while True:
        ctr+=1
        
        # policy evaluation step     
        delta=float('inf')
        while(delta>threshold):
            
            Values_new=np.zeros(n_states)
            delta=0
            for state in range(n_states):
                action=Policy[state]
                for prob,next_state,reward,_ in  P[state][action]:
                    Values_new[state]+=prob*(reward+gamma*Values[next_state])
            
                delta=max(delta,abs(Values_new[state]-Values[state]))
            Values[:] = Values_new
            
        
        # policy improvement step
        
        stable=True
        for state in range(n_states):
            
            old_action=Policy[state]
            max_v=-float('inf')
            for action in range(n_actions):
                v=0
                for prob,next_state,reward,_ in  P[state][action]:
                    v+=prob*(reward+gamma*Values[next_state])
                if (v>max_v):
                    Policy[state]=action
                    max_v=v
    
            if(old_action!=Policy[state]): stable=False
    
        if (stable):
            break
    end=time.time()
    print(f"Policy Iteration took {ctr} iterations to converge")
    print(f"Policy Iteration took {end-start} seconds to converge")

    return Policy     

#### Now lets test these algorithms on frozen lake environment-

#### The implementation of the environment is in Custom.py. 
##### I will use the algorithm on two grids:
##### The first grid is 10x10 with slipperiness enabled

In [11]:
from gymnasium.envs.registration import register
from Custom import *

register(
    id='CustomFrozenLake-v0',
    entry_point='Custom:CustomFrozenLakeEnv' 
)
learn_env = gym.make('CustomFrozenLake-v0',P=custom1_prob,test=False)
test_env = gym.make('CustomFrozenLake-v0',P=custom1_prob,test=True)


### Monte carlo

In [31]:
start = time.time()
Q = MC(learn_env,40000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Monte Carlo :{end-start} seconds")
test(test_env,policy)

Time taken by Monte Carlo :24.2641704082489 seconds
Average episode length :63.633
Average reward per episode :0.937


### Sarsa

In [13]:
start = time.time()
Q = Sarsa(learn_env,2000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Sarsa :{end-start} seconds")
test(test_env,policy)


Time taken by Sarsa :1.0861554145812988 seconds
Average episode length :63.699
Average reward per episode :0.953


### Q-Learning

In [14]:
start = time.time()
Q = Q_learning(learn_env,2000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Q-learning :{end-start} seconds")
test(test_env,policy)

Time taken by Q-learning :1.6619250774383545 seconds
Average episode length :64.296
Average reward per episode :1.0


### Double-Q-Learning

In [30]:
start = time.time()
Q = Double_Q_learning(learn_env,2000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Double Q-learning :{end-start} seconds")
test(test_env,policy)

Time taken by Double Q-learning :1.672043800354004 seconds
Average episode length :63.906
Average reward per episode :1.0


### Expected-Sarsa

In [39]:
start = time.time()
Q = Expected_Sarsa(learn_env,2000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Expected-Sarsa :{end-start} seconds")
test(test_env,policy)

Time taken by Expected-Sarsa :1.9502525329589844 seconds
Average episode length :67.814
Average reward per episode :1.0


### Policy Iteration

In [17]:
start = time.time()
policy = policy_iteration(learn_env)
end = time.time()

test(test_env,policy)

Policy Iteration took 14 iterations to converge
Policy Iteration took 0.04755425453186035 seconds to converge
Average episode length :60.57
Average reward per episode :1.0


### This is the second grid 50x50 with slipperniess enabled

In [18]:
learn_env2 = gym.make('CustomFrozenLake-v0',P=custom2_prob,test=False)
test_env2 = gym.make('CustomFrozenLake-v0',P=custom2_prob,test=True)

### Sarsa

In [19]:
start = time.time()
Q = Sarsa(learn_env2,140000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Sarsa :{end-start} seconds")
test(test_env2,policy)


Time taken by Sarsa :163.49777555465698 seconds
Average episode length :357.736
Average reward per episode :0.633


### Q-learning

In [20]:
start = time.time()
Q = Q_learning(learn_env2,100000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Q-learning :{end-start} seconds")
test(test_env2,policy)

Time taken by Q-learning :119.02345752716064 seconds
Average episode length :360.623
Average reward per episode :0.662


### Double Q-Learning

In [21]:
start = time.time()
Q = Double_Q_learning(learn_env2,140000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Double Q-learning :{end-start} seconds")
test(test_env2,policy)

Time taken by Double Q-learning :161.83349299430847 seconds
Average episode length :363.975
Average reward per episode :0.583


### Expected Sarsa

In [35]:
start = time.time()
Q = Expected_Sarsa(learn_env2,90000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Expected-Sarsa :{end-start} seconds")
test(test_env2,policy)

Time taken by Expected-Sarsa :109.16084361076355 seconds
Average episode length :474.402
Average reward per episode :0.614


### Policy iteration

In [23]:
start = time.time()
policy = policy_iteration(learn_env2)
end = time.time()

test(test_env2,policy)

Policy Iteration took 69 iterations to converge
Policy Iteration took 1.9005193710327148 seconds to converge
Average episode length :324.466
Average reward per episode :0.626


# Bonus Task(Cliff walking)
#### Lets test these algorithms on the cliff walk environment

The implementation of the environment is in cliff.py

In [24]:
from gymnasium.envs.registration import register
from cliff import *

register(
    id='CustomCliff-v0',
    entry_point='cliff:CustomCliffEnv' 
)
cliff_env = gym.make("CustomCliff-v0",P=custom_prob)

### Sarsa

In [25]:
start = time.time()
Q = Sarsa(cliff_env,1000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Sarsa :{end-start} seconds")
test(cliff_env,policy)

Time taken by Sarsa :0.6421310901641846 seconds
Average episode length :17.0
Average reward per episode :-17.0


### Q-learning

In [26]:
start = time.time()
Q = Q_learning(cliff_env,1000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Q-learning :{end-start} seconds")
test(cliff_env,policy)

Time taken by Q-learning :0.5027506351470947 seconds
Average episode length :13.0
Average reward per episode :-13.0


### Double-Q-Learning

In [27]:
start = time.time()
Q = Double_Q_learning(cliff_env,1000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Double Q-learning :{end-start} seconds")
test(cliff_env,policy)

Time taken by Double Q-learning :0.4177422523498535 seconds
Average episode length :13.0
Average reward per episode :-13.0


### Expected Sarsa

In [28]:
start = time.time()
Q = Expected_Sarsa(cliff_env,1000)
end = time.time()
policy = get_policy(Q)

print(F"Time taken by Expected Sarsa :{end-start} seconds")
test(cliff_env,policy)

Time taken by Expected Sarsa :0.8062534332275391 seconds
Average episode length :17.0
Average reward per episode :-17.0


### Policy Iteration

In [29]:
start = time.time()
policy = policy_iteration(cliff_env)
end = time.time()

test(cliff_env,policy)

Policy Iteration took 15 iterations to converge
Policy Iteration took 0.007189273834228516 seconds to converge
Average episode length :13.0
Average reward per episode :-13.0
