#### **SARSA in FrozenLake**

SARSA algorithm applied to the FrozenLake environment. Once it works on the 4x4 map, give it a try on the 8x8 version.

In [1]:
import numpy as np
import gymnasium as gym
import session_info

In [2]:
# --- SARSA hyperparameters ---

# Temporal-difference update
gamma = 0.9   # Discount factor applied to the bootstrapped next-step value
alpha = 0.1   # Learning rate (step size of each Q-table update)

# Epsilon-greedy exploration schedule
episodes = 1_000              # Number of training episodes
epsilon_decay_rate = 1e-3     # Rate at which epsilon decays across episodes
epsilon = 0.99                # Initial probability of taking a random action

In [3]:
# Build the non-slippery 4x4 FrozenLake environment used for training
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
)

# Q-table: one row per state, one column per action, every entry starting at zero
Q = np.zeros(shape=(env.observation_space.n, env.action_space.n))

def choose_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    Uses the module-level `epsilon`, `env` and `Q`: with probability `epsilon`
    a random action is sampled from the environment's action space, otherwise
    the action with the highest Q-value in row `state` is returned.
    """
    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()  # Explore
    return np.argmax(Q[state, :])  # Exploit
    
    
def epsilon_greedy_policy(state, epsilon):
    """Exploitation vs. exploration: epsilon-greedy action selection.

    Args:
        state: Index of the current state (row of the module-level `Q` table).
        epsilon: Probability of exploring instead of exploiting.

    Returns:
        A random action sampled from `env.action_space` when exploring,
        otherwise the index of the largest Q-value in row `state`.
    """
    if np.random.random() >= epsilon:
        return np.argmax(Q[state])
    return env.action_space.sample()

# SARSA algorithm (on-policy TD control)
#
# For every transition (S, A, R, S', A') the Q-table is updated with
#     Q[S, A] += alpha * (R + gamma * Q[S', A'] - Q[S, A])
# where A' is the action that is actually executed on the next step.

for episode in range(episodes):
    # Epsilon depends only on the episode index, so it is computed once per
    # episode instead of once per step.
    epsilon = np.exp(-epsilon_decay_rate * episode)

    state, _ = env.reset()
    action = epsilon_greedy_policy(state, epsilon)
    done = False

    while not done:
        # Execute the action chosen on the previous iteration. It must not be
        # re-sampled here, otherwise the A' used in the update below would not
        # be the action actually taken next.
        next_state, reward, terminated, truncated, _ = env.step(action)

        # End the episode on a terminal tile OR when the time limit is hit;
        # ignoring `truncated` lets an episode run past the step limit.
        done = terminated or truncated

        next_action = epsilon_greedy_policy(next_state, epsilon)

        # SARSA update. Rows of Q for terminal states are never updated (the
        # loop stops as soon as one is reached), so they stay at zero and the
        # bootstrapped term vanishes on terminal transitions.
        Q[state, action] += alpha * (reward + gamma * Q[next_state, next_action] - Q[state, action])

        state = next_state
        action = next_action



In [4]:
# Test the learned policy
def test_policy(n_episodes=100):
    successes = 0
    for _ in range(n_episodes):
        state, _ = env.reset()
        done = False
        while not done:
            action = np.argmax(Q[state, :])
            state, reward, done, _ , _ = env.step(action)
            if reward == 1:
                successes += 1
    return successes / n_episodes

# Evaluate the greedy policy and report the share of successful episodes
success_rate = test_policy()
print(f"Success rate: {success_rate:.2%}")

# Display the learned Q-table; the header row gives the arrow for each column
print("\nLearned Q-table:")
arrows = ('←', '↓', '→', '↑')
print('[ ', *arrows, ' ]')
print()
print(Q)



Success rate: 100.00%

Learned Q-table:
[  ← ↓ → ↑  ]

[[1.69143478e-01 2.33681765e-01 1.18940420e-01 1.75001747e-01]
 [1.67225789e-01 0.00000000e+00 5.93156688e-02 4.24106368e-02]
 [3.16348841e-02 2.17739281e-01 2.41274949e-03 1.34675297e-02]
 [2.53368357e-02 0.00000000e+00 7.94541718e-04 0.00000000e+00]
 [2.01265872e-01 2.84909114e-01 0.00000000e+00 1.71897090e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 5.91537498e-01 0.00000000e+00 5.58516201e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.02461473e-01 0.00000000e+00 3.84981240e-01 1.97329478e-01]
 [2.50085189e-01 3.69649308e-01 5.33817060e-01 0.00000000e+00]
 [3.90326265e-01 8.04360681e-01 0.00000000e+00 3.71756304e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 3.52092325e-01 7.00138656e-01 2.33704670e-01]
 [3.22542619e-01 7.91418239e-01 1.00000000e+00 4.84414489e-01]


In [5]:
# Testing the results

# Print learned policy
print("Learned Policy:")
print("===============")
print()
policy = np.argmax(Q, axis=1)
# Entries 0-3 are listed in FrozenLake's action order (0=Left, 1=Down, 2=Right, 3=Up);
# entries 4-6 are markers for the Start, Goal and Hole tiles.
policy_symbols = ['←', '↓', '→', '↑', 'S', 'G', 'H']

# Read the grid size from the map itself so that other map sizes (e.g. 8x8)
# are printed correctly as well.
desc = env.unwrapped.desc
n_rows, n_cols = desc.shape

# Mark special positions (states are numbered row by row: row * n_cols + col)
for i in range(n_rows):
    for j in range(n_cols):
        if desc[i][j] == b'H':
            policy[i * n_cols + j] = 6
        elif desc[i][j] == b'G':
            policy[i * n_cols + j] = 5
        elif desc[i][j] == b'S':
            policy[i * n_cols + j] = 4

for i in range(n_rows):
    for j in range(n_cols):
        print(policy_symbols[policy[i * n_cols + j]], end=' ')
    print()



# Print value function (greedy state value: the best Q-value in each state)
print("\nValue Function:")
print("===================")
V = np.max(Q, axis=1)
for i in range(n_rows):
    for j in range(n_cols):
        print(f"{V[i * n_cols + j]:.2f}", end=' ')
    print()

Learned Policy:

S ← ↓ ← 
↓ H ↓ H 
→ → ↓ H 
H → → G 

Value Function:
0.23 0.17 0.22 0.03 
0.28 0.00 0.59 0.00 
0.38 0.53 0.80 0.00 
0.00 0.70 1.00 0.00 


In [6]:
# Test the learned policy with on-screen rendering
#
# A separate environment object is used for the demo so that the training
# environment bound to `env` is not replaced by a human-rendered one (the
# other cells keep using `env`). The map and slipperiness are spelled out
# because they must match the environment the Q-table was trained on.
demo_env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=False, render_mode='human')
total_reward = 0

try:
    state, _ = demo_env.reset()
    state = int(state)
    done = False

    while not done:
        action = np.argmax(Q[state, :])
        state, reward, terminated, truncated, _ = demo_env.step(action)
        state = int(state)
        done = terminated or truncated
        total_reward += reward
        demo_env.render()
finally:
    # Always release the render window, even if the episode raised an error
    demo_env.close()

print(f"\nTotal reward: {total_reward}")


Total reward: 1.0


In [7]:
# Print the versions of the imported packages, Python and the OS as plain text,
# so the environment that produced these results is recorded with the notebook.
session_info.show(html=False)

-----
gymnasium           1.0.0
numpy               1.26.4
session_info        1.0.0
-----
IPython             8.26.0
jupyter_client      8.6.2
jupyter_core        5.7.2
-----
Python 3.12.3 (main, Sep 11 2024, 14:17:37) [GCC 13.2.0]
Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.39
-----
Session information updated at 2024-11-14 19:23
