#### **Q-Learning on the FrozenLake environment**

This notebook trains a tabular Q-learning agent on the standard 4x4 map with slipping disabled (`is_slippery=False`). Would you dare to try it with the 8x8 map?

In [1]:
import numpy as np
import gymnasium as gym
import session_info

In [2]:
# --- Q-learning hyperparameters ---------------------------------------------

# Training budget
episodes = 1_000     # number of training episodes
max_steps = 1_000    # upper bound on steps taken within one episode

# Temporal-difference update
alpha = 0.1          # learning rate
gamma = 0.99         # discount factor

# Epsilon-greedy exploration.
# NOTE(review): the training loop recomputes epsilon as
# exp(-epsilon_decay_rate * episode); with the values below that stays
# between 1.0 and roughly 0.99 over all episodes, so the agent explores
# almost uniformly at random during training — confirm this is intended.
epsilon = 0.9
epsilon_decay_rate = 1e-5

In [3]:
def epsilon_greedy_policy(state, epsilon):
    """Pick an action for `state`, trading off exploration and exploitation.

    A uniform random number is drawn first; if it falls below `epsilon` a
    random action is sampled from the environment's action space, otherwise
    the action with the largest entry in the Q-table row of `state` is used.

    Relies on the module-level `env` and `Q`.
    """
    should_explore = np.random.random() < epsilon
    if not should_explore:
        # Exploit: greedy action w.r.t. the current Q-table
        return np.argmax(Q[state])
    # Explore: random action
    return env.action_space.sample()

In [4]:
# Create the FrozenLake environment on the 4x4 map with slipping disabled
env = gym.make("FrozenLake-v1", is_slippery=False, map_name="4x4")

# Q-table: one row per state, one column per action, all entries start at zero
Q = np.zeros(shape=(env.observation_space.n, env.action_space.n))



In [6]:
# Q-Learning algorithm - comments mark the changes from the SARSA example
for episode in range(episodes):
    state, _ = env.reset()
    done = False

    # Exploration rate for this episode. It depends only on the episode index,
    # so it is computed once per episode instead of on every step.
    # NOTE: this overwrites the module-level `epsilon`, so the initial value
    # from the parameters cell is not used by the schedule.
    epsilon = np.exp(-epsilon_decay_rate * episode)

    for step in range(max_steps):
        action = epsilon_greedy_policy(state, epsilon)
        next_state, reward, terminated, truncated, _ = env.step(action)

        # Q-learning update
        # The key difference is using max Q-value of next state instead of Q-value of next action.
        # Rows of Q that belong to terminal states are never updated (the loop
        # breaks before such a state becomes `state`), so they stay at zero and
        # the bootstrap term vanishes when the episode terminates.
        Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state, :]) - Q[state, action])

        # End the episode on termination *and* on truncation (e.g. a step
        # limit imposed by the environment); stepping past either signal
        # without a reset is not a valid use of the environment.
        done = terminated or truncated
        if done:
            break
        state = next_state
        # Removed: action = next_action (Q-learning is off-policy, so we don't need to track the next action)

In [7]:
# Test the learned policy
def test_policy(n_episodes=100):
    """Evaluate the greedy policy derived from the module-level Q-table.

    Runs `n_episodes` episodes on the module-level `env`, always taking the
    action with the highest Q-value, and counts the episodes in which a
    reward of 1 was received.

    Args:
        n_episodes: Number of evaluation episodes to run.

    Returns:
        The number of successes divided by `n_episodes`.

    Raises:
        ZeroDivisionError: If `n_episodes` is 0.
    """
    successes = 0
    for _ in range(n_episodes):
        state, _ = env.reset()
        done = False
        while not done:
            action = np.argmax(Q[state, :])
            state, reward, terminated, truncated, _ = env.step(action)
            # Stop on truncation as well as termination: a greedy policy that
            # cycles (e.g. keeps walking into a wall) never terminates, and
            # ignoring `truncated` would make this loop run forever.
            done = terminated or truncated
            if reward == 1:
                successes += 1
    return successes / n_episodes

# Evaluate the greedy policy and report the share of successful episodes
success_rate = test_policy()
print(f"Success rate: {success_rate:.2%}")

# Show the learned Q-table; the header row names the action of each column
print("\nLearned Q-table:")
print('[ ', *('←', '↓', '→', '↑'), ' ]')
print()
print(Q)


Success rate: 100.00%

Learned Q-table:
[  ← ↓ → ↑  ]

[[0.26905916 0.28255367 0.2401106  0.26766058]
 [0.25196846 0.         0.22118874 0.21359538]
 [0.19911552 0.24045566 0.13940258 0.21053942]
 [0.17687729 0.         0.12428172 0.13050084]
 [0.26460905 0.29633447 0.         0.24934925]
 [0.         0.         0.         0.        ]
 [0.         0.27705892 0.         0.16447255]
 [0.         0.         0.         0.        ]
 [0.28106746 0.         0.31584442 0.18620199]
 [0.24055002 0.34348287 0.29498591 0.        ]
 [0.17922417 0.41806973 0.         0.17567777]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.26245495 0.47963341 0.22456076]
 [0.31438153 0.42754372 0.65132156 0.15766483]
 [0.         0.         0.         0.        ]]


In [8]:
# Testing the results

# Print learned policy
print("Learned Policy:")
print("===============")
print()
policy = np.argmax(Q, axis=1)
# Indices 0-3 are the actions in Q-table column order (Left, Down, Right, Up);
# indices 4-6 are markers for the Start, Goal and Hole tiles.
policy_symbols = ['←', '↓', '→', '↑', 'S', 'G', 'H']

# Mark special positions.
# The grid dimensions are read from the map itself so that the printout also
# works for maps other than 4x4 (e.g. the 8x8 map).
desc = env.unwrapped.desc
n_rows, n_cols = desc.shape
for i in range(n_rows):
    for j in range(n_cols):
        cell = i * n_cols + j  # row-major state index of tile (i, j)
        if desc[i][j] == b'H':
            policy[cell] = 6
        elif desc[i][j] == b'G':
            policy[cell] = 5
        elif desc[i][j] == b'S':
            # Note: this replaces the greedy action of the start state with 'S'
            policy[cell] = 4

for i in range(n_rows):
    for j in range(n_cols):
        print(policy_symbols[policy[i * n_cols + j]], end=' ')
    print()
  
    

# Print value function: V(s) = max_a Q(s, a), laid out like the map
print("\nValue Function:")
print("===================")
V = np.max(Q, axis=1)
# Take the grid dimensions from the map instead of hard-coding 4x4, so the
# layout stays correct for other map sizes.
grid_rows, grid_cols = env.unwrapped.desc.shape
for i in range(grid_rows):
    for j in range(grid_cols):
        print(f"{V[i * grid_cols + j]:.2f}", end=' ')
    print()


# Test the learned policy with on-screen rendering.
# A separate variable is used for the rendering environment so that the
# training/evaluation `env` is not replaced, and so that closing the rendering
# environment below does not leave `env` pointing at a closed environment.
# map_name is spelled out to match the map used for training.
render_env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=False, render_mode='human')
try:
    state, _ = render_env.reset()
    state = int(state)
    done = False
    total_reward = 0

    # Follow the greedy policy until the episode terminates or is truncated
    while not done:
        action = np.argmax(Q[state, :])
        state, reward, terminated, truncated, _ = render_env.step(action)
        state = int(state)
        done = terminated or truncated
        total_reward += reward
        render_env.render()
finally:
    # Always release the rendering resources, even if the rollout fails
    render_env.close()

print(f"\nTotal reward: {total_reward}")

Learned Policy:

S ← ↓ ← 
↓ H ↓ H 
→ ↓ ↓ H 
H → → G 

Value Function:
0.28 0.25 0.24 0.18 
0.30 0.00 0.28 0.00 
0.32 0.34 0.42 0.00 
0.00 0.48 0.65 0.00 

Total reward: 1.0


In [None]:
# Show session information as plain text rather than HTML (html=False).
# NOTE(review): presumably this lists the versions of the imported packages
# for reproducibility — confirm against the session_info documentation.
session_info.show(html=False)