In [1]:
# TD(0) Frozen Lake with exploitation and exploration

In [2]:
import gymnasium as gym
import numpy as np
import session_info

In [3]:
# Hyperparameters for the tabular agent trained below.
num_episodes = 10_000  # number of training episodes
alpha = 0.8            # step size applied to each value update
gamma = 0.95           # discount applied to future rewards
epsilon = 0.1          # probability of picking a random action

In [4]:
env = gym.make('FrozenLake-v1', is_slippery=False, render_mode='ansi')

# Tabular action values: one row per state, one column per action.
Q = np.zeros([env.observation_space.n, env.action_space.n])

# Decay a local copy of the exploration rate so the configured `epsilon`
# stays untouched and re-running this cell restarts the same schedule.
explore_prob = epsilon

# One-step TD control: every transition moves Q[state, action] towards
# reward + gamma * max_a Q[next_state, a] (greedy bootstrap).
for episode in range(num_episodes):
    state, _ = env.reset()
    state = int(state)
    done = False

    while not done:
        # Epsilon-greedy action selection: explore with probability
        # `explore_prob`, otherwise exploit the current estimates.
        if np.random.random() < explore_prob:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])

        # Take the action and observe the transition.
        next_state, reward, terminated, truncated, _ = env.step(action)
        next_state = int(next_state)
        done = terminated or truncated

        # Reward shaping: penalise episodes that are cut off by truncation
        # and episodes that terminate without any reward (in FrozenLake,
        # falling into a hole), so failure is valued below "nothing yet".
        if (truncated and not terminated) or (terminated and reward == 0):
            reward = -1

        # A terminal state has no future return, so do not bootstrap from it.
        if terminated:
            td_target = reward
        else:
            td_target = reward + gamma * np.max(Q[next_state, :])
        Q[state, action] += alpha * (td_target - Q[state, action])

        state = next_state

    # Anneal exploration after each episode, keeping a floor of 0.01.
    explore_prob = max(0.01, explore_prob * 0.995)

In [13]:
# Testing the results

# Greedy policy and state values derived from the learned Q-table.
policy = np.argmax(Q, axis=1)
V = np.max(Q, axis=1)

# Symbols indexed by action id. FrozenLake encodes actions as
# 0 = Left, 1 = Down, 2 = Right, 3 = Up.
policy_symbols = ['←', '↓', '→', '↑']

# Map layout; each entry is a single byte: S(tart), F(rozen), H(ole), G(oal).
desc = env.unwrapped.desc
n_rows, n_cols = desc.shape

# Print learned policy
print("Learned Policy:")
print("===============")
print()
for i in range(n_rows):
    for j in range(n_cols):
        tile = desc[i, j]
        if tile in (b'S', b'G', b'H'):
            # Show the tile marker itself for start, goal and holes.
            symbol = tile.decode()
        else:
            # States are numbered row-major: state = row * n_cols + col.
            symbol = policy_symbols[policy[i * n_cols + j]]
        print(symbol, end=' ')
    print()

# Print value function
print("\nValue Function:")
print("===================")
for i in range(n_rows):
    for j in range(n_cols):
        print(f"{V[i * n_cols + j]:.2f}", end=' ')
    print()

# Test the learned policy with a greedy rollout in a separate, rendered
# environment. A distinct name keeps the training `env` intact, and the
# window is always released, even if the rollout raises.
eval_env = gym.make('FrozenLake-v1', is_slippery=False, render_mode='human')
try:
    state, _ = eval_env.reset()
    state = int(state)
    done = False
    total_reward = 0

    while not done:
        action = np.argmax(Q[state, :])
        # With render_mode='human' the environment draws itself on every
        # reset/step, so no explicit render() call is needed.
        state, reward, terminated, truncated, _ = eval_env.step(action)
        state = int(state)
        done = terminated or truncated
        total_reward += reward
finally:
    eval_env.close()

print(f"\nTotal reward: {total_reward}")

Learned Policy:

S → ↓ ← 
↓ H ↓ H 
→ ↓ ← H 
H → → G 

Value Function:
0.77 0.74 0.77 0.59 
0.81 0.00 0.81 0.00 
0.86 0.90 0.86 0.00 
0.00 0.95 1.00 0.00 

Total reward: 1.0


In [6]:
# Report the versions of the imported packages and of the runtime
# environment as plain text (html=False) to document how this run was produced.
session_info.show(html=False)

-----
gymnasium           0.29.1
numpy               1.26.4
session_info        1.0.0
-----
IPython             8.26.0
jupyter_client      8.6.2
jupyter_core        5.7.2
-----
Python 3.12.3 (main, Sep 11 2024, 14:17:37) [GCC 13.2.0]
Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.39
-----
Session information updated at 2024-09-19 08:40
