# 使用 Gymnasium 进行 Q-Learning 训练 FrozenLake

这是一个使用 Q-Learning 算法来训练一个智能体玩 FrozenLake-v1 游戏的示例，并包含对训练后智能体的评估。

## 1. 导入库

In [8]:
import numpy as np
import gymnasium as gym
import random
from tqdm import tqdm

## 2. 环境初始化

我们加载 FrozenLake 环境，并查看其状态空间和动作空间。

In [9]:
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
    render_mode="rgb_array",
)

# Sizes of the discrete state and action spaces; reused for the Q-table shape.
state_space = env.observation_space.n
action_space = env.action_space.n

# Inspect the observation (state) space.
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space", env.observation_space)
print("Sample observation", env.observation_space.sample())  # a random observation

# Inspect the action space.
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", action_space)
print("Action Space Sample", env.action_space.sample())  # a random action

print("There are", state_space, "可能的状态")
print("There are", action_space, "可能的动作")

_____OBSERVATION SPACE_____ 

Observation Space Discrete(16)
Sample observation 4

 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 3
There are 16 可能的状态
There are 4 可能的动作


## 3. 参数设定

定义 Q-Learning 算法所需的超参数。

In [10]:
# Training schedule.
n_training_episodes = 100_000  # number of training episodes
max_steps = 100  # upper bound on steps within one episode

# Q-learning update.
learning_rate = 0.1
gamma = 0.99

# Epsilon-greedy exploration schedule.
max_epsilon = 1.0
min_epsilon = 0.05
decay_rate = 5e-4

## 4. 辅助函数

这里定义了初始化 Q 表、贪心策略和 Epsilon-Greedy 策略的函数。

In [11]:
def initialize_q_table(state_space, action_space):
    """Return a new all-zero Q-table.

    The result is a NumPy array with ``state_space`` rows and
    ``action_space`` columns.
    """
    return np.zeros(shape=(state_space, action_space))

def greedy_policy(Qtable, state):
    """Return the index of the largest Q-value in the row for ``state``.

    Ties are resolved by ``np.argmax``, which returns the first maximum.
    """
    return np.argmax(Qtable[state])

def epsilon_greedy_policy(Qtable, state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    A number is drawn with ``random.uniform(0, 1)``. If it is greater than
    ``epsilon`` the greedy action from ``greedy_policy`` is returned
    (exploitation); otherwise a uniformly random action index is returned
    (exploration).

    The random action is drawn from ``range(len(Qtable[state]))`` instead of
    a module-level ``env``. The function therefore depends only on its
    arguments, and exploration follows whatever Q-table the caller passes in.
    This assumes the Q-table has one column per action.

    Args:
        Qtable: Q-values indexed as ``Qtable[state][action]``.
        state: Index of the current state.
        epsilon: Exploration probability.

    Returns:
        The index of the chosen action.
    """
    if random.uniform(0, 1) > epsilon:
        # Exploitation: take the best known action.
        return greedy_policy(Qtable, state)
    # Exploration: uniform draw over the actions available in this row.
    return random.randrange(len(Qtable[state]))

## 5. Q-Learning 训练函数

In [12]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
    """Run Q-learning on ``env`` and return the updated Q-table.

    Epsilon is recomputed at the start of every episode as an exponential
    decay from ``max_epsilon`` towards ``min_epsilon``. Each episode runs for
    at most ``max_steps`` steps and stops early when the environment reports
    termination or truncation.

    ``Qtable`` is updated in place and also returned. The step size and the
    discount factor are read from the module-level names ``learning_rate``
    and ``gamma``.
    """
    epsilon_span = max_epsilon - min_epsilon

    for episode in tqdm(range(n_training_episodes)):
        # Exploration rate for this episode.
        epsilon = min_epsilon + epsilon_span * np.exp(-decay_rate * episode)

        state, _ = env.reset()

        for _ in range(max_steps):
            action = epsilon_greedy_policy(Qtable, state, epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Temporal-difference update towards reward + discounted best next value.
            td_target = reward + gamma * np.max(Qtable[next_state])
            td_error = td_target - Qtable[state][action]
            Qtable[state][action] = Qtable[state][action] + learning_rate * td_error

            if terminated or truncated:
                break

            state = next_state

    return Qtable

## 6. 开始训练

现在我们使用上面定义的函数来初始化Q表并开始训练过程。

In [13]:
# Start from an all-zero Q-table.
Qtable_frozenlake = initialize_q_table(state_space, action_space)

# Train it; train() returns the updated table.
Qtable_frozenlake = train(
    n_training_episodes=n_training_episodes,
    min_epsilon=min_epsilon,
    max_epsilon=max_epsilon,
    decay_rate=decay_rate,
    env=env,
    max_steps=max_steps,
    Qtable=Qtable_frozenlake,
)

100%|██████████| 100000/100000 [00:12<00:00, 8099.76it/s]


## 7. 查看训练后的Q表

In [14]:
# Show the learned Q-table under its heading.
print("训练后的Q表：", Qtable_frozenlake, sep="\n")

训练后的Q表：
[[0.94148015 0.95099005 0.93206535 0.94148015]
 [0.94148015 0.         0.9226185  0.93205744]
 [0.93205181 0.7716989  0.42714575 0.73494114]
 [0.71697927 0.         0.19915692 0.24927304]
 [0.95099005 0.96059601 0.         0.94148015]
 [0.         0.         0.         0.        ]
 [0.         0.97932987 0.         0.64056064]
 [0.         0.         0.         0.        ]
 [0.96059601 0.         0.970299   0.95099005]
 [0.96059601 0.9801     0.9801     0.        ]
 [0.97027784 0.99       0.         0.96603371]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.99       0.970299  ]
 [0.9801     0.99       1.         0.9801    ]
 [0.         0.         0.         0.        ]]


## 8. 评估智能体 (Evaluate the Agent)

训练完成后，我们定义一个函数来评估智能体的表现。这个过程不再进行探索（没有 epsilon-greedy），而是完全采用贪心策略（greedy policy），即在每个状态下都选择Q值最高的动作。

In [15]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
    """Run the greedy policy for ``n_eval_episodes`` episodes and summarise the rewards.

    Args:
        env: Environment providing ``reset`` and ``step``.
        max_steps: Maximum number of steps per episode.
        n_eval_episodes: Number of evaluation episodes.
        Q: Q-table passed to ``greedy_policy``.
        seed: Per-episode seeds indexed by episode number. When falsy
            (for example ``None``) the environment is reset without a seed.

    Returns:
        A ``(mean_reward, std_reward)`` tuple over the per-episode reward sums.
    """
    returns = []

    for episode_idx in tqdm(range(n_eval_episodes)):
        reset_kwargs = {"seed": seed[episode_idx]} if seed else {}
        state, _ = env.reset(**reset_kwargs)

        episode_return = 0
        for _ in range(max_steps):
            # No exploration: always take the best action according to Q.
            action = greedy_policy(Q, state)
            state, reward, terminated, truncated, _ = env.step(action)
            episode_return += reward

            if terminated or truncated:
                break

        returns.append(episode_return)

    return np.mean(returns), np.std(returns)

## 9. 运行评估

设置评估参数并运行评估函数。平均奖励（Mean_reward）为 1.0 代表智能体在每次评估中都能成功到达终点。由于环境以 `is_slippery=False` 创建，状态转移是确定性的，因此不同的种子不会改变结果，标准差为 0。

In [16]:
# Evaluation settings.
n_eval_episodes = 1000  # number of evaluation episodes
# One fixed seed per episode keeps the evaluation reproducible.
# Set eval_seed = None to evaluate without seeding.
eval_seed = list(range(n_eval_episodes))

# Evaluate the greedy agent and report the result.
mean_reward, std_reward = evaluate_agent(
    env=env,
    max_steps=max_steps,
    n_eval_episodes=n_eval_episodes,
    Q=Qtable_frozenlake,
    seed=eval_seed,
)
print(f"\nMean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

100%|██████████| 1000/1000 [00:00<00:00, 7609.34it/s]


Mean_reward=1.00 +/- 0.00



