In [1]:
import gymnasium as gym
from stable_baselines3 import PPO, SAC
import imageio
import torch
import math


In [2]:
# Create the Ant environment; this instance is the one handed to the
# PPO / SAC constructors in the training cells.
env = gym.make("Ant-v4")

  logger.deprecation(


In [3]:
# Hidden-layer structure of the policy networks.
policy_kwargs = dict(
    # Activation function for the hidden layers only; the output layers
    # are not affected by this setting.
    activation_fn=torch.nn.ReLU,
    # Separate towers for the actor (pi) and the critic (vf).
    # The dict is passed directly: the legacy list-wrapped form
    # ``[dict(pi=..., vf=...)]`` is deprecated since SB3 v1.8.0 (shared
    # layers were removed) and only produces a warning before being unwrapped.
    net_arch=dict(pi=[128, 64], vf=[256, 64]),
)


def warm_sin_lr(progress_remaining: float) -> float:
    """Learning-rate schedule: linear warm-up followed by a sinusoidal decay.

    With ``progress_done = 1 - progress_remaining``:

      - first 10% of training: linear ramp from ``lr_min`` (5e-4) up to
        ``lr_max`` (8e-4);
      - remaining 90%: quarter-sine decay from ``lr_max`` back down to
        ``lr_min``.

    Args:
        progress_remaining: Fraction of training still to do, going from 1
            (start) to 0 (end). Values outside [0, 1] are clamped.

    Returns:
        The learning rate for the current progress, always within
        ``[lr_min, lr_max]``.
    """
    lr_min = 5e-4  # author's note: 1e-3 and 1e-4 gave good results
    lr_max = 8e-4
    warm_ratio = 0.1  # fraction of training spent in warm-up

    # progress_remaining=1 -> start of training; progress_remaining=0 -> end.
    # Clamp to [0, 1]: a trainer that collects data in fixed-size chunks can
    # run slightly past the requested step budget, which would make
    # progress_remaining negative and push the rate below lr_min.
    progress_done = min(max(1.0 - progress_remaining, 0.0), 1.0)

    if progress_done < warm_ratio:
        # Warm-up: linear increase.
        return lr_min + (lr_max - lr_min) * (progress_done / warm_ratio)

    # Decay: re-normalise the post-warm-up progress to [0, 1], then follow
    # sin((1 - x) * pi / 2), which goes from 1 at x=0 to 0 at x=1.
    x = (progress_done - warm_ratio) / (1.0 - warm_ratio)
    return lr_min + (lr_max - lr_min) * math.sin((1.0 - x) * math.pi / 2)


# Train a PPO agent with Stable-Baselines3.
ppo_settings = dict(
    verbose=1,
    # Key point: a callable schedule is passed instead of a constant rate.
    learning_rate=warm_sin_lr,
    # Number of environment steps collected per rollout.
    n_steps=2048,
    batch_size=128,
    # Number of optimisation passes over each collected rollout. PPO reuses
    # samples several times to improve data efficiency; because the whole
    # batch comes from one policy, updates are kept close to that old policy.
    n_epochs=10,
    gamma=0.99,
    # GAE parameter controlling the bias/variance trade-off.
    gae_lambda=0.95,
    # TensorBoard log directory.
    tensorboard_log="./ant_tb/",
    # Key point: custom network architecture / activation.
    policy_kwargs=policy_kwargs,
)

# "MlpPolicy": a multi-layer perceptron is used as the policy network.
model = PPO("MlpPolicy", env, **ppo_settings)

# Train the model; adjust total_timesteps as needed.
model.learn(total_timesteps=800000, tb_log_name="ppo")

# Save the trained model.
model.save("ppo_ant")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Logging to ./ant_tb/ppo_8




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 43.9     |
|    ep_rew_mean     | -46.9    |
| time/              |          |
|    fps             | 1413     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 85.4       |
|    ep_rew_mean          | -91.7      |
| time/                   |            |
|    fps                  | 1252       |
|    iterations           | 2          |
|    time_elapsed         | 3          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.02257391 |
|    clip_fraction        | 0.218      |
|    clip_range           | 0.2        |
|    entropy_loss         | -11.3      |
|    explained_variance   | -0.00605   |
|    learning_rate        | 0.000508   |
|   

In [None]:
# Train a SAC agent on the same environment.
sac_settings = dict(
    verbose=1,
    learning_rate=3e-4,
    # Replay-buffer size (PPO has no such parameter).
    buffer_size=1_000_000,
    # 256 is the default.
    batch_size=256,
    # Soft-update coefficient.
    tau=0.005,
    # Discount factor.
    gamma=0.99,
    # Train after every step: how many environment steps are collected
    # between training rounds.
    train_freq=1,
    # How many gradient-descent updates are run on batches read from the
    # replay buffer in each training round.
    gradient_steps=1,
    # Log directory.
    tensorboard_log="./ant_tb/",
)

model2 = SAC("MlpPolicy", env, **sac_settings)

# Train the model; adjust total_timesteps as needed.
model2.learn(total_timesteps=2400000, tb_log_name="sac")

# Save the trained model.
model2.save("sac_ant_long")

In [None]:
!tensorboard --logdir ./ppo_ant_tb/

# then 然后浏览器打开 http://localhost:6006

In [4]:

# Watch the trained PPO agent in an on-screen render window.
# Load the saved model.
model = PPO.load("ppo_ant")
# Create the evaluation environment.
env = gym.make("Ant-v4", render_mode="human")

# try/finally guarantees the viewer window is closed even if the loop is
# interrupted (e.g. KeyboardInterrupt) or predict()/step() raises.
try:
    for i in range(5):
        # Run one evaluation episode, capped at 1500 steps.
        state, info = env.reset()
        cum_reward = 0
        for _step in range(1500):
            # No explicit env.render() here: with render_mode="human" the
            # environment draws each frame itself during reset()/step().
            action, _ = model.predict(state)
            state, reward, terminated, truncated, info = env.step(action)
            cum_reward += reward
            if terminated or truncated:
                # Report the cumulative reward of the finished episode.
                print("累积奖励: ", cum_reward)
                break
finally:
    env.close()

  logger.deprecation(


You are using a GLFW raw input patch. This is not the official GLFW library.
累积奖励:  39.624696465469015
累积奖励:  1097.2048343639306
累积奖励:  1083.8343837859636
累积奖励:  1150.8448155390915
累积奖励:  972.775234888472


In [5]:

# Watch the trained SAC agent in an on-screen render window.
# Load the saved model.
# NOTE(review): the SAC training cell in this notebook saves to
# "sac_ant_long", while this cell loads "sac_ant" -- confirm which
# checkpoint is intended.
model = SAC.load("sac_ant")
# Create the evaluation environment.
env = gym.make("Ant-v4", render_mode="human")

# try/finally guarantees the viewer window is closed even if the loop is
# interrupted (e.g. KeyboardInterrupt) or predict()/step() raises.
try:
    for i in range(5):
        # Run one evaluation episode, capped at 1500 steps.
        state, info = env.reset()
        cum_reward = 0
        for _step in range(1500):
            # No explicit env.render() here: with render_mode="human" the
            # environment draws each frame itself during reset()/step().
            action, _ = model.predict(state)
            state, reward, terminated, truncated, info = env.step(action)
            cum_reward += reward
            if terminated or truncated:
                # Report the cumulative reward of the finished episode.
                print("累积奖励: ", cum_reward)
                break
finally:
    env.close()

You are using a GLFW raw input patch. This is not the official GLFW library.
累积奖励:  1864.8707884031992
累积奖励:  1381.6634528642587
累积奖励:  1418.9972541725515
累积奖励:  1260.7382623742226
累积奖励:  1927.6211943472051


In [None]:
# Record a video of the trained PPO agent.
# Load the saved model.
model = PPO.load("ppo_ant")
# Create the evaluation environment; in "rgb_array" mode env.render()
# returns the current frame, which is collected below.
env = gym.make("Ant-v4", render_mode="rgb_array")
# One rendered frame per environment step.
frames = []

# try/finally guarantees the environment is released even if the rollout
# is interrupted or raises.
try:
    obs, info = env.reset()
    for _step in range(1000):
        # Render exactly once per step and keep the frame (a second,
        # discarded render call would only waste time).
        frames.append(env.render())
        action, _ = model.predict(obs)
        next_state, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:
            # Episode finished: start a new one and keep recording.
            obs, info = env.reset()
        else:
            obs = next_state
finally:
    env.close()

# Save the collected frames as a video.
imageio.mimsave("./ppo_ant_video.mp4", frames, fps=30)