In [1]:
import gymnasium as gym
from stable_baselines3 import PPO, SAC
import imageio
import torch
import math
import numpy as np
from tqdm import tqdm
import mujoco


In [2]:
class CustomRewardWrapper(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.weight = 1
        self.target_orientation = np.array([1.0, 0.0, 0.0, 0.0])  # w, x, y, z

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        # ------------------------
        # 在这里修改 reward
        # 1️⃣ torso orientation (w, x, y, z) 在 obs[1:5]
        torso_ori = obs[1:5]

        # 2️⃣ 计算四元数偏离程度（L2 范数）
        ori_error = np.linalg.norm(torso_ori - self.target_orientation)

        # 3️⃣ 将偏差转成惩罚（越接近目标，惩罚越小）
        ori_reward = -self.weight * ori_error
        
        # 保持脑袋高度
        upreward = self.env.unwrapped.data.xipos[1][2] - 0.9
        # 鼓励抬脚
        footreward = 0
        if  0.2 <= self.env.unwrapped.data.xipos[6][2] <= 0.6:
            footreward += 1
        if  0.2 <= self.env.unwrapped.data.xipos[9][2] <= 0.6:
            footreward += 1

        # 4️⃣ 累加到原始 reward
        wa = 5
        wb = 2
        wc = 5
        new_reward = reward + wa * upreward + wb*footreward - wc*ori_reward
        return obs, new_reward, terminated, truncated, info


In [3]:
# 创建 Ant 环境l
env = gym.make('Humanoid-v5')
env = CustomRewardWrapper(env)
print(f"obs space: {env.observation_space}, action space: {env.action_space}")

obs space: Box(-inf, inf, (348,), float64), action space: Box(-0.4, 0.4, (17,), float32)


In [4]:
unwrapped_env = env.unwrapped

mj_model = unwrapped_env.model  # MjModel

print(f"qpos size: {mj_model.nq}, qvel size: {mj_model.nv}, num_joints: {mj_model.njnt}")  # 都是旋转关节，所以这一项都相同
print(f"actuator size: {mj_model.nu}, ctrl_size: {unwrapped_env.data.ctrl.shape}")  # actuators and muscle
print(f"body_size: {mj_model.nbody}, body pos size: {unwrapped_env.data.xipos.shape}")  # nbody, 3

# print(f"action range: {env.action_space.low} to {env.action_space.high}")


qpos_idx = 0
for joint_id in range(mj_model.njnt):
    joint_name = mujoco.mj_id2name(mj_model, mujoco.mjtObj.mjOBJ_JOINT, joint_id)
    joint_type = mj_model.jnt_type[joint_id]
    
    # 根据关节类型确定占用的 qpos 数量
    if joint_type == mujoco.mjtJoint.mjJNT_FREE:    # 自由关节：7个qpos (x,y,z,qw,qx,qy,qz)
        for i, coord in enumerate(['x', 'y', 'z', 'qw', 'qx', 'qy', 'qz']):
            print(f"qpos[{qpos_idx:2d}]: {joint_name}_{coord}")
            qpos_idx += 1
    elif joint_type == mujoco.mjtJoint.mjJNT_HINGE:  # 铰链关节：1个qpos
        print(f"qpos[{qpos_idx:2d}]: {joint_name}")
        qpos_idx += 1
    elif joint_type == mujoco.mjtJoint.mjJNT_SLIDE:  # 滑动关节：1个qpos  
        print(f"qpos[{qpos_idx:2d}]: {joint_name}")
        qpos_idx += 1


data  = mujoco.MjData(mj_model)
mujoco.mj_forward(mj_model, data)  # 必须有这一步
pos = data.xipos            # shape = (nbody, 3)
x, y, z = pos[:, 0], pos[:, 1], pos[:, 2]
names = [mujoco.mj_id2name(mj_model, mujoco.mjtObj.mjOBJ_BODY, i)
         for i in range(mj_model.nbody)]

for i, name in enumerate(names):
    print(f"body[{i:2d}]: {name}, pos=({x[i]:.3f}, {y[i]:.3f}, {z[i]:.3f})")


env.close()

qpos size: 24, qvel size: 23, num_joints: 18
actuator size: 17, ctrl_size: (17,)
body_size: 14, body pos size: (14, 3)
qpos[ 0]: root_x
qpos[ 1]: root_y
qpos[ 2]: root_z
qpos[ 3]: root_qw
qpos[ 4]: root_qx
qpos[ 5]: root_qy
qpos[ 6]: root_qz
qpos[ 7]: abdomen_z
qpos[ 8]: abdomen_y
qpos[ 9]: abdomen_x
qpos[10]: right_hip_x
qpos[11]: right_hip_z
qpos[12]: right_hip_y
qpos[13]: right_knee
qpos[14]: left_hip_x
qpos[15]: left_hip_z
qpos[16]: left_hip_y
qpos[17]: left_knee
qpos[18]: right_shoulder1
qpos[19]: right_shoulder2
qpos[20]: right_elbow
qpos[21]: left_shoulder1
qpos[22]: left_shoulder2
qpos[23]: left_elbow
body[ 0]: world, pos=(0.000, 0.000, 0.000)
body[ 1]: torso, pos=(-0.003, 0.000, 1.435)
body[ 2]: lwaist, pos=(-0.010, 0.000, 1.140)
body[ 3]: pelvis, pos=(-0.029, 0.000, 0.975)
body[ 4]: right_thigh, pos=(-0.008, -0.095, 0.765)
body[ 5]: right_shin, pos=(-0.005, -0.090, 0.382)
body[ 6]: right_foot, pos=(-0.003, -0.090, 0.182)
body[ 7]: left_thigh, pos=(-0.008, 0.095, 0.765)
body[ 

In [None]:
model = SAC(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=1e-4,
    buffer_size=1_000_000,      # 经验回放缓冲区大小. 这个参数PPO没有
    batch_size=256,             # 默认256
    tau=0.005,                  # 软更新系数
    gamma=0.99,                 # 折扣因子
    train_freq=1,               # 每步都训练，采集多少个环境步的数据后训练一次
    gradient_steps=1,           # 对replayBuffer中读取到的batch，进行多少次梯度下降更新
    tensorboard_log="./tb_log/",   # 日志目录
)

# 训练模型, total_timesteps自行调整
model.learn(total_timesteps=24000, tb_log_name="sac", progress_bar=True )
# 保存模型
model.save("humanoid_sac_upward")

In [4]:
policy_kwargs = dict(
    net_arch=dict(pi=[128, 64], qf=[256, 64]), # 每个隐藏层的神经元数量，也可以写成 [400, 300] 等
    activation_fn=torch.nn.ReLU  # 激活函数，可改为 torch.nn.Tanh
)

def warm_sin_lr(progress_remaining: float) -> float:
    """
    progress_remaining: 1 -> 0
    假设总共训练T步：
      - 前10% steps: 线性从 1e-5 升到 3e-4 (warm-up)
      - 之后: 按正弦方式从 3e-4 降到 1e-5
    """
    lr_min = 5e-5   
    lr_max = 1e-3
    warm_ratio = 0.05  # 10% warm-up

    # progress_remaining=1 -> step=0; progress_remaining=0 -> step=end
    progress_done = 1.0 - progress_remaining

    if progress_done < warm_ratio:
        # warm-up: 线性上升
        return lr_min + (lr_max - lr_min) * (progress_done / warm_ratio)
    else:
        # sin下降：这里重新归一化到[0,1]
        x = (progress_done - warm_ratio) / (1 - warm_ratio)
        return lr_min + (lr_max - lr_min) * math.sin((1 - x) * math.pi / 2)


model = SAC(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate= warm_sin_lr,  # 2e-4,
    buffer_size=1_000_000,      # 经验回放缓冲区大小. 这个参数PPO没有
    batch_size=256,             # 默认256
    tau=0.005,                  # 软更新系数
    gamma=0.99,                 # 折扣因子
    train_freq=1,               # 每步都训练，采集多少个环境步的数据后训练一次
    gradient_steps=2,           # 对replayBuffer中读取到的batch，进行多少次梯度下降更新
    tensorboard_log="./tb_log/",   # 日志目录
    policy_kwargs=policy_kwargs,  # 将自定义结构传进去
)

# 训练模型, total_timesteps自行调整
model.learn(total_timesteps=240000, tb_log_name="sac_lr", progress_bar=True )
# 保存模型
model.save("humanoid_sac_deeper_lr_up")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./tb_log/sac_lr_5


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 24.2     |
|    ep_rew_mean     | 198      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 2524     |
|    time_elapsed    | 0        |
|    total_timesteps | 97       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.5     |
|    ep_rew_mean     | 183      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 156      |
|    time_elapsed    | 1        |
|    total_timesteps | 180      |
| train/             |          |
|    actor_loss      | -18.9    |
|    critic_loss     | 95.7     |
|    ent_coef        | 0.989    |
|    ent_coef_loss   | -0.28    |
|    learning_rate   | 6.42e-05 |
|    n_updates       | 158      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_me

### 测试模型效果

In [None]:
print(gym.__file__)

In [4]:
# 使用可视化界面记录显示SAC测试结果
# 加载模型
model = SAC.load("humanoid_sac")
# 创建测试环境
env = gym.make("Humanoid-v5", render_mode="human")

for i in range(5):
    # 测试模型
    state, info = env.reset()
    cum_reward = 0
    for _ in tqdm(range(1500)):
        env.render()
        action, _ = model.predict(state)
        next_state, reward, terminated, truncated, info = env.step(action)
        cum_reward += reward
        if terminated or truncated:
            print("累积奖励: ", cum_reward)
            break
            
        state = next_state

env.close()

You are using a GLFW raw input patch. This is not the official GLFW library.


 67%|██████▋   | 999/1500 [00:12<00:06, 81.09it/s] 


累积奖励:  4900.601777551484


 30%|██▉       | 448/1500 [00:05<00:13, 79.15it/s]


累积奖励:  2196.319615474119


 57%|█████▋    | 859/1500 [00:10<00:08, 79.58it/s]


累积奖励:  4211.291475757785


 63%|██████▎   | 941/1500 [00:11<00:06, 79.91it/s]


累积奖励:  4642.733396635265


 34%|███▍      | 515/1500 [00:06<00:12, 78.71it/s]

累积奖励:  2643.336790138943





### 测试代码

In [None]:
# 测试humanoid环境
env = gym.make("Humanoid-v5", render_mode="human")  # human 模式会弹出窗口
state, info = env.reset(seed=0)


for j in range(5):
    reward_sum = 0
    state, info = env.reset()
    for i in range(1000):
        action = env.action_space.sample()  # actor选择动作
        next_state, reward, terminated, truncated, info = env.step(action)
        reward_sum += reward
        env.render()
        if terminated or truncated:
            print("Total reward:", reward_sum)
            break
        else:
            state = next_state


env.close()

In [None]:
!tensorboard --logdir ./ppo_ant_tb/
!tensorboard --logdir ./tb_log/
# then 然后浏览器打开 http://localhost:6006