In [None]:
import gymnasium as gym
from gymnasium.envs.registration import register
import mujoco
from tqdm import tqdm
import torch
import math

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback

In [None]:
register(
    id="galaxea_r1Pro",
    entry_point="galaxea_r1Pro:Galaxea_r1Pro",
)


In [None]:
# 仿真交互环境
train_env = gym.make("galaxea_r1Pro")
obs, _ = train_env.reset()
print(f"obs space: {train_env.observation_space.shape}, action space: {train_env.action_space.shape}")


log_dir = "./tb_log/"

# 评估环境
eval_env = gym.make("galaxea_r1Pro")   # 你的环境

eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=log_dir+"best_model",  # 自动保存最优模型的目录
    log_path=log_dir,                        # 保存评估日志
    eval_freq=50_000,                          # 每 1 万步评估一次
    n_eval_episodes=5,                         # 每次评估 5 个 episode
    deterministic=True,                        # 评估时用确定性策略
    render=False
)


obs space: (706,), action space: (24,)


In [4]:

# 自定义SAC网络结构
# obs space: (706,), action space: (24,)
policy_kwargs = dict(
    net_arch=dict(pi=[128, 64], qf=[256, 64]), # 每个隐藏层的神经元数量，也可以写成 [400, 300] 等
    activation_fn=torch.nn.ReLU  # 激活函数，可改为 torch.nn.Tanh
)

def warm_sin_lr(progress_remaining: float) -> float:
    """
    progress_remaining: 1 -> 0
    假设总共训练T步：
      - 前10% steps: 线性从 1e-5 升到 3e-4 (warm-up)
      - 之后: 按正弦方式从 3e-4 降到 1e-5
    """
    lr_min = 5e-5   
    lr_max = 1e-4
    warm_ratio = 0.05  # 10% warm-up

    # progress_remaining=1 -> step=0; progress_remaining=0 -> step=end
    progress_done = 1.0 - progress_remaining

    if progress_done < warm_ratio:
        # warm-up: 线性上升
        return lr_min + (lr_max - lr_min) * (progress_done / warm_ratio)
    else:
        # sin下降：这里重新归一化到[0,1]
        x = (progress_done - warm_ratio) / (1 - warm_ratio)
        return lr_min + (lr_max - lr_min) * math.sin((1 - x) * math.pi / 2)

model = SAC(
    "MlpPolicy",
    train_env,
    verbose=1,
    learning_rate=warm_sin_lr,
    buffer_size=1_000_000,      # 经验回放缓冲区大小. 这个参数PPO没有
    batch_size=256,             # 默认256
    tau=0.005,                  # 软更新系数
    gamma=0.99,                 # 折扣因子
    train_freq=1,               # 每步都训练，采集多少个环境步的数据后训练一次
    gradient_steps=1,           # 对replayBuffer中读取到的batch，进行多少次梯度下降更新
    tensorboard_log=log_dir,   # 日志目录
    policy_kwargs=policy_kwargs,  # 将自定义结构传进去
)

# 训练模型, total_timesteps自行调整
model.learn(total_timesteps=240000, 
            tb_log_name="sac", 
            progress_bar=True,
            callback=eval_callback)
# 保存模型
model.save("galaxea_sac_lr_forward")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./tb_log/sac_6


----------------------------------
| rollout/           |           |
|    ep_len_mean     | 67        |
|    ep_rew_mean     | -2.35e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 253       |
|    time_elapsed    | 1         |
|    total_timesteps | 268       |
| train/             |           |
|    actor_loss      | 40.5      |
|    critic_loss     | 1.28e+03  |
|    ent_coef        | 1         |
|    ent_coef_loss   | 0.0412    |
|    learning_rate   | 5.11e-05  |
|    n_updates       | 167       |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 66.5      |
|    ep_rew_mean     | -2.07e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 225       |
|    time_elapsed    | 2         |
|    total_timesteps | 532       |
| train/             |           |
|    actor_loss      | 37.4      |
|    critic_loss    

### 测试模型可视化

In [None]:
# 使用可视化界面记录显示SAC测试结果
# 加载模型
model = SAC.load("galaxea_sac_lr_forward")
# 创建测试环境
visual_env = gym.make("galaxea_r1Pro", render_mode="human")

for i in range(5):
    # 测试模型
    obs, info = visual_env.reset()
    cum_reward = 0
    for _ in tqdm(range(1500)):
        visual_env.render()
        action, _ = model.predict(obs, deterministic=True)
        next_obs, reward, terminated, truncated, info = visual_env.step(action)
        cum_reward += reward
        if terminated or truncated:
            print("累积奖励: ", cum_reward)
            break
            
        obs = next_obs

visual_env.close()

You are using a GLFW raw input patch. This is not the official GLFW library.


100%|██████████| 1500/1500 [00:18<00:00, 79.41it/s]
100%|██████████| 1500/1500 [00:19<00:00, 77.03it/s]
100%|██████████| 1500/1500 [00:38<00:00, 39.42it/s]
100%|██████████| 1500/1500 [02:39<00:00,  9.39it/s]
100%|██████████| 1500/1500 [04:01<00:00,  6.20it/s] 


### 环境debug

In [None]:
# 可视化环境
train_env = gym.make("galaxea_r1Pro", render_mode="human")
unwrapped_env = train_env.unwrapped

model = unwrapped_env.model  # MjModel

print(f"obs space: {train_env.observation_space.shape}, action space: {train_env.action_space.shape}")
print(f"action range: {train_env.action_space.low} to {train_env.action_space.high}")


print(f"actuator size: {model.nu}, ctrl_size: {unwrapped_env.data.ctrl.shape}")  # actuators and muscles
print(f"obs space: {train_env.observation_space.shape}, action space: {train_env.action_space.shape}")


# 随机采样动作
obs, _ = train_env.reset()
for _ in tqdm(range(1000)):
    train_env.render()
    action = train_env.action_space.sample()
    obs, reward, terminated, truncated, info = train_env.step(action)
    if terminated or truncated:
        obs, _ = train_env.reset()
train_env.close()


In [None]:
train_env = gym.make("galaxea_r1Pro")
obs, _ = train_env.reset()
unwrapped_env = train_env.unwrapped

mj_model = unwrapped_env.model  # MjModel

print(f"qpos size: {mj_model.nq}, qvel size: {mj_model.nv}, num_joints: {mj_model.njnt}")  # 都是旋转关节，所以这一项都相同
print(f"actuator size: {mj_model.nu}, ctrl_size: {unwrapped_env.data.ctrl.shape}")  # actuators and muscle
print(f"body_size: {mj_model.nbody}, body pos size: {unwrapped_env.data.xipos.shape}")  # nbody, 3

# print(f"action range: {env.action_space.low} to {env.action_space.high}")


qpos_idx = 0
for joint_id in range(mj_model.njnt):
    joint_name = mujoco.mj_id2name(mj_model, mujoco.mjtObj.mjOBJ_JOINT, joint_id)
    joint_type = mj_model.jnt_type[joint_id]
    
    # 根据关节类型确定占用的 qpos 数量
    if joint_type == mujoco.mjtJoint.mjJNT_FREE:    # 自由关节：7个qpos (x,y,z,qw,qx,qy,qz)
        for i, coord in enumerate(['x', 'y', 'z', 'qw', 'qx', 'qy', 'qz']):
            print(f"qpos[{qpos_idx:2d}]: {joint_name}_{coord}")
            qpos_idx += 1
    elif joint_type == mujoco.mjtJoint.mjJNT_HINGE:  # 铰链关节：1个qpos
        print(f"qpos[{qpos_idx:2d}]: {joint_name}")
        qpos_idx += 1
    elif joint_type == mujoco.mjtJoint.mjJNT_SLIDE:  # 滑动关节：1个qpos  
        print(f"qpos[{qpos_idx:2d}]: {joint_name}")
        qpos_idx += 1


data  = mujoco.MjData(mj_model)
mujoco.mj_forward(mj_model, data)  # 必须有这一步
pos = data.xipos            # shape = (nbody, 3)
x, y, z = pos[:, 0], pos[:, 1], pos[:, 2]
names = [mujoco.mj_id2name(mj_model, mujoco.mjtObj.mjOBJ_BODY, i)
         for i in range(mj_model.nbody)]

for i, name in enumerate(names):
    print(f"body[{i:2d}]: {name}, pos=({x[i]:.3f}, {y[i]:.3f}, {z[i]:.3f})")


train_env.close()