# Just Stand Up
Train a humanoid to stand up using reinforcement learning.

This notebook sets up the environment and tools, but **you'll need to implement the key logic**.

## 📦 Install dependencies

In [1]:
# !git clone -q https://github.com/ajaykumaar/just_stand_up.git

In [14]:
import sys
# Put the current working directory first on the import path so that local
# packages in this folder can be imported by later cells.
sys.path.insert(0, '.')

In [23]:
%mkdir custom_tests
!touch custom_tests/test_env.py
!touch custom_tests/test_agent.py
!touch custom_tests/__init__.py

In [22]:
# !rm -rf ./test_folder_name//

In [25]:
from custom_tests import test_env

In [32]:
!pip install -q dm_control
!pip install -q stable-baselines3
!pip install -q gym

## 🧠 Load the Humanoid environment (from dm_control)

In [34]:
from dm_control import suite

# Load the "stand" task of the "humanoid" domain from the dm_control suite.
env = suite.load(domain_name="humanoid", task_name="stand")

# observation_spec() returns a mapping from observation name to an array
# spec (shape, dtype, name); printing it shows everything the task exposes.
obs_spec = env.observation_spec()
print("Observation spec:", obs_spec)

# TODO: Wrap this env to make it compatible with stable-baselines3


Observation spec: OrderedDict([('joint_angles', Array(shape=(21,), dtype=dtype('float64'), name='joint_angles')), ('head_height', Array(shape=(), dtype=dtype('float64'), name='head_height')), ('extremities', Array(shape=(12,), dtype=dtype('float64'), name='extremities')), ('torso_vertical', Array(shape=(3,), dtype=dtype('float64'), name='torso_vertical')), ('com_velocity', Array(shape=(3,), dtype=dtype('float64'), name='com_velocity')), ('velocity', Array(shape=(27,), dtype=dtype('float64'), name='velocity'))])


In [31]:
# Show each observation component's spec on its own line.
for component_spec in obs_spec.values():
    print(component_spec)

Array(shape=(21,), dtype=dtype('float64'), name='joint_angles')
Array(shape=(), dtype=dtype('float64'), name='head_height')
Array(shape=(12,), dtype=dtype('float64'), name='extremities')
Array(shape=(3,), dtype=dtype('float64'), name='torso_vertical')
Array(shape=(3,), dtype=dtype('float64'), name='com_velocity')
Array(shape=(27,), dtype=dtype('float64'), name='velocity')


### Visualize

In [2]:
!pip install -q opencv-python
# Step 1: System packages for EGL
!apt-get install -y libosmesa6-dev libgl1-mesa-glx libglfw3

# Step 2: Install mujoco + dm_control
!pip install mujoco==2.3.7
!pip install dm_control


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libdrm-dev libgl-dev libglx-dev libosmesa6 libpciaccess-dev mesa-common-dev
Suggested packages:
  libgles1 libvulkan1
The following NEW packages will be installed:
  libdrm-dev libgl-dev libgl1-mesa-glx libglfw3 libglx-dev libosmesa6 libosmesa6-dev
  libpciaccess-dev mesa-common-dev
0 upgraded, 9 newly installed, 0 to remove and 30 not upgraded.
Need to get 5,850 kB of archives.
After this operation, 18.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libpciaccess-dev amd64 0.16-3 [21.9 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libdrm-dev amd64 2.4.113-2~ubuntu0.22.04.1 [292 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libglx-dev amd64 1.4.0-1 [14.1 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/main amd64 libgl-dev amd64 1.4.0-1 [101 kB]
Get:5

Collecting mujoco>=3.2.7 (from dm_control)
  Using cached mujoco-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
Using cached mujoco-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Installing collected packages: mujoco
  Attempting uninstall: mujoco
    Found existing installation: mujoco 2.3.7
    Uninstalling mujoco-2.3.7:
      Successfully uninstalled mujoco-2.3.7
Successfully installed mujoco-3.3.0


In [1]:
import os
# Select the EGL rendering backend for MuJoCo. The variable is set before the
# dm_control import below so it is already present when that import runs.
os.environ["MUJOCO_GL"] = "egl"

from dm_control import suite


In [2]:
# Build the humanoid "stand" env and render one 320x240 frame from camera 0.
# `frame` is not used further in this cell; the call just exercises rendering.
env = suite.load(domain_name="humanoid", task_name="stand")
frame = env.physics.render(camera_id=0, height=240, width=320)


In [4]:
import cv2
import numpy as np
from dm_control import suite
from IPython.display import HTML
from base64 import b64encode

# Load the humanoid-stand environment
env = suite.load(domain_name="humanoid", task_name="stand")

# Video writer setup
video_path = "./humanoid_stand.avi"
width, height = 320, 240
fps = 30
n_frames = 150  # 150 frames at 30 fps ~ 5 seconds of playback
out = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*"XVID"), fps, (width, height))

# A writer that failed to open drops every frame silently, which would leave
# an empty/missing file with no error. Fail loudly instead.
if not out.isOpened():
    raise RuntimeError(f"Could not open video writer for {video_path!r}")

# The action spec does not change between steps, so query it once.
action_spec = env.action_spec()

# Reset environment
ts = env.reset()

# Collect frames. try/finally guarantees the file is finalised and the writer
# released even if stepping or rendering raises part-way through.
try:
    for _ in range(n_frames):
        # Uniform random action inside the actuator bounds.
        action = np.random.uniform(low=action_spec.minimum,
                                   high=action_spec.maximum,
                                   size=action_spec.shape)
        ts = env.step(action)

        frame = env.physics.render(camera_id=0, height=height, width=width)
        # The rendered frame is converted RGB -> BGR, the channel order OpenCV writes.
        frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        out.write(frame_bgr)
finally:
    out.release()


## ⚙️ Define your Gym wrapper
_You'll need to convert `dm_control` into a Gym-compatible environment._

In [21]:
# # !pip uninstall -y gym gymnasium
# !pip install gym


In [2]:
# List the spec of every observation component, one per line.
for component_spec in obs_spec.values():
    print(component_spec)

Array(shape=(21,), dtype=dtype('float64'), name='joint_angles')
Array(shape=(), dtype=dtype('float64'), name='head_height')
Array(shape=(12,), dtype=dtype('float64'), name='extremities')
Array(shape=(3,), dtype=dtype('float64'), name='torso_vertical')
Array(shape=(3,), dtype=dtype('float64'), name='com_velocity')
Array(shape=(27,), dtype=dtype('float64'), name='velocity')


In [14]:
# Length of the flattened observation vector: the sum of the component sizes
# listed above (the scalar head_height counts as 1).
21+1+12+3+3+27

67

In [33]:
# Start a fresh episode and summarise what the first observation contains.
time_step = env.reset()
first_observation = time_step.observation
print("Observation space:")
for key in first_observation:
    value = first_observation[key]
    preview = value.ravel()[:5]  # only the first few entries of each component
    print(f"{key}: shape={value.shape}, sample={preview}...")

# Summarise the action spec: shape plus per-actuator lower/upper bounds.
action_spec = env.action_spec()
print("\n Action Space:")
print(
    f"Shape: {action_spec.shape}, "
    f"Minimum: {action_spec.minimum}, "
    f"Maximum: {action_spec.maximum}"
)


Observation space:
joint_angles: shape=(21,), sample=[ 0.37785738 -0.80576928  0.11845993 -0.42711742 -0.52448537]...
head_height: shape=(), sample=[1.45681715]...
extremities: shape=(12,), sample=[0.09050242 0.61727894 0.46901407 0.36942174 0.35887969]...
torso_vertical: shape=(3,), sample=[-0.27466003  0.93429466 -0.22727816]...
com_velocity: shape=(3,), sample=[0. 0. 0.]...
velocity: shape=(27,), sample=[0. 0. 0. 0. 0.]...

 Action Space:
Shape: (21,), Minimum: [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1.], Maximum: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [23]:
# Candidate observation space: an unbounded float64 vector of length 67
# (the flattened observation size computed above).
gym.spaces.Box(low=-np.inf, high=np.inf, shape=(67,), dtype=np.float64 )

Box(-inf, inf, (67,), float64)

In [24]:
# Candidate action space: 21 controls, each bounded to [-1, 1], matching the
# shape and min/max reported by the action spec above.
gym.spaces.Box(low=-1.0, high=1.0, shape=(21,), dtype=np.float64)

Box(-1.0, 1.0, (21,), float64)

In [22]:
# !pip install -q gymnasium

In [6]:
from stable_baselines3.common.env_checker import check_env

In [43]:
# TODO: Implement a wrapper that exposes reset(), step(), observation_space, action_space
# You may want to flatten the observation dict into a single np.ndarray

import gymnasium as gym
import numpy as np

class DMCWrapper(gym.Env):
    """Gymnasium adapter around a dm_control suite task.

    The dict observation returned by dm_control is flattened into a single
    1-D float64 vector, and the dm_control ``TimeStep`` is translated into
    Gymnasium's ``(obs, reward, terminated, truncated, info)`` tuple.

    Args:
        domain_name: dm_control suite domain to load (default ``"humanoid"``).
        task_name: task within that domain (default ``"stand"``).

    With the defaults the observation space is ``Box(-inf, inf, (67,), float64)``
    and the action space is ``Box(-1, 1, (21,), float32)``.
    """

    def __init__(self, domain_name="humanoid", task_name="stand"):
        super().__init__()
        self.env = suite.load(domain_name=domain_name, task_name=task_name)

        # Derive the flattened observation length from the spec instead of
        # hard-coding it; a scalar entry (shape ``()``) contributes 1.
        obs_spec = self.env.observation_spec()
        obs_dim = sum(int(np.prod(spec.shape, dtype=int)) for spec in obs_spec.values())
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float64
        )

        # Take the action bounds and shape from the task's own action spec.
        act_spec = self.env.action_spec()
        self._action_dtype = act_spec.dtype
        self.action_space = gym.spaces.Box(
            low=np.asarray(act_spec.minimum, dtype=np.float32),
            high=np.asarray(act_spec.maximum, dtype=np.float32),
            shape=act_spec.shape,
            dtype=np.float32,
        )

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: optional seed. Seeds this env's Gymnasium RNG and, when
                available, the dm_control task's own random state, which is
                what the task uses when initialising an episode.
            options: accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(observation, info)`` with a flattened observation and an empty
            info dict.
        """
        super().reset(seed=seed)
        if seed is not None:
            # Seeding global NumPy does not reach dm_control: suite tasks hold
            # their own RandomState, exposed as `task.random`.
            task_rng = getattr(self.env.task, "random", None)
            if task_rng is not None:
                task_rng.seed(seed)
        ts = self.env.reset()
        return self._flatten_obs(ts.observation), {}

    def step(self, action):
        """Advance the simulation by one control step.

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``reward`` is a
            Python float (0.0 when dm_control reports no reward).
        """
        ts = self.env.step(np.asarray(action, dtype=self._action_dtype))
        obs = self._flatten_obs(ts.observation)
        reward = float(ts.reward) if ts.reward is not None else 0.0

        # dm_control flags the final step with last(). A discount of 0 on that
        # step means the task itself terminated; any other discount means the
        # episode merely hit its step limit, which Gymnasium calls truncation.
        # Keeping the two apart lets the learner bootstrap correctly at
        # time-limit boundaries.
        episode_over = bool(ts.last())
        terminated = bool(episode_over and ts.discount == 0.0)
        truncated = episode_over and not terminated

        return obs, reward, terminated, truncated, {}

    def _flatten_obs(self, obs):
        """Concatenate all observation entries into one 1-D float64 array."""
        # np.asarray lets plain Python scalars flatten as well as ndarrays.
        return np.concatenate(
            [np.asarray(v, dtype=np.float64).ravel() for v in obs.values()]
        )
# Instantiate the wrapper and validate it with stable-baselines3's
# environment checker.
env = DMCWrapper()
check_env(env)

### Test functions for the environment

In [57]:
from stable_baselines3.common.env_checker import check_env

def test_env_initialization(env):
  """Check that `env` is a well-formed humanoid-stand Gymnasium environment.

  Runs stable-baselines3's `check_env`, then verifies that the observation
  and action spaces are `Box` spaces of shape (67,) and (21,), and that
  `reset()` returns a NumPy observation of the declared shape.

  Raises:
    AssertionError: if any check fails, including a failure reported by
      `check_env` (its message is printed first).
  """
  # Imported locally so the function is self-contained: the cell that defines
  # it only imports `check_env`.
  from gymnasium.spaces import Box

  checker_error = None
  try:
    check_env(env)
    print("env_checker's Check env passed...")
  except Exception as e:
    # Remember the failure but keep going, so the more specific assertions
    # below can still pinpoint what is wrong.
    checker_error = e
    print("env_checker's Check env failed!!")
    print(e)

  # These checks must run unconditionally, not only when check_env failed.
  assert isinstance(env.observation_space, Box), "observation_space is not a gym.spaces.Box"
  assert env.observation_space.shape == (67,), "Unexpected shape for observation_space"

  assert isinstance(env.action_space, Box), "action_space is not a gym.spaces.Box"
  assert env.action_space.shape == (21,), "Unexpected shape for action_space"

  obs, info = env.reset()
  assert isinstance(obs, np.ndarray), "reset() did not return a numpy array"
  assert obs.shape == env.observation_space.shape, "Mismatch in observation shape"
  # assert obs.dtype == np.float32, "Observation dtype should be float32"

  assert checker_error is None, f"check_env reported a problem: {checker_error}"

  print("All test cases passed!!!\n")

def test_env_step(env):

  obs, info = env.reset()
  action = env.action_space.sample()
  try:
    next_obs, reward, terminated, truncated, info = env.step(action)
  except Exception as e:
    print("env.step failed!!\n")
    print(e)

  assert isinstance(next_obs, np.ndarray), "Returned obs is not a numpy array"
  assert next_obs.shape == env.observation_space.shape, "Observation shape mismatch"
  assert isinstance(reward, float), "Reward should be a float"
  assert isinstance(terminated, bool), "Terminated flag should be boolean"
  assert isinstance(truncated, bool), "Truncated flag should be boolean"
  assert isinstance(info, dict), "Info should be a dict"

  print(f"Action shape: {action.shape}")
  print(f"Observation shape: {next_obs.shape}")
  print(f"Reward: {reward}")


In [58]:
# Build a fresh wrapper and run both environment checks against it.
env = DMCWrapper()
test_env_initialization(env)
test_env_step(env)

env_checker's Check env passed...
Action shape: (21,)
Observation shape: (67,)
Reward: 0.5444483962246125


In [20]:
# Import PPO in this cell so it runs top-to-bottom on a fresh kernel; the only
# other import of PPO in the notebook is in a later cell.
from stable_baselines3 import PPO

env = DMCWrapper()

# Short PPO run with the default MLP policy; verbose=1 prints progress tables.
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 2.97     |
| time/              |          |
|    fps             | 540      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | 4.83       |
| time/                   |            |
|    fps                  | 484        |
|    iterations           | 2          |
|    time_elapsed         | 8          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.02952134 |
|    clip_fraction        | 0.295      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.8

KeyboardInterrupt: 

## 🤖 Train with PPO (optional starter agent in src/agents/ppo_agent.py)

In [None]:
# TODO: Import your wrapped env and train with PPO
# from stable_baselines3 import PPO
# model = PPO('MlpPolicy', your_wrapped_env, verbose=1)
# model.learn(total_timesteps=...)

## ✅ Evaluate your trained policy

In [None]:
# TODO: Load model, run inference loop, visualize behavior (if desired)
# obs, info = env.reset()
# for _ in range(1000):
#     action, _ = model.predict(obs)
#     obs, reward, terminated, truncated, info = env.step(action)
#     if terminated or truncated:
#         break

## 🧪 PPO Training on DummyEnv
Here's a working example of PPO training on a simple dummy environment.

In [16]:

import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

class DummyEnv(gym.Env):
    """Minimal random environment used to smoke-test the PPO training loop.

    Observations are random samples from a 10-dimensional ``Box(-1, 1)``,
    actions are 4-dimensional and ignored, rewards are uniform in [0, 1) and
    each step ends the episode with probability of about 5%.
    """

    def __init__(self):
        super().__init__()
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(10,), dtype=np.float32)
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(4,), dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: optional seed; makes the subsequent observations, rewards
                and terminations reproducible.
            options: accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(observation, info)`` with an empty info dict.
        """
        # Seeds self.np_random, which drives rewards and terminations.
        super().reset(seed=seed)
        if seed is not None:
            # Observations are drawn from the space's own RNG, which seeding
            # global NumPy never touches, so seed the space explicitly.
            self.observation_space.seed(seed)
        return self.observation_space.sample(), {}

    def step(self, action):
        """Return a random transition; `action` is ignored."""
        obs = self.observation_space.sample()
        # Cast to built-in float/bool so callers get plain Python types
        # rather than NumPy scalars.
        reward = float(self.np_random.random())
        terminated = bool(self.np_random.random() > 0.95)
        truncated = False
        info = {}
        return obs, reward, terminated, truncated, info

# Validate the dummy env with stable-baselines3's environment checker before
# training on it.
env = DummyEnv()
check_env(env)

# Short PPO run with the default MLP policy; verbose=1 prints progress tables.
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)
print("✅ PPO training on DummyEnv completed!")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 18.2     |
|    ep_rew_mean     | 9.1      |
| time/              |          |
|    fps             | 1179     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 19          |
|    ep_rew_mean          | 9.63        |
| time/                   |             |
|    fps                  | 855         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007915258 |
|    clip_fraction        | 0.054       |
|    clip_range           | 0.2         |
|    entropy_loss   

KeyboardInterrupt: 