# Just Stand Up
Train a humanoid to stand up using reinforcement learning.

This notebook sets up the environment and tools, but **you'll need to implement the key logic**.

## 📦 Install dependencies

In [None]:
!pip install dm_control
!pip install stable-baselines3
!pip install gym

## 🧠 Load the Humanoid environment (from dm_control)

In [None]:
from dm_control import suite

# Load the humanoid "stand" task from the dm_control suite
# (positional arguments are domain name, then task name).
env = suite.load("humanoid", "stand")

# Inspect what the environment reports as its observations
obs_spec = env.observation_spec()
print(f"Observation spec: {obs_spec}")

# TODO: Wrap this env to make it compatible with stable-baselines3


## ⚙️ Define your Gym wrapper
_You'll need to wrap the `dm_control` environment in a Gym-compatible interface._

In [None]:
# TODO: Implement a wrapper that exposes reset(), step(), observation_space, action_space
# You may want to flatten the observation dict into a single np.ndarray

import gym
import numpy as np

class DMCWrapper(gym.Env):
    """Gym-style adapter around the dm_control humanoid "stand" task.

    The wrapped environment reports observations as a mapping of named
    values; this wrapper concatenates them into a single flat 1-D array so
    they can be consumed as one vector.

    ``observation_space`` and ``action_space`` are left as placeholders for
    you to implement.
    """

    def __init__(self):
        # Initialise the gym.Env base class before adding our own state.
        super().__init__()
        self.env = suite.load(domain_name="humanoid", task_name="stand")
        self.observation_space = ...  # TODO
        self.action_space = ...       # TODO

    def reset(self):
        """Start a new episode and return its first observation, flattened."""
        ts = self.env.reset()
        return self._flatten_obs(ts.observation)

    def step(self, action):
        """Apply ``action`` to the wrapped environment for one step.

        Returns:
            A ``(obs, reward, done, info)`` tuple: the flattened observation,
            the reward as a Python float, whether this was the last step of
            the episode, and an empty info dict.
        """
        ts = self.env.step(action)
        obs = self._flatten_obs(ts.observation)
        # A time step may carry no reward (None); report that as 0.0 and
        # otherwise hand back a plain Python float.
        reward = 0.0 if ts.reward is None else float(ts.reward)
        done = ts.last()
        return obs, reward, done, {}

    def _flatten_obs(self, obs):
        """Concatenate every value of the observation mapping into one 1-D array.

        Each value goes through ``np.asarray`` first so that scalar entries
        (plain floats/ints without a ``ravel`` method) are flattened too.
        The result keeps NumPy's promoted dtype of the inputs.
        NOTE(review): cast here if the dtype you choose for
        ``observation_space`` differs from what dm_control returns.
        """
        return np.concatenate([np.asarray(v).ravel() for v in obs.values()])


## 🤖 Train with PPO (optional starter agent in src/agents/ppo_agent.py)

In [None]:
# TODO: Import your wrapped env and train with PPO
# from stable_baselines3 import PPO
# model = PPO('MlpPolicy', your_wrapped_env, verbose=1)
# model.learn(total_timesteps=...)

## ✅ Evaluate your trained policy

In [None]:
# TODO: Load model, run inference loop, visualize behavior (if desired)
# obs = your_wrapped_env.reset()
# for _ in range(1000):
#     action, _ = model.predict(obs)
#     obs, reward, done, info = your_wrapped_env.step(action)
#     if done:
#         break

## 🧪 PPO Training on DummyEnv
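Here's a working example of PPO training on a simple dummy environment. The environment returns random observations and rewards, so this only checks that the training pipeline runs end to end.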

In [None]:

import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

class DummyEnv(gym.Env):
    """Toy environment that produces random observations, rewards and episode ends.

    Observations are 10-dimensional and actions are 4-dimensional; both are
    float32 boxes bounded to [-1, 1]. The action passed to ``step`` is ignored.
    """

    def __init__(self):
        super().__init__()
        self.observation_space = gym.spaces.Box(
            low=-1, high=1, shape=(10,), dtype=np.float32
        )
        self.action_space = gym.spaces.Box(
            low=-1, high=1, shape=(4,), dtype=np.float32
        )

    def reset(self):
        """Return a randomly sampled observation as the start of an episode."""
        initial_obs = self.observation_space.sample()
        return initial_obs

    def step(self, action):
        """Return a random transition; ``action`` has no effect.

        Returns:
            ``(obs, reward, done, info)`` where ``obs`` is sampled from the
            observation space, ``reward`` is a uniform draw from [0, 1),
            ``done`` is True when a second uniform draw exceeds 0.95, and
            ``info`` is an empty dict.
        """
        next_obs = self.observation_space.sample()
        # Keep the reward draw before the termination draw so the sequence of
        # values taken from NumPy's global RNG is unchanged.
        reward = np.random.rand()
        episode_over = np.random.rand() > 0.95
        return next_obs, reward, episode_over, {}

# Build the dummy environment and run it through the stable-baselines3
# environment checker before training.
# NOTE: this rebinds `env`, which earlier in the notebook referred to the
# dm_control environment.
env = DummyEnv()
check_env(env)

# Train a PPO agent with the "MlpPolicy" policy for 10,000 timesteps.
model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=10_000)
print("✅ PPO training on DummyEnv completed!")
