In [1]:
#@title Packages installation
!pip install -q gymnasium
!pip install -q stable_baselines3
!pip install -q gymnasium[mujoco]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.7/181.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h

# **Dataset**

This section collects demonstration data using an expert policy built with Stable-Baselines3. Here the expert is trained with the PPO algorithm.

In [10]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

import torch

import numpy as np
from tqdm import tqdm

In [12]:
#@title Create the environment

env_id = "Humanoid-v4"

# Number of environment steps used to train the expert. Kept as an integer
# because `learn` documents `total_timesteps` as an int (the float literal
# 3e4 was passed previously).
expert_train_timesteps = 30_000

# Environment used below for evaluating the expert.
env = gym.make(env_id, render_mode="rgb_array")

# Instantiate the agent. Because the id string (not `env`) is passed, the
# library creates its own training environment from the name and wraps it
# (see the "Creating environment from the given name" log lines).
expert_model = PPO("MlpPolicy", env_id, verbose=1)

# Train the agent
expert_model.learn(total_timesteps=expert_train_timesteps)

# Evaluate the expert over 10 episodes on the Monitor-wrapped `env`
mean_reward, std_reward = evaluate_policy(expert_model, Monitor(env), n_eval_episodes=10)
print(f"Mean reward expert agent= {mean_reward} +/- {std_reward}")

Using cpu device
Creating environment from the given name 'Humanoid-v4'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.5     |
|    ep_rew_mean     | 110      |
| time/              |          |
|    fps             | 610      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.1        |
|    ep_rew_mean          | 108         |
| time/                   |             |
|    fps                  | 431         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.020223018 |
|    clip_fraction        | 0.224       |
|    cl

In [13]:
#@title create expert dataset

# Number of (observation, action) pairs to record from the expert.
num_interactions = 40_000

# Pre-allocate one row per interaction; the trailing dimensions follow the
# shapes of the environment's observation and action spaces.
expert_observations = np.empty((num_interactions, *env.observation_space.shape))
expert_actions = np.empty((num_interactions, *env.action_space.shape))

print(expert_observations.shape)
print(expert_actions.shape)

# Collect experience using the expert policy: store each observation together
# with the deterministic action the expert picks for it, then step the env.
obs, _ = env.reset()
for i in tqdm(range(num_interactions)):
    action, _ = expert_model.predict(obs, deterministic=True)
    expert_observations[i], expert_actions[i] = obs, action
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    if done:
        # Episode finished: reset so that collection continues from a new episode.
        obs, _ = env.reset()

(40000, 376)
(40000, 17)


100%|██████████| 40000/40000 [01:14<00:00, 533.56it/s]


In [14]:
#@title save dataset
# Write both arrays into one compressed archive; they are stored under the
# keyword names given here ("expert_actions" and "expert_observations").
np.savez_compressed(
    "expert-data",
    expert_actions=expert_actions,
    expert_observations=expert_observations,
)

In [6]:
#@title dataset class
from torch.utils.data.dataset import Dataset, random_split

class ExpertDataSet(Dataset):
    """Map-style dataset pairing expert observations with expert actions.

    Item ``i`` is the tuple ``(expert_observations[i], expert_actions[i])``.
    The inputs are stored as given (no copy or type conversion is performed).
    """

    def __init__(self, expert_observations, expert_actions):
        """Store the two aligned sequences.

        Args:
            expert_observations: Indexable, sized collection of observations.
            expert_actions: Indexable, sized collection of actions, one per
                observation.

        Raises:
            ValueError: If the two collections differ in length, since the
                pairs would then be misaligned or indexing would fail later.
        """
        if len(expert_observations) != len(expert_actions):
            raise ValueError(
                "expert_observations and expert_actions must have the same length, "
                f"got {len(expert_observations)} and {len(expert_actions)}"
            )
        self.observations = expert_observations
        self.actions = expert_actions

    def __getitem__(self, index):
        """Return the ``(observation, action)`` pair stored at ``index``."""
        return (self.observations[index], self.actions[index])

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.observations)

In [8]:

expert_dataset = ExpertDataSet(expert_observations, expert_actions)

# Split into 80% training and 20% test
batch_size = 64
train_prop = 0.8
train_size = int(train_prop * len(expert_dataset))
test_size = len(expert_dataset) - train_size

# Seed the split so the same samples land in train/test on every run;
# without a generator the partition changes each time the cell is executed.
split_seed = 0
train_expert_dataset, test_expert_dataset = random_split(
    expert_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Shuffle only the training batches; the test loader keeps a fixed order so
# evaluation passes are deterministic.
train_loader = torch.utils.data.DataLoader(dataset=train_expert_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_expert_dataset, batch_size=batch_size, shuffle=False)

  and should_run_async(code)
