In [14]:
from lib.utils import * 
import gymnasium as gym
from pogema import GridConfig

from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

# --- Identifiers and file locations ------------------------------------------
ENV_NAME = 'ENV_C'
MODEL_NAME = 'PPO_C'
env_path = 'saved/diff_env.yml'
SAVE_PARAMS_PATH = 'saved/tuned_params.yml'
SAVE_METRICS_PATH = 'saved/evaluation_metrics.yml'

# --- Evaluation budget -------------------------------------------------------
MAX_TRIALS = 1000  # For evaluation_metrics()

# --- Environment parameters --------------------------------------------------
# Load env params for ENV_NAME from the YAML file at env_path.
ENV_PARAMS = get_model_log(ENV_NAME, env_path)
MAX_EPISODE_STEPS = ENV_PARAMS['MAX_EPISODE_STEPS']

# Single-agent Pogema grid built from the loaded parameters.
grid_config = GridConfig(
    size=ENV_PARAMS['GRID_SIZE'],          # size of the grid map, e.g. 8 = (8x8)
    density=ENV_PARAMS['DENSITY'],         # obstacle density
    num_agents=1,                          # number of agents
    obs_radius=ENV_PARAMS['OBS_RADIUS'],   # defines field of view
    max_episode_steps=MAX_EPISODE_STEPS,   # time horizon
    seed=None,                             # None => random obstacles, agents and targets positions at each reset
)

env = gym.make("Pogema-v0", grid_config=grid_config)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Optuna Integration

In [15]:
""" Optuna example that optimizes the hyperparameters of
a reinforcement learning agent using the PPO implementation from Stable-Baselines3
on a Gymnasium environment (here: the Pogema environment configured in the previous cell).

This is a simplified version of what can be found in https://github.com/DLR-RM/rl-baselines3-zoo.

The stand-alone script form of this example is run as follows:
    $ python sb3_simple.py
In this notebook, simply execute the cell instead.

"""
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


# --- Optuna study budget -----------------------------------------------------
N_TRIALS = 100          # maximum number of trials handed to study.optimize()
N_STARTUP_TRIALS = 5    # n_startup_trials for both the TPE sampler and the median pruner

# --- Per-trial training / evaluation budget ----------------------------------
N_TIMESTEPS = 120_000                       # environment steps each trial trains for
N_EVALUATIONS = 2                           # evaluations (and pruning checks) per trial
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS    # callback steps between two evaluations
N_EVAL_EPISODES = 3                         # episodes averaged in each evaluation

# Constructor arguments shared by every trial; the sampled hyperparameters
# are merged into a copy of this dict inside objective().
DEFAULT_HYPERPARAMS = dict(
    policy="MlpPolicy",
    verbose=1,
    env=env,
)

def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Draw one set of PPO hyperparameters from the trial.

    Two values are sampled, both on a log scale:

    * ``"gamma"`` -- the trial parameter is ``1 - gamma`` in ``[1e-4, 1e-1]``;
      the actual discount factor is stored as the user attribute ``"gamma_"``.
    * ``"lr"`` -- the learning rate in ``[1e-5, 1]``.

    Returns:
        A dict with the keys ``"gamma"`` (the true discount factor) and
        ``"learning_rate"``, ready to be merged into the PPO constructor kwargs.
    """
    # The search variable is the distance of gamma from 1, so that the log
    # scale resolves values close to 1.
    one_minus_gamma = trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    discount = 1.0 - one_minus_gamma

    step_size = trial.suggest_float("lr", 1e-5, 1, log=True)

    # trial.params only holds 1 - gamma; expose the real value as well.
    trial.set_user_attr("gamma_", discount)

    # Other PPO settings (n_steps, gae_lambda, ent_coef, max_grad_norm,
    # policy_kwargs) are not tuned here and are therefore not returned.
    return dict(gamma=discount, learning_rate=step_size)


class TrialEvalCallback(EvalCallback):
    """EvalCallback that reports each evaluation to an Optuna trial.

    After every evaluation the mean reward is reported to the trial; if the
    trial's pruner asks to stop, ``is_pruned`` is set and training is aborted
    by returning ``False`` from ``_on_step``.
    """

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        """Forward the evaluation settings to EvalCallback and keep the trial."""
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.is_pruned = False  # set to True once the pruner stops this trial
        self.eval_idx = 0       # number of evaluations reported so far
        self.trial = trial

    def _on_step(self) -> bool:
        """Evaluate every ``eval_freq`` calls and report the result to Optuna.

        Returns:
            ``False`` when the trial should be pruned, ``True`` otherwise.
        """
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # Run the regular evaluation; it refreshes self.last_mean_reward.
        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True

        # Remember the decision so objective() can raise TrialPruned.
        self.is_pruned = True
        return False


def objective(trial: optuna.Trial) -> float:
    """Train one PPO model with sampled hyperparameters and score it.

    Each trial gets its own training environment and its own evaluation
    environment. Sharing the single global ``env`` between the learner and the
    evaluation callback would let every evaluation reset/step the environment
    in the middle of a training rollout, and closing it at the end of a trial
    would close the environment that the following trials still need.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate evaluation results.

    Returns:
        The mean reward of the last evaluation, or NaN if training failed
        with an ``AssertionError`` or ``ValueError``.

    Raises:
        optuna.exceptions.TrialPruned: if the pruner stopped the trial.
    """
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_ppo_params(trial))
    # Fresh training env for this trial (overrides the shared global env that
    # DEFAULT_HYPERPARAMS may carry).
    kwargs["env"] = gymnasium.make("Pogema-v0", grid_config=grid_config)
    # Create the RL model.
    model = PPO(**kwargs)
    # Separate env used only for evaluation, so evaluation episodes never
    # disturb the state of the training env.
    eval_env = Monitor(gymnasium.make("Pogema-v0", grid_config=grid_config))
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except (AssertionError, ValueError) as e:
        # Sometimes, random hyperparams can generate NaN. ValueError is caught
        # as well on the expectation that NaN distribution parameters surface
        # that way; the message is printed so other causes stay visible.
        print(e)
        nan_encountered = True
    finally:
        # Free memory; both envs belong to this trial only.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(
        n_startup_trials=N_STARTUP_TRIALS,
        n_warmup_steps=N_EVALUATIONS // 3,
    )
    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)

    # Stop after N_TRIALS trials or 600 seconds, whichever comes first; a manual
    # interrupt ends the search early but still prints the summary below.
    try:
        study.optimize(objective, n_trials=N_TRIALS, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    # `trial` stays a notebook-level name: the cell that saves the tuned
    # hyperparameters reads it.
    trial = study.best_trial
    print("Best trial:")
    print("  Value: ", trial.value)

    # Sampled parameters first, then the user attributes (true gamma value).
    for heading, mapping in (("  Params: ", trial.params), ("  User attrs:", trial.user_attrs)):
        print(heading)
        for key, value in mapping.items():
            print("    {}: {}".format(key, value))


[I 2023-11-10 17:44:00,700] A new study created in memory with name: no-name-254991dc-da23-41df-9280-e135f97e10ef


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 84.4     |
|    ep_rew_mean     | 0.458    |
| time/              |          |
|    fps             | 2102     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 86.9       |
|    ep_rew_mean          | 0.426      |
| time/                   |            |
|    fps                  | 1462       |
|    iterations           | 2          |
|    time_elapsed         | 2          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.07482131 |
|    clip_fraction        | 0.556      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.55

[I 2023-11-10 17:45:44,918] Trial 0 finished with value: 0.0 and parameters: {'gamma': 0.04667882699473575, 'lr': 0.015052893416740975}. Best is trial 0 with value: 0.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 92.3     |
|    ep_rew_mean     | 0.381    |
| time/              |          |
|    fps             | 2416     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 96.6        |
|    ep_rew_mean          | 0.357       |
| time/                   |             |
|    fps                  | 1546        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.032773793 |
|    clip_fraction        | 0.349       |
|    clip_range           | 0.2         |
|    entropy_loss   

[I 2023-11-10 17:47:37,809] Trial 1 finished with value: 1.0 and parameters: {'gamma': 0.007911248936180934, 'lr': 0.0042933221799210716}. Best is trial 1 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 85.1     |
|    ep_rew_mean     | 0.458    |
| time/              |          |
|    fps             | 2322     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 90.4          |
|    ep_rew_mean          | 0.4           |
| time/                   |               |
|    fps                  | 1483          |
|    iterations           | 2             |
|    time_elapsed         | 2             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00060448237 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2       

[I 2023-11-10 17:49:35,778] Trial 2 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.00016395747981666782, 'lr': 3.387887233509309e-05}. Best is trial 1 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 102      |
|    ep_rew_mean     | 0.3      |
| time/              |          |
|    fps             | 2035     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 113      |
|    ep_rew_mean          | 0.167    |
| time/                   |          |
|    fps                  | 1375     |
|    iterations           | 2        |
|    time_elapsed         | 2        |
|    total_timesteps      | 4096     |
| train/                  |          |
|    approx_kl            | 75.79234 |
|    clip_fraction        | 0.997    |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.00706 |
|    explained_varia

[I 2023-11-10 17:51:22,157] Trial 3 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.029898805238107323, 'lr': 0.4642581798374178}. Best is trial 1 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 85.2     |
|    ep_rew_mean     | 0.478    |
| time/              |          |
|    fps             | 3156     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 100       |
|    ep_rew_mean          | 0.3       |
| time/                   |           |
|    fps                  | 2084      |
|    iterations           | 2         |
|    time_elapsed         | 1         |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 27.669518 |
|    clip_fraction        | 0.996     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.0114   |
|    e

[I 2023-11-10 17:52:45,353] Trial 4 finished with value: 0.0 and parameters: {'gamma': 0.002681984863029761, 'lr': 0.1166202698487769}. Best is trial 1 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 81       |
|    ep_rew_mean     | 0.52     |
| time/              |          |
|    fps             | 3135     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 91          |
|    ep_rew_mean          | 0.422       |
| time/                   |             |
|    fps                  | 2073        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.016933601 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.2         |
|    entropy_loss   

[I 2023-11-10 17:54:08,501] Trial 5 finished with value: 1.0 and parameters: {'gamma': 0.004307843034301622, 'lr': 0.0007287922498519077}. Best is trial 1 with value: 1.0.


Number of finished trials:  6
Best trial:
  Value:  1.0
  Params: 
    gamma: 0.007911248936180934
    lr: 0.0042933221799210716
  User attrs:
    gamma_: 0.9920887510638191


### Save tuned hyperparameters

In [13]:
from lib.utils import *

# Save the best trial together with the static PPO settings.
# The env object must not be written as a value into the YAML file, so it is
# filtered out of a *copy*: DEFAULT_HYPERPARAMS itself stays intact, which keeps
# this cell idempotent and lets the tuning cell be re-run afterwards.
params_to_save = {key: value for key, value in DEFAULT_HYPERPARAMS.items() if key != 'env'}

# NOTE(review): trial.params stores the sampled `1 - gamma` under the key "gamma";
# the true discount factor is in trial.user_attrs["gamma_"]. Confirm that
# save_model_params writes the intended value.
save_model_params(trial, MODEL_NAME, SAVE_PARAMS_PATH, params_to_save)