In [1]:
import gymnasium as gym
from pogema import GridConfig

from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Global parameters for this notebook.

# -- Grid-world layout (fed into GridConfig below) --
GRID_SIZE = 8             # side length of the grid map: 8 means an 8x8 map
DENSITY = 0.3             # obstacle density
OBS_RADIUS = 5            # observation radius (defines the field of view)
MAX_EPISODE_STEPS = 64    # time horizon of one episode

# -- Model tag and output files --
MODEL_NAME = 'DQN_A'
SAVE_PARAMS_PATH = 'saved/tuned_params.yml'
SAVE_METRICS_PATH = 'saved/evaluation_metrics.yml'

# Single-agent Pogema grid built from the global parameters above.
# seed=None requests random obstacle, agent and target positions at each reset.
grid_config = GridConfig(
    num_agents=1,
    size=GRID_SIZE,
    density=DENSITY,
    obs_radius=OBS_RADIUS,
    max_episode_steps=MAX_EPISODE_STEPS,
    seed=None,
)

# Notebook-level environment instance created from that configuration.
env = gym.make("Pogema-v0", grid_config=grid_config)

  logger.warn(
  logger.warn(


### Optuna Integration

In [2]:
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


# -- Optuna study budget --
N_TRIALS = 100          # maximum number of trials requested from the study
N_STARTUP_TRIALS = 5    # n_startup_trials for both the sampler and the pruner

# -- Per-trial training / evaluation budget --
N_TIMESTEPS = 120_000   # environment steps each trial trains for
N_EVALUATIONS = 2       # evaluations spread evenly over one training run
N_EVAL_EPISODES = 3     # episodes averaged per evaluation
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS  # steps between two evaluations

# Fixed DQN constructor arguments shared by every trial; `objective` copies
# this dict and merges the sampled hyperparameters on top of the copy.
DEFAULT_HYPERPARAMS = dict(
    policy="MlpPolicy",
    verbose=1,
    env=env,
)

def sample_dqn_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Draw one DQN hyperparameter configuration from ``trial``.

    Only ``learning_rate`` and ``gamma`` are searched at the moment.

    Note that the Optuna parameter registered under the name ``"gamma"`` is
    the *complement* of the discount factor (``1 - gamma``), sampled
    log-uniformly in ``[1e-4, 0.1]``. The actual discount factor is stored as
    the trial user attribute ``"gamma"`` and is what this function returns.

    Args:
        trial: Object providing ``suggest_float`` and ``set_user_attr``.

    Returns:
        Dict with the keys ``"learning_rate"`` and ``"gamma"``.
    """
    sampled: Dict[str, Any] = {}

    sampled["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 1, log=True)

    # Search 1 - gamma on a log scale, then convert back to gamma.
    one_minus_gamma = trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    sampled["gamma"] = 1.0 - one_minus_gamma

    # Expose the true discount factor next to the raw sampled value.
    trial.set_user_attr("gamma", sampled["gamma"])

    # Further candidates, currently not searched:
    #   batch_size             = 2 ** suggest_int(3, 10)
    #   max_grad_norm          = suggest_float(0.3, 10.0, log=True)
    #   target_update_interval = suggest_int(500, 20000, log=True)
    #   exploration_fraction   = suggest_float(0.1, 0.5, log=True)
    #   exploration_final_eps  = suggest_float(0.01, 0.1, log=True)
    return sampled


class TrialEvalCallback(EvalCallback):
    """EvalCallback that reports every evaluation result to an Optuna trial.

    After each periodic evaluation the mean reward is reported to the trial
    as an intermediate value; if the trial's pruner asks to stop, training is
    aborted and ``is_pruned`` is set so the caller can tell why.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        """Store the trial and forward all evaluation settings to EvalCallback."""
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        # Trial that receives the intermediate values.
        self.trial = trial
        # Number of evaluations reported so far (used as the report step).
        self.eval_idx = 0
        # True once the pruner has asked to stop this trial.
        self.is_pruned = False

    def _on_step(self) -> bool:
        """Evaluate on schedule, report to the trial, and stop if pruned.

        Returns:
            False when the trial should be pruned (stops training), else True.
        """
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # Parent runs the evaluation; its own return value is not used here.
        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True

        self.is_pruned = True
        return False


def objective(trial: optuna.Trial) -> float:
    """Train one DQN with sampled hyperparameters and return its eval score.

    Each trial builds its own training and evaluation environments from the
    notebook-level ``grid_config``. Sharing one environment instance between
    the learner and the evaluation callback would let every evaluation reset
    and step the environment the learner is in the middle of an episode with,
    and closing it at the end of a trial would hand a closed environment to
    the next trial.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate evaluation results.

    Returns:
        The mean reward of the last evaluation, or ``nan`` when training
        aborted with an ``AssertionError`` (which marks the trial as failed).

    Raises:
        optuna.exceptions.TrialPruned: If the pruner stopped the trial early.
    """
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_dqn_params(trial))
    # Dedicated training env for this trial (overrides any shared default).
    kwargs["env"] = gymnasium.make("Pogema-v0", grid_config=grid_config)
    # Create the RL model.
    model = DQN(**kwargs)
    # Separate env used only for evaluation, so evaluation episodes never
    # disturb the state of the training env.
    eval_env = Monitor(gymnasium.make("Pogema-v0", grid_config=grid_config))
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory; both envs belong to this trial only, so closing them
        # cannot affect other trials or later notebook cells.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Restrict PyTorch to one CPU thread for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Warm-up is a third of the evaluation count, rounded down. With the
    # current N_EVALUATIONS = 2 this is 0, so pruning may start at the first
    # evaluation once the startup trials are done.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)
    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

    try:
        # The search ends at N_TRIALS trials or after the 600 s timeout,
        # whichever is reached first.
        study.optimize(objective, n_trials=N_TRIALS, timeout=600)
    except KeyboardInterrupt:
        # A manual interrupt simply ends the search; finished trials are kept.
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    # `trial` stays a notebook-level name; the save cell below reads it.
    trial = study.best_trial

    print("  Value: ", trial.value)

    # Sampled parameters first, then the user attributes.
    for heading, entries in (("  Params: ", trial.params), ("  User attrs:", trial.user_attrs)):
        print(heading)
        for key, value in entries.items():
            print("    {}: {}".format(key, value))

  from .autonotebook import tqdm as notebook_tqdm
[I 2023-11-05 17:10:29,217] A new study created in memory with name: no-name-005ed701-f804-4363-970b-e32ffb1c7ed4


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 11271    |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 56.4     |
|    ep_rew_mean      | 0.375    |
|    exploration_rate | 0.964    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 12204    |
|    time_elapsed     | 0        |
|    total_timesteps  | 451      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 55.2     |
|    ep_rew_mean      | 0.333 

[I 2023-11-05 17:11:36,956] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 0.6999132554610601, 'gamma': 0.044003047178782874}. Best is trial 0 with value: 0.0.


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 33.8     |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.989    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 11843    |
|    time_elapsed     | 0        |
|    total_timesteps  | 135      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 36       |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.977    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 12897    |
|    time_elapsed     | 0        |
|    total_timesteps  | 288      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 40.7     |
|    ep_rew_mean      | 0.5   



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 47.4     |
|    ep_rew_mean      | 0.344    |
|    exploration_rate | 0.76     |
| time/               |          |
|    episodes         | 64       |
|    fps              | 15663    |
|    time_elapsed     | 0        |
|    total_timesteps  | 3031     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 47.4     |
|    ep_rew_mean      | 0.338    |
|    exploration_rate | 0.745    |
| time/               |          |
|    episodes         | 68       |
|    fps              | 15551    |
|    time_elapsed     | 0        |
|    total_timesteps  | 3224     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.1     |
|    ep_rew_mean      | 0.333    |
|    exploration_rate | 0.726    |
| time/               |          |
|    episodes       

[I 2023-11-05 17:12:42,528] Trial 1 finished with value: 0.3333333333333333 and parameters: {'learning_rate': 7.721318851702573e-05, 'gamma': 0.00032855502965601274}. Best is trial 1 with value: 0.3333333333333333.


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51.5     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 13638    |
|    time_elapsed     | 0        |
|    total_timesteps  | 206      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 57.8     |
|    ep_rew_mean      | 0.125    |
|    exploration_rate | 0.963    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 14718    |
|    time_elapsed     | 0        |
|    total_timesteps  | 462      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 59.8     |
|    ep_rew_mean      | 0.0833



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.5     |
|    ep_rew_mean      | 0.328    |
|    exploration_rate | 0.749    |
| time/               |          |
|    episodes         | 64       |
|    fps              | 15439    |
|    time_elapsed     | 0        |
|    total_timesteps  | 3166     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.7     |
|    ep_rew_mean      | 0.338    |
|    exploration_rate | 0.738    |
| time/               |          |
|    episodes         | 68       |
|    fps              | 15393    |
|    time_elapsed     | 0        |
|    total_timesteps  | 3313     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.9     |
|    ep_rew_mean      | 0.333    |
|    exploration_rate | 0.721    |
| time/               |          |
|    episodes       

[I 2023-11-05 17:13:50,265] Trial 2 finished with value: 0.0 and parameters: {'learning_rate': 0.01770438698567265, 'gamma': 0.06731196970920077}. Best is trial 1 with value: 0.3333333333333333.


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.2     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 11799    |
|    time_elapsed     | 0        |
|    total_timesteps  | 197      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49       |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.969    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 12862    |
|    time_elapsed     | 0        |
|    total_timesteps  | 392      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 45.7     |
|    ep_rew_mean      | 0.333 



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.2     |
|    ep_rew_mean      | 0.365    |
|    exploration_rate | 0.801    |
| time/               |          |
|    episodes         | 52       |
|    fps              | 12541    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2509     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 47.4     |
|    ep_rew_mean      | 0.375    |
|    exploration_rate | 0.79     |
| time/               |          |
|    episodes         | 56       |
|    fps              | 12500    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2652     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 47.6     |
|    ep_rew_mean      | 0.367    |
|    exploration_rate | 0.774    |
| time/               |          |
|    episodes       

[I 2023-11-05 17:14:58,834] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 0.02870452021553718, 'gamma': 0.0008787051233413245}. Best is trial 1 with value: 0.3333333333333333.


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 60       |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.981    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 14694    |
|    time_elapsed     | 0        |
|    total_timesteps  | 240      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 62       |
|    ep_rew_mean      | 0.125    |
|    exploration_rate | 0.961    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 15969    |
|    time_elapsed     | 0        |
|    total_timesteps  | 496      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 61.8     |
|    ep_rew_mean      | 0.25  



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 53.4     |
|    ep_rew_mean      | 0.325    |
|    exploration_rate | 0.831    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 16404    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2134     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 53.6     |
|    ep_rew_mean      | 0.318    |
|    exploration_rate | 0.813    |
| time/               |          |
|    episodes         | 44       |
|    fps              | 16383    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2359     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 53.2     |
|    ep_rew_mean      | 0.312    |
|    exploration_rate | 0.798    |
| time/               |          |
|    episodes       

[I 2023-11-05 17:16:06,735] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 0.004062483112048131, 'gamma': 0.002016539555258088}. Best is trial 1 with value: 0.3333333333333333.


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 29.8     |
|    ep_rew_mean      | 0.75     |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8828     |
|    time_elapsed     | 0        |
|    total_timesteps  | 119      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 46.9     |
|    ep_rew_mean      | 0.375    |
|    exploration_rate | 0.97     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 12492    |
|    time_elapsed     | 0        |
|    total_timesteps  | 375      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 43.4     |
|    ep_rew_mean      | 0.417 



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.3     |
|    ep_rew_mean      | 0.357    |
|    exploration_rate | 0.777    |
| time/               |          |
|    episodes         | 56       |
|    fps              | 15288    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2819     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.6     |
|    ep_rew_mean      | 0.383    |
|    exploration_rate | 0.76     |
| time/               |          |
|    episodes         | 60       |
|    fps              | 15219    |
|    time_elapsed     | 0        |
|    total_timesteps  | 3036     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51.4     |
|    ep_rew_mean      | 0.359    |
|    exploration_rate | 0.739    |
| time/               |          |
|    episodes       

[I 2023-11-05 17:17:15,356] Trial 5 finished with value: 0.3333333333333333 and parameters: {'learning_rate': 1.7434653320027478e-05, 'gamma': 0.00013049194536823142}. Best is trial 1 with value: 0.3333333333333333.


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 53       |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.983    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 12874    |
|    time_elapsed     | 0        |
|    total_timesteps  | 212      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 45.1     |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.971    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 13197    |
|    time_elapsed     | 0        |
|    total_timesteps  | 361      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 46.2     |
|    ep_rew_mean      | 0.417 



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 46.7     |
|    ep_rew_mean      | 0.406    |
|    exploration_rate | 0.764    |
| time/               |          |
|    episodes         | 64       |
|    fps              | 14896    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2986     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 47.4     |
|    ep_rew_mean      | 0.397    |
|    exploration_rate | 0.745    |
| time/               |          |
|    episodes         | 68       |
|    fps              | 15058    |
|    time_elapsed     | 0        |
|    total_timesteps  | 3225     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 47.8     |
|    ep_rew_mean      | 0.389    |
|    exploration_rate | 0.728    |
| time/               |          |
|    episodes       

[I 2023-11-05 17:18:23,513] Trial 6 finished with value: 0.3333333333333333 and parameters: {'learning_rate': 3.1005497531657405e-05, 'gamma': 0.00010740556040571663}. Best is trial 1 with value: 0.3333333333333333.


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 25.5     |
|    ep_rew_mean      | 0.75     |
|    exploration_rate | 0.992    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10561    |
|    time_elapsed     | 0        |
|    total_timesteps  | 102      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 25.6     |
|    ep_rew_mean      | 0.875    |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 9153     |
|    time_elapsed     | 0        |
|    total_timesteps  | 205      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 38.4     |
|    ep_rew_mean      | 0.583 



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.5     |
|    ep_rew_mean      | 0.354    |
|    exploration_rate | 0.812    |
| time/               |          |
|    episodes         | 48       |
|    fps              | 12456    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2375     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.6     |
|    ep_rew_mean      | 0.327    |
|    exploration_rate | 0.792    |
| time/               |          |
|    episodes         | 52       |
|    fps              | 12564    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2631     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51.6     |
|    ep_rew_mean      | 0.304    |
|    exploration_rate | 0.771    |
| time/               |          |
|    episodes       

[I 2023-11-05 17:19:32,419] Trial 7 finished with value: 0.0 and parameters: {'learning_rate': 0.0001612621744173664, 'gamma': 0.0006421213152458299}. Best is trial 1 with value: 0.3333333333333333.


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 61.8     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 9461     |
|    time_elapsed     | 0        |
|    total_timesteps  | 247      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 58.4     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.963    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 10991    |
|    time_elapsed     | 0        |
|    total_timesteps  | 467      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51.5     |
|    ep_rew_mean      | 0.333 



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.2     |
|    ep_rew_mean      | 0.365    |
|    exploration_rate | 0.793    |
| time/               |          |
|    episodes         | 52       |
|    fps              | 13449    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2613     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.2     |
|    ep_rew_mean      | 0.375    |
|    exploration_rate | 0.782    |
| time/               |          |
|    episodes         | 56       |
|    fps              | 13296    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2757     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.8     |
|    ep_rew_mean      | 0.383    |
|    exploration_rate | 0.768    |
| time/               |          |
|    episodes       

[I 2023-11-05 17:20:41,082] Trial 8 finished with value: 0.0 and parameters: {'learning_rate': 0.00031116453206151925, 'gamma': 0.007856693886313047}. Best is trial 1 with value: 0.3333333333333333.


Number of finished trials:  9
Best trial:
  Value:  0.3333333333333333
  Params: 
    learning_rate: 7.721318851702573e-05
    gamma: 0.00032855502965601274
  User attrs:
    gamma: 0.999671444970344


### Save tuned hyperparameters

In [12]:
from lib.utils import *

# Save the best trial together with the static DQN arguments.
# The live `env` object must not end up as a value in the YAML file, so it is
# filtered out into a copy. DEFAULT_HYPERPARAMS itself is left untouched:
# deleting the key in place would break any later re-run of cells that still
# read DEFAULT_HYPERPARAMS['env'], and would make this cell order-dependent.
params_to_save = {key: value for key, value in DEFAULT_HYPERPARAMS.items() if key != 'env'}

save_model_params(trial, MODEL_NAME, SAVE_PARAMS_PATH, params_to_save)