In [39]:
import gymnasium as gym
from pogema import GridConfig

from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=30
)

env = gym.make("Pogema-v0",grid_config=grid_config)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  logger.warn(
  logger.warn(


### Optuna Integration

In [40]:
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


N_TRIALS = 200
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(1.2e5)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3

DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "verbose": 1,
    "env": "Pogema-v0"
}

def sample_dqn_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for DQN hyperparameters."""
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    batch_size = 2 ** trial.suggest_int("batch_size", 3, 10)
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 10.0, log=True)
    target_update_interval = trial.suggest_int("target_update_interval", 500, 20000, log=True)
    exploration_fraction = trial.suggest_float("exploration_fraction", 0.1, 0.5, log=True)
    exploration_final_eps = trial.suggest_float("exploration_final_eps", 0.01, 0.1, log=True)

    # Display true values.
    trial.set_user_attr("gamma", gamma)
    trial.set_user_attr("batch_size", batch_size)

    return {
        "learning_rate": learning_rate,        
        "batch_size": batch_size,
        "gamma": gamma,
        "max_grad_norm": max_grad_norm,
        "target_update_interval": target_update_interval,
        "exploration_fraction": exploration_fraction,
        "exploration_final_eps": exploration_final_eps
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_dqn_params(trial))
    # Create the RL model.
    model = DQN(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(env)
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))

[I 2023-11-04 19:57:38,826] A new study created in memory with name: no-name-da4dad63-1538-4fc3-954d-0155c63311da


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 59.8     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.995    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 13863    |
|    time_elapsed     | 0        |
|    total_timesteps  | 239      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 40.9     |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.994    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 12953    |
|    time_elapsed     | 0        |
|    total_timesteps  | 327      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 45.4     |
|    ep_rew_mean      | 0.458    |
|    exploration_rate | 0.957    |
| time/               |          |
|    episodes         | 48       |
|    fps              | 13847    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2178     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 44.5     |
|    ep_rew_mean      | 0.481    |
|    exploration_rate | 0.954    |
| time/               |          |
|    episodes         | 52       |
|    fps              | 13740    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2316     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 45.9     |
|    ep_rew_mean      | 0.446    |
|    exploration_rate | 0.949    |
| time/               |          |
|    episodes       

[I 2023-11-04 19:58:46,487] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 0.07106824614220245, 'batch_size': 6, 'gamma': 0.06583044475636637, 'max_grad_norm': 1.061792276936406, 'target_update_interval': 2113, 'exploration_fraction': 0.41448530539027345, 'exploration_final_eps': 0.015034610792265648}. Best is trial 0 with value: 0.0.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.988    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 15520    |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 52.6     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 15868    |
|    time_elapsed     | 0        |
|    total_timesteps  | 421      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 55.1     |
|    ep_rew_mean      | 0.232    |
|    exploration_rate | 0.856    |
| time/               |          |
|    episodes         | 56       |
|    fps              | 16025    |
|    time_elapsed     | 0        |
|    total_timesteps  | 3087     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 53.6     |
|    ep_rew_mean      | 0.267    |
|    exploration_rate | 0.85     |
| time/               |          |
|    episodes         | 60       |
|    fps              | 15877    |
|    time_elapsed     | 0        |
|    total_timesteps  | 3219     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 52.2     |
|    ep_rew_mean      | 0.297    |
|    exploration_rate | 0.845    |
| time/               |          |
|    episodes       

[I 2023-11-04 20:00:18,501] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 0.17807981876338272, 'batch_size': 9, 'gamma': 0.02528172871849498, 'max_grad_norm': 0.6160751639476835, 'target_update_interval': 3037, 'exploration_fraction': 0.17552335481328654, 'exploration_final_eps': 0.02063606015823451}. Best is trial 0 with value: 0.0.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 28.8     |
|    ep_rew_mean      | 0.75     |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 12938    |
|    time_elapsed     | 0        |
|    total_timesteps  | 115      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 46.4     |
|    ep_rew_mean      | 0.375    |
|    exploration_rate | 0.972    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 14271    |
|    time_elapsed     | 0        |
|    total_timesteps  | 371      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_

[I 2023-11-04 20:01:50,939] Trial 2 finished with value: 0.3333333333333333 and parameters: {'learning_rate': 1.8407920478653214e-05, 'batch_size': 9, 'gamma': 0.0007565366907137496, 'max_grad_norm': 9.482308659624474, 'target_update_interval': 1888, 'exploration_fraction': 0.10891274494616725, 'exploration_final_eps': 0.022033917540945417}. Best is trial 2 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.8     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.995    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10447    |
|    time_elapsed     | 0        |
|    total_timesteps  | 199      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 43.9     |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 11233    |
|    time_elapsed     | 0        |
|    total_timesteps  | 351      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 43.1     |
|    ep_rew_mean      | 0.458    |
|    exploration_rate | 0.946    |
| time/               |          |
|    episodes         | 48       |
|    fps              | 11886    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2069     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 43.5     |
|    ep_rew_mean      | 0.442    |
|    exploration_rate | 0.941    |
| time/               |          |
|    episodes         | 52       |
|    fps              | 11970    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2262     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 43.8     |
|    ep_rew_mean      | 0.446    |
|    exploration_rate | 0.936    |
| time/               |          |
|    episodes       

[I 2023-11-04 20:03:28,514] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 0.47567393567209154, 'batch_size': 9, 'gamma': 0.0025388222329006216, 'max_grad_norm': 2.4316735578109374, 'target_update_interval': 2355, 'exploration_fraction': 0.31492059656538307, 'exploration_final_eps': 0.0177944154710885}. Best is trial 2 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 52.5     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.996    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 7314     |
|    time_elapsed     | 0        |
|    total_timesteps  | 210      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 58.2     |
|    ep_rew_mean      | 0.125    |
|    exploration_rate | 0.99     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7863     |
|    time_elapsed     | 0        |
|    total_timesteps  | 466      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51.1     |
|    ep_rew_mean      | 0.3      |
|    exploration_rate | 0.957    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 9953     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2043     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51       |
|    ep_rew_mean      | 0.295    |
|    exploration_rate | 0.953    |
| time/               |          |
|    episodes         | 44       |
|    fps              | 9975     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2246     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 52.1     |
|    ep_rew_mean      | 0.271    |
|    exploration_rate | 0.947    |
| time/               |          |
|    episodes       

[I 2023-11-04 20:05:05,739] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 0.11763161024568054, 'batch_size': 9, 'gamma': 0.012916028609569344, 'max_grad_norm': 3.5784798330209724, 'target_update_interval': 15129, 'exploration_fraction': 0.3877141350224833, 'exploration_final_eps': 0.01940066767534327}. Best is trial 2 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 60.8     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.983    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10336    |
|    time_elapsed     | 0        |
|    total_timesteps  | 243      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 58       |
|    ep_rew_mean      | 0.375    |
|    exploration_rate | 0.968    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 12461    |
|    time_elapsed     | 0        |
|    total_timesteps  | 464      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.1     |
|    ep_rew_mean      | 0.396    |
|    exploration_rate | 0.84     |
| time/               |          |
|    episodes         | 48       |
|    fps              | 12679    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2311     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.3     |
|    ep_rew_mean      | 0.404    |
|    exploration_rate | 0.826    |
| time/               |          |
|    episodes         | 52       |
|    fps              | 12716    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2510     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 47.2     |
|    ep_rew_mean      | 0.411    |
|    exploration_rate | 0.817    |
| time/               |          |
|    episodes       

[I 2023-11-04 20:06:15,444] Trial 5 finished with value: 0.0 and parameters: {'learning_rate': 1.0657989911977938e-05, 'batch_size': 3, 'gamma': 0.00015778053681214283, 'max_grad_norm': 9.8778200094334, 'target_update_interval': 621, 'exploration_fraction': 0.11383265396259325, 'exploration_final_eps': 0.05199672927811682}. Best is trial 2 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4864     |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 56.6     |
|    ep_rew_mean      | 0.125    |
|    exploration_rate | 0.964    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 6035     |
|    time_elapsed     | 0        |
|    total_timesteps  | 453      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.3     |
|    ep_rew_mean      | 0.333    |
|    exploration_rate | 0.859    |
| time/               |          |
|    episodes         | 36       |
|    fps              | 9511     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1774     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.8     |
|    ep_rew_mean      | 0.3      |
|    exploration_rate | 0.838    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 9775     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2030     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 52       |
|    ep_rew_mean      | 0.273    |
|    exploration_rate | 0.818    |
| time/               |          |
|    episodes       

[I 2023-11-04 20:07:32,113] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 1.1378135295765253e-05, 'batch_size': 6, 'gamma': 0.0006552284642738712, 'max_grad_norm': 8.837266051262102, 'target_update_interval': 537, 'exploration_fraction': 0.10002716152460046, 'exploration_final_eps': 0.044879418208562366}. Best is trial 2 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.988    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8739     |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 57.4     |
|    ep_rew_mean      | 0.125    |
|    exploration_rate | 0.979    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 10219    |
|    time_elapsed     | 0        |
|    total_timesteps  | 459      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51.5     |
|    ep_rew_mean      | 0.3      |
|    exploration_rate | 0.905    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 10930    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2062     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.1     |
|    ep_rew_mean      | 0.318    |
|    exploration_rate | 0.898    |
| time/               |          |
|    episodes         | 44       |
|    fps              | 10457    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2204     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49       |
|    ep_rew_mean      | 0.354    |
|    exploration_rate | 0.891    |
| time/               |          |
|    episodes       

[I 2023-11-04 20:09:34,883] Trial 7 finished with value: 0.0 and parameters: {'learning_rate': 0.0003461190480944761, 'batch_size': 10, 'gamma': 0.00010999235374153814, 'max_grad_norm': 3.638394295721496, 'target_update_interval': 8377, 'exploration_fraction': 0.16234737108489664, 'exploration_final_eps': 0.0983211872282596}. Best is trial 2 with value: 0.3333333333333333.


Number of finished trials:  8
Best trial:
  Value:  0.3333333333333333
  Params: 
    learning_rate: 1.8407920478653214e-05
    batch_size: 9
    gamma: 0.0007565366907137496
    max_grad_norm: 9.482308659624474
    target_update_interval: 1888
    exploration_fraction: 0.10891274494616725
    exploration_final_eps: 0.022033917540945417
  User attrs:
    gamma: 0.9992434633092863
    batch_size: 512


### Save tuned hyperparameters

In [41]:
SAVE_PARAMS_PATH = 'saved/tuned_params.yml'
SAVE_METRICS_PATH = 'saved/evaluation_metrics.yml'
MODEL_NAME = 'DQN'

In [42]:
from lib.utils import *

save_model_params(trial, MODEL_NAME, SAVE_PARAMS_PATH, DEFAULT_HYPERPARAMS)

### Evaluation

In [43]:
model_params = get_model_log(MODEL_NAME, SAVE_PARAMS_PATH)
model = load_model_params(DQN, model_params)

metrics = evaluate_metrics(model, env, num_episodes=30, num_trials=100, verbose=False)
save_metrics(metrics, MODEL_NAME, SAVE_METRICS_PATH)
print(f"Agent Success Rate: {metrics['success_rate'] * 100:.2f}%")
print(f"Steps to termination : {metrics['step_array']}")
print(f"Average steps to termination : {metrics['ave_steps']}")

Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Agent Success Rate: 3.00%
Steps to termination : [2, 3, 1]
Average steps to termination : 2.0
