In [7]:
import gymnasium as gym
from pogema import GridConfig

from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=30
)

env = gym.make("Pogema-v0",grid_config=grid_config)

# dqn_model = DQN(
#     "MlpPolicy",
#     env,
#     verbose=1,
#     train_freq=16,
#     gradient_steps=8, # gradient steps to do after each rollout
#     gamma=0.99, # discount factor
#     exploration_fraction=0.4, 
#     exploration_final_eps=0.07,
#     target_update_interval=600, # update the target network afte fixed number of steps
#     learning_starts=1000, # how many steps of the model to collect transitions for before learning starts
#     buffer_size=10000, # size of the replay buffer
#     batch_size=128, # Minibatch size for each gradient update
#     learning_rate=4e-3,
#     policy_kwargs=dict(net_arch=[256, 256]),
#     seed=42,
#     tensorboard_log="./tensorboard"
# )

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Optuna Integration

In [8]:
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(1.2e5)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3


ENV_ID = "Pogema-v0"

DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "verbose": 1,
    "env": ENV_ID
}

def sample_dqn_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for DQN hyperparameters."""
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
    exploration_fraction = trial.suggest_float("exploration_fraction", 0.1, 0.5, log=True)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)
    # net_arch = trial.suggest_categorical("net_arch", ["tiny", "small"])

    # Display true values.
    trial.set_user_attr("gamma_", gamma)

    # net_arch = [
    #     {"pi": [64], "vf": [64]} if net_arch == "tiny" else {"pi": [64, 64], "vf": [64, 64]}
    # ]

    return {
        "exploration_fraction": exploration_fraction,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "max_grad_norm": max_grad_norm,
        # "policy_kwargs": {
        #     "net_arch": net_arch
        # },
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_dqn_params(trial))
    # Create the RL model.
    model = DQN(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(env)
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))

[I 2023-11-02 13:47:08,789] A new study created in memory with name: no-name-94920176-0475-4c3d-b5ed-3d9fcf128d59


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 52.2     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.994    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 13859    |
|    time_elapsed     | 0        |
|    total_timesteps  | 209      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.2     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.988    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 15182    |
|    time_elapsed     | 0        |
|    total_timesteps  | 402      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_

[I 2023-11-02 13:48:35,995] Trial 0 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.01628361908590592, 'max_grad_norm': 3.352129661507581, 'exploration_fraction': 0.26059486781914126, 'lr': 1.0830094918146383e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.995    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 14652    |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.99     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 15260    |
|    time_elapsed     | 0        |
|    total_timesteps  | 512      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_

[I 2023-11-02 13:49:49,234] Trial 1 finished with value: 0.0 and parameters: {'gamma': 0.02175435633379501, 'max_grad_norm': 2.787292276307632, 'exploration_fraction': 0.4090469127942067, 'lr': 0.0003662621470440864}. Best is trial 0 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 39       |
|    ep_rew_mean      | 0.75     |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 12023    |
|    time_elapsed     | 0        |
|    total_timesteps  | 156      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 44.1     |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.994    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 12448    |
|    time_elapsed     | 0        |
|    total_timesteps  | 353      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 52.9     |
|    ep_rew_mean      | 0.346    |
|    exploration_rate | 0.951    |
| time/               |          |
|    episodes         | 52       |
|    fps              | 13537    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2749     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 53.3     |
|    ep_rew_mean      | 0.339    |
|    exploration_rate | 0.947    |
| time/               |          |
|    episodes         | 56       |
|    fps              | 13484    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2984     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 52       |
|    ep_rew_mean      | 0.35     |
|    exploration_rate | 0.944    |
| time/               |          |
|    episodes       

[I 2023-11-02 13:51:07,658] Trial 2 finished with value: 0.0 and parameters: {'gamma': 0.003969616032647339, 'max_grad_norm': 1.4634999643731978, 'exploration_fraction': 0.442874789810634, 'lr': 0.13846852406107824}. Best is trial 0 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 57.5     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.987    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 15719    |
|    time_elapsed     | 0        |
|    total_timesteps  | 230      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 53.5     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.977    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 14921    |
|    time_elapsed     | 0        |
|    total_timesteps  | 428      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.4     |
|    ep_rew_mean      | 0.339    |
|    exploration_rate | 0.848    |
| time/               |          |
|    episodes         | 56       |
|    fps              | 14374    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2769     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.4     |
|    ep_rew_mean      | 0.333    |
|    exploration_rate | 0.834    |
| time/               |          |
|    episodes         | 60       |
|    fps              | 14408    |
|    time_elapsed     | 0        |
|    total_timesteps  | 3025     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.4     |
|    ep_rew_mean      | 0.344    |
|    exploration_rate | 0.824    |
| time/               |          |
|    episodes       

[I 2023-11-02 13:52:28,922] Trial 3 finished with value: 0.0 and parameters: {'gamma': 0.03585061906352209, 'max_grad_norm': 0.6000588342623594, 'exploration_fraction': 0.1446443119537732, 'lr': 0.0771912903638758}. Best is trial 0 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 37.5     |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 12524    |
|    time_elapsed     | 0        |
|    total_timesteps  | 150      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.8     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.992    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 10862    |
|    time_elapsed     | 0        |
|    total_timesteps  | 406      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 55.4     |
|    ep_rew_mean      | 0.208    |
|    exploration_rate | 0.95     |
| time/               |          |
|    episodes         | 48       |
|    fps              | 13379    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2657     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 55.1     |
|    ep_rew_mean      | 0.212    |
|    exploration_rate | 0.946    |
| time/               |          |
|    episodes         | 52       |
|    fps              | 13404    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2863     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 54.7     |
|    ep_rew_mean      | 0.214    |
|    exploration_rate | 0.942    |
| time/               |          |
|    episodes       

[I 2023-11-02 13:53:47,394] Trial 4 finished with value: 0.0 and parameters: {'gamma': 0.001580886937752515, 'max_grad_norm': 2.86026489215145, 'exploration_fraction': 0.42001440134750484, 'lr': 0.35194338716677687}. Best is trial 0 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 41.2     |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.994    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8440     |
|    time_elapsed     | 0        |
|    total_timesteps  | 165      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 41.5     |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.988    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 9592     |
|    time_elapsed     | 0        |
|    total_timesteps  | 332      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_

[I 2023-11-02 13:55:09,348] Trial 5 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.00015831219903028533, 'max_grad_norm': 4.811261163871077, 'exploration_fraction': 0.22232209101492664, 'lr': 1.0487400501265364e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 38.2     |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.995    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 13454    |
|    time_elapsed     | 0        |
|    total_timesteps  | 153      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 40       |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.989    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 9955     |
|    time_elapsed     | 0        |
|    total_timesteps  | 320      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_

[I 2023-11-02 13:56:18,618] Trial 6 finished with value: 0.0 and parameters: {'gamma': 0.07273095267860695, 'max_grad_norm': 0.3203679814563251, 'exploration_fraction': 0.23193514076629365, 'lr': 1.0926053547540701e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cuda device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.5     |
|    ep_rew_mean      | 0.25     |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 9019     |
|    time_elapsed     | 0        |
|    total_timesteps  | 202      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 39.8     |
|    ep_rew_mean      | 0.5      |
|    exploration_rate | 0.976    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 8488     |
|    time_elapsed     | 0        |
|    total_timesteps  | 318      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_

[I 2023-11-02 13:57:27,611] Trial 7 finished with value: 0.0 and parameters: {'gamma': 0.007436877045346131, 'max_grad_norm': 1.3663915689163786, 'exploration_fraction': 0.10434704932730175, 'lr': 0.0007388963659247938}. Best is trial 0 with value: 0.3333333333333333.


Number of finished trials:  8
Best trial:
  Value:  0.3333333333333333
  Params: 
    gamma: 0.01628361908590592
    max_grad_norm: 3.352129661507581
    exploration_fraction: 0.26059486781914126
    lr: 1.0830094918146383e-05
  User attrs:
    gamma_: 0.9837163809140941


### Evaluation