In [2]:
import gymnasium as gym
import pogema
from pogema import GridConfig
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from lib.utils import * 

%load_ext autoreload
%autoreload 2
%matplotlib inline

GRID_LEN = 18

def custom_step(self, action: list):
        assert len(action) == self.grid_config.num_agents
        rewards = []
        terminated = []

        self.move_agents(action)
        self.update_was_on_goal()

        for agent_idx in range(self.grid_config.num_agents):

            c_x, c_y = self.grid.positions_xy[agent_idx]
            f_x, f_y = self.grid.finishes_xy[agent_idx]

            reward = - ( (abs(c_x - f_x) + abs(c_y - f_y)) / (2 * GRID_LEN) )
            #print(f"[CURR] {c_x}, {c_y} [FINISH] {f_x}, {f_y} [DIST] {d} [REWARD] {reward}")

            on_goal = self.grid.on_goal(agent_idx)
            if on_goal and self.grid.is_active[agent_idx]:
                print("FINISH", reward)
                rewards.append(100)
            else:
                rewards.append(reward)
            terminated.append(on_goal)

        for agent_idx in range(self.grid_config.num_agents):
            if self.grid.on_goal(agent_idx):
                self.grid.hide_agent(agent_idx)
                self.grid.is_active[agent_idx] = False

        infos = self._get_infos()

        observations = self._obs()
        truncated = [False] * self.grid_config.num_agents
        return observations, rewards, terminated, truncated, infos

pogema.envs.Pogema.step = custom_step

env_path = 'saved/diff_env.yml'
ENV_NAME = 'ENV_D'
SAVE_PARAMS_PATH = 'saved/tuned_params.yml'
SAVE_METRICS_PATH = 'saved/evaluation_metrics.yml'
ENV_PARAMS = get_model_log(ENV_NAME, env_path) #load env params 
MODEL_NAME = 'PPO_D'

grid_config = GridConfig(
    size=ENV_PARAMS['GRID_SIZE'],                                 # size of the grid map 8 = (8x8)
    density=ENV_PARAMS['DENSITY'],                                # obstacle density
    num_agents=1,                                   # number of agents
    obs_radius=ENV_PARAMS['OBS_RADIUS'],                          # defines field of view
    max_episode_steps=ENV_PARAMS['MAX_EPISODE_STEPS'],            # time horizon
    seed=None                                       # set to None for random obstacles, agents and targets positions at each reset
)

env = gym.make("Pogema-v0",grid_config=grid_config)


  logger.warn(
  logger.warn(


### Optuna Integration 

In [3]:
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(1.2e5)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3

DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "verbose": 1,
    "env": env
}

def sample_dqn_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for DQN hyperparameters."""
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    # batch_size = 2 ** trial.suggest_int("batch_size", 3, 10)
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    # max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 10.0, log=True)
    # target_update_interval = trial.suggest_int("target_update_interval", 500, 20000, log=True)
    # exploration_fraction = trial.suggest_float("exploration_fraction", 0.1, 0.5, log=True)
    # exploration_final_eps = trial.suggest_float("exploration_final_eps", 0.01, 0.1, log=True)

    # Display true values.
    trial.set_user_attr("gamma", gamma)
    # trial.set_user_attr("batch_size", batch_size)

    return {
        "learning_rate": learning_rate,        
        # "batch_size": batch_size,
        "gamma": gamma,
        # "max_grad_norm": max_grad_norm,
        # "target_update_interval": target_update_interval,
        # "exploration_fraction": exploration_fraction,
        # "exploration_final_eps": exploration_final_eps
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_dqn_params(trial))
    # Create the RL model.
    model = DQN(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(env)
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))

[I 2023-11-11 15:03:57,110] A new study created in memory with name: no-name-93c6023f-d82a-4d8c-9c74-016903d93edd


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.2     |
|    ep_rew_mean      | 15.1     |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8101     |
|    time_elapsed     | 0        |
|    total_timesteps  | 201      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 46.8     |
|    ep_rew_mean      | 30.1     |
|    exploration_rate | 0.97     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 8402     |
|    time_elapsed     | 0        |
|    total_timesteps  | 374      |
----------------------------------
FINISH -0.0
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |      



FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.1     |
|    ep_rew_mean      | 23.1     |
|    exploration_rate | 0.876    |
| time/               |          |
|    episodes         | 32       |
|    fps              | 8775     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1572     |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.8     |
|    ep_rew_mean      | 25.3     |
|    exploration_rate | 0.858    |
| time/               |          |
|    episodes         | 36       |
|    fps              | 8882     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1792     |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.1     |
|    ep_rew_mean      | 27.4     |
|    exploration_rate | 0.848 

[I 2023-11-11 15:04:43,100] Trial 0 finished with value: 29.685185 and parameters: {'learning_rate': 2.057334189646879e-05, 'gamma': 0.000233006857915395}. Best is trial 0 with value: 29.685185.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.5     |
|    ep_rew_mean      | 17.6     |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 9841     |
|    time_elapsed     | 0        |
|    total_timesteps  | 194      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 53.1     |
|    ep_rew_mean      | 18.3     |
|    exploration_rate | 0.966    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 9268     |
|    time_elapsed     | 0        |
|    total_timesteps  | 425      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean  



FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.1     |
|    ep_rew_mean      | 32.9     |
|    exploration_rate | 0.848    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 9430     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1926     |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.2     |
|    ep_rew_mean      | 33.9     |
|    exploration_rate | 0.832    |
| time/               |          |
|    episodes         | 44       |
|    fps              | 9443     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2121     |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.5     |
|    ep_rew_mean      | 32.6     |
|    exploration_rate | 0.816 

[I 2023-11-11 15:05:27,299] Trial 1 finished with value: -13.037037 and parameters: {'learning_rate': 0.014680580244208175, 'gamma': 0.004803565445097014}. Best is trial 0 with value: 29.685185.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.2     |
|    ep_rew_mean      | 15.2     |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10207    |
|    time_elapsed     | 0        |
|    total_timesteps  | 193      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 40.9     |
|    ep_rew_mean      | 28.8     |
|    exploration_rate | 0.974    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 9269     |
|    time_elapsed     | 0        |
|    total_timesteps  | 327      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean  



FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51       |
|    ep_rew_mean      | 16.6     |
|    exploration_rate | 0.822    |
| time/               |          |
|    episodes         | 44       |
|    fps              | 10633    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2243     |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.5     |
|    ep_rew_mean      | 18.8     |
|    exploration_rate | 0.812    |
| time/               |          |
|    episodes         | 48       |
|    fps              | 10514    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2375     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.6     |
|    ep_rew_mean      | 16.7     |
|    exploration_rate | 0.792    |
| time/            

[I 2023-11-11 15:06:09,702] Trial 2 finished with value: 99.861111 and parameters: {'learning_rate': 0.00035539212878495964, 'gamma': 0.0057968231952958225}. Best is trial 2 with value: 99.861111.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | -9.12    |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8718     |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.9     |
|    ep_rew_mean      | 18.6     |
|    exploration_rate | 0.968    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 8911     |
|    time_elapsed     | 0        |
|    total_timesteps  | 407      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 55.2     |
|    ep



FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 55.7     |
|    ep_rew_mean      | 11.3     |
|    exploration_rate | 0.841    |
| time/               |          |
|    episodes         | 36       |
|    fps              | 10702    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2004     |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 53.5     |
|    ep_rew_mean      | 14.7     |
|    exploration_rate | 0.831    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 10509    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2140     |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 54.2     |
|    ep_rew_mean      | 14.9     |
|    exploration_rate | 0.811    |
| time/

[I 2023-11-11 15:06:49,972] Trial 3 finished with value: 24.731481333333335 and parameters: {'learning_rate': 0.0013975091374166613, 'gamma': 0.0036351178454157146}. Best is trial 2 with value: 99.861111.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 55.8     |
|    ep_rew_mean      | 13.3     |
|    exploration_rate | 0.982    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 11446    |
|    time_elapsed     | 0        |
|    total_timesteps  | 223      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 47.1     |
|    ep_rew_mean      | 27.6     |
|    exploration_rate | 0.97     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 11025    |
|    time_elapsed     | 0        |
|    total_timesteps  | 377      |
----------------------------------
FINISH -0.0
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |      

[I 2023-11-11 15:07:35,224] Trial 4 finished with value: -9.175926 and parameters: {'learning_rate': 0.002145062507058776, 'gamma': 0.0004494121746082307}. Best is trial 2 with value: 99.861111.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51.2     |
|    ep_rew_mean      | 43.3     |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8759     |
|    time_elapsed     | 0        |
|    total_timesteps  | 205      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 44.2     |
|    ep_rew_mean      | 43.4     |
|    exploration_rate | 0.972    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 8602     |
|    time_elapsed     | 0        |
|    total_timesteps  | 354      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    e

[I 2023-11-11 15:07:45,852] Trial 5 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 38.2     |
|    ep_rew_mean      | 45.4     |
|    exploration_rate | 0.988    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3760     |
|    time_elapsed     | 0        |
|    total_timesteps  | 153      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 44.5     |
|    ep_rew_mean      | 31       |
|    exploration_rate | 0.972    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5743     |
|    time_elapsed     | 0        |
|    total_timesteps  | 356      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean  

[I 2023-11-11 15:07:56,256] Trial 6 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 30.2     |
|    ep_rew_mean      | 72       |
|    exploration_rate | 0.99     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8518     |
|    time_elapsed     | 0        |
|    total_timesteps  | 121      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 38.4     |
|    ep_rew_mean      | 58.1     |
|    exploration_rate | 0.976    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 8471     |
|    time_elapsed     | 0        |
|    total_timesteps  | 307      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |      

[I 2023-11-11 15:08:06,634] Trial 7 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 49.8     |
|    ep_rew_mean      | 16.7     |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10068    |
|    time_elapsed     | 0        |
|    total_timesteps  | 199      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 56.9     |
|    ep_rew_mean      | 3.39     |
|    exploration_rate | 0.964    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 10549    |
|    time_elapsed     | 0        |
|    total_timesteps  | 455      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 59.2     |
|    ep_rew_mean   



FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.6     |
|    ep_rew_mean      | 28.3     |
|    exploration_rate | 0.888    |
| time/               |          |
|    episodes         | 28       |
|    fps              | 7413     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1416     |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.3     |
|    ep_rew_mean      | 30.2     |
|    exploration_rate | 0.872    |
| time/               |          |
|    episodes         | 32       |
|    fps              | 7570     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1611     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51.9     |
|    ep_rew_mean      | 25.8     |
|    exploration_rate | 0.852    |
| time/            

[I 2023-11-11 15:08:52,336] Trial 8 finished with value: 24.70370366666667 and parameters: {'learning_rate': 0.00024629216776408394, 'gamma': 0.00010192386498339565}. Best is trial 2 with value: 99.861111.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 58.8     |
|    ep_rew_mean      | 14.2     |
|    exploration_rate | 0.981    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 9304     |
|    time_elapsed     | 0        |
|    total_timesteps  | 235      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 61.4     |
|    ep_rew_mean      | 1.91     |
|    exploration_rate | 0.961    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7577     |
|    time_elapsed     | 0        |
|    total_timesteps  | 491      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 59.2     |
|    ep

[I 2023-11-11 15:09:02,802] Trial 9 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 43.5     |
|    ep_rew_mean      | 42.3     |
|    exploration_rate | 0.986    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 9008     |
|    time_elapsed     | 0        |
|    total_timesteps  | 174      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48       |
|    ep_rew_mean      | 29.5     |
|    exploration_rate | 0.97     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 9215     |
|    time_elapsed     | 0        |
|    total_timesteps  | 384      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean  

[I 2023-11-11 15:09:47,633] Trial 10 finished with value: 30.259259333333333 and parameters: {'learning_rate': 0.00011458610243637869, 'gamma': 0.0011787037134829317}. Best is trial 2 with value: 99.861111.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | -8.92    |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 11410    |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 59       |
|    ep_rew_mean      | 3.38     |
|    exploration_rate | 0.963    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 10782    |
|    time_elapsed     | 0        |
|    total_timesteps  | 472      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 58.6     |
|    ep

[I 2023-11-11 15:09:58,128] Trial 11 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 37.8     |
|    ep_rew_mean      | 45.2     |
|    exploration_rate | 0.988    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 7827     |
|    time_elapsed     | 0        |
|    total_timesteps  | 151      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 38.5     |
|    ep_rew_mean      | 45.6     |
|    exploration_rate | 0.976    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 8105     |
|    time_elapsed     | 0        |
|    total_timesteps  | 308      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |      

[I 2023-11-11 15:10:08,485] Trial 12 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | -11.8    |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 11336    |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 52.4     |
|    ep_rew_mean      | 16.5     |
|    exploration_rate | 0.967    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 10459    |
|    time_elapsed     | 0        |
|    total_timesteps  | 419      |
----------------------------------
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |      

[I 2023-11-11 15:10:18,917] Trial 13 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 61.5     |
|    ep_rew_mean      | 17.6     |
|    exploration_rate | 0.981    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8218     |
|    time_elapsed     | 0        |
|    total_timesteps  | 246      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 62.8     |
|    ep_rew_mean      | 3.66     |
|    exploration_rate | 0.96     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5371     |
|    time_elapsed     | 0        |
|    total_timesteps  | 502      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 59.2     |
|    ep

[I 2023-11-11 15:10:29,193] Trial 14 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51.2     |
|    ep_rew_mean      | 16.4     |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10084    |
|    time_elapsed     | 0        |
|    total_timesteps  | 205      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 43.5     |
|    ep_rew_mean      | 29.6     |
|    exploration_rate | 0.972    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 9205     |
|    time_elapsed     | 0        |
|    total_timesteps  | 348      |
----------------------------------
FINISH -0.0
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |      

[I 2023-11-11 15:10:40,140] Trial 15 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | -7.28    |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6708     |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 61.4     |
|    ep_rew_mean      | 3.9      |
|    exploration_rate | 0.961    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5609     |
|    time_elapsed     | 0        |
|    total_timesteps  | 491      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 54.3  

[I 2023-11-11 15:10:50,584] Trial 16 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 32.5     |
|    ep_rew_mean      | 95.6     |
|    exploration_rate | 0.99     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1887     |
|    time_elapsed     | 0        |
|    total_timesteps  | 130      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 40.4     |
|    ep_rew_mean      | 57.3     |
|    exploration_rate | 0.974    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3601     |
|    time_elapsed     | 0        |
|    total_timesteps  | 323      |
----------------------------------
FINISH -0.0
FINISH -0.0
FINISH -0.0
----------------------------------
| rol

[I 2023-11-11 15:11:35,492] Trial 17 finished with value: 26.222222333333335 and parameters: {'learning_rate': 6.284165634963678e-05, 'gamma': 0.000569841430626611}. Best is trial 2 with value: 99.861111.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 44.8     |
|    ep_rew_mean      | 45.3     |
|    exploration_rate | 0.986    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2090     |
|    time_elapsed     | 0        |
|    total_timesteps  | 179      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.9     |
|    ep_rew_mean      | 30.2     |
|    exploration_rate | 0.968    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3378     |
|    time_elapsed     | 0        |
|    total_timesteps  | 407      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    e

[I 2023-11-11 15:12:19,835] Trial 18 finished with value: -4.7592593333333335 and parameters: {'learning_rate': 0.0004462042601034459, 'gamma': 0.002029191661329031}. Best is trial 2 with value: 99.861111.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | -10.8    |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10695    |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.9     |
|    ep_rew_mean      | 17.3     |
|    exploration_rate | 0.969    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 8705     |
|    time_elapsed     | 0        |
|    total_timesteps  | 391      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 51.6  

[I 2023-11-11 15:12:30,293] Trial 19 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 64       |
|    ep_rew_mean      | -9.15    |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10989    |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 56.4     |
|    ep_rew_mean      | 4.22     |
|    exploration_rate | 0.964    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 10373    |
|    time_elapsed     | 0        |
|    total_timesteps  | 451      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.9  

[I 2023-11-11 15:12:40,684] Trial 20 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 37.2     |
|    ep_rew_mean      | 43.2     |
|    exploration_rate | 0.988    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6115     |
|    time_elapsed     | 0        |
|    total_timesteps  | 149      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50.5     |
|    ep_rew_mean      | 30.7     |
|    exploration_rate | 0.968    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5571     |
|    time_elapsed     | 0        |
|    total_timesteps  | 404      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 55    

[I 2023-11-11 15:12:51,452] Trial 21 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 58.8     |
|    ep_rew_mean      | 15.1     |
|    exploration_rate | 0.981    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8928     |
|    time_elapsed     | 0        |
|    total_timesteps  | 235      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 61.4     |
|    ep_rew_mean      | 1.7      |
|    exploration_rate | 0.961    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7856     |
|    time_elapsed     | 0        |
|    total_timesteps  | 491      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 57.7     |
|    ep

[I 2023-11-11 15:13:34,029] Trial 22 finished with value: 64.54629633333333 and parameters: {'learning_rate': 3.505742223299215e-05, 'gamma': 0.0002723607267197365}. Best is trial 2 with value: 99.861111.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 48.8     |
|    ep_rew_mean      | 19.5     |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8455     |
|    time_elapsed     | 0        |
|    total_timesteps  | 195      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 56.4     |
|    ep_rew_mean      | 5.08     |
|    exploration_rate | 0.964    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 6549     |
|    time_elapsed     | 0        |
|    total_timesteps  | 451      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 58.9     |
|    ep_rew_mean   

[I 2023-11-11 15:13:44,429] Trial 23 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 36.8     |
|    ep_rew_mean      | 45       |
|    exploration_rate | 0.988    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3359     |
|    time_elapsed     | 0        |
|    total_timesteps  | 147      |
----------------------------------
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 42       |
|    ep_rew_mean      | 43.7     |
|    exploration_rate | 0.973    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3185     |
|    time_elapsed     | 0        |
|    total_timesteps  | 336      |
----------------------------------
FINISH -0.0
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/       

[I 2023-11-11 15:13:55,900] Trial 24 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 36.2     |
|    ep_rew_mean      | 44.8     |
|    exploration_rate | 0.989    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 7887     |
|    time_elapsed     | 0        |
|    total_timesteps  | 145      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 42.2     |
|    ep_rew_mean      | 31.9     |
|    exploration_rate | 0.973    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 6854     |
|    time_elapsed     | 0        |
|    total_timesteps  | 338      |
----------------------------------
FINISH -0.0
----------------------------------
| rollout/            |          |
|    ep_len_mean  

[I 2023-11-11 15:14:07,136] Trial 25 pruned. 


Number of finished trials:  26
Best trial:
  Value:  99.861111
  Params: 
    learning_rate: 0.00035539212878495964
    gamma: 0.0057968231952958225
  User attrs:
    gamma: 0.9942031768047042


### Saved tuned hyperparameters

In [4]:
from lib.utils import *

if DEFAULT_HYPERPARAMS.get('env', None):
    del DEFAULT_HYPERPARAMS['env'] # remove env object from being saved as value in YAML file
    
save_model_params(trial, MODEL_NAME, SAVE_PARAMS_PATH, DEFAULT_HYPERPARAMS)