In [None]:
!pip install stable-baselines3
!pip install sb3-contrib
!pip install optuna

Collecting stable-baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Collecting gymnasium<0.30,>=0.28.1 (from stable-baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium<0.30,>=0.28.1->stable-baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.3/182.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, stable-baselines3
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1 stable-baselines3-2.3

In [None]:
import gym
import numpy as np
from stable_baselines3 import PPO, A2C, SAC, TD3, DQN
from sb3_contrib import QRDQN, TQC
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances
from typing import Any, Dict
import torch
import torch.nn as nn
from stable_baselines3.common.callbacks import EvalCallback
import torch as th
import pandas as pd


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# --- Study-level budget (consumed by the sampler, pruner and study.optimize) ---
N_TRIALS = 100          # maximum number of trials requested from the study
N_JOBS = 1              # number of parallel jobs passed to study.optimize
N_STARTUP_TRIALS = 5    # startup trials for both the TPE sampler and the median pruner
TIMEOUT = 15 * 60       # time budget passed to study.optimize (15 minutes, in seconds)

# --- Per-trial training / evaluation budget ---
N_TIMESTEPS = 20_000                       # training steps per trial
N_EVALUATIONS = 2                          # evaluations performed during one trial
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS   # callback calls between two evaluations
N_EVAL_ENVS = 5                            # environments in the evaluation vec-env
N_EVAL_EPISODES = 10                       # episodes averaged per evaluation

# --- Task and fixed model arguments ---
ENV_ID = 'CartPole-v1'
# Keyword arguments shared by every trial; sampled hyperparameters are merged on top.
DEFAULT_HYPERPARAMS = dict(policy='MlpPolicy', env=ENV_ID)

In [None]:
def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
  """Sample one A2C hyperparameter configuration from an Optuna trial.

  The derived values of ``gamma`` and ``n_steps`` are stored on the trial as
  user attributes, because the raw sampled parameters are ``1 - gamma`` and
  the base-2 exponent of ``n_steps`` respectively.

  Args:
    trial: Optuna trial used to draw the suggestions.

  Returns:
    Keyword arguments for the A2C constructor: ``n_steps``, ``gamma``,
    ``learning_rate``, ``max_grad_norm`` and ``policy_kwargs`` (with
    ``net_arch`` and ``activation_fn``).
  """
  # Sample (1 - gamma) on a log scale, so discount factors close to 1 are
  # explored with fine resolution.
  gamma = 1.0 - trial.suggest_float('gamma', 0.0001, 0.1, log=True)
  max_grad_norm = trial.suggest_float('max_grad_norm', 0.3, 5.0, log=True)
  # Rollout length is a power of two: 2**3 = 8 up to 2**10 = 1024.
  n_steps = 2 ** trial.suggest_int('exponent_n_steps', 3, 10)
  learning_rate = trial.suggest_float('learning_rate', 1e-5, 1, log=True)
  net_arch_name = trial.suggest_categorical('net_arch', ['tiny', 'small'])
  activation_name = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])

  # Record the effective values next to the raw sampled parameters.
  trial.set_user_attr('gamma', gamma)
  trial.set_user_attr('n_steps', n_steps)

  # Pass the pi/vf layout as a plain dict. Wrapping it in a list is the
  # legacy format that Stable-Baselines3 deprecated in v1.8.0 (it emits a
  # warning and unwraps the first element).
  net_arch = (
      {'pi': [64], 'vf': [64]} if net_arch_name == 'tiny'
      else {'pi': [64, 64], 'vf': [64, 64]}
  )

  activation_fn = {'tanh': nn.Tanh, 'relu': nn.ReLU}[activation_name]

  return {
      'n_steps': n_steps,
      'gamma': gamma,
      'learning_rate': learning_rate,
      'max_grad_norm': max_grad_norm,
      'policy_kwargs': {
          'net_arch': net_arch,
          'activation_fn': activation_fn
      }
  }

In [None]:
class TrialEvalCallback(EvalCallback):
  """EvalCallback variant that forwards each evaluation to an Optuna trial.

  After every periodic evaluation the latest mean reward is reported to the
  trial; when the trial asks to be pruned, ``is_pruned`` is set and training
  is stopped by returning ``False`` from ``_on_step``.
  """

  def __init__(
      self,
      eval_env: gym.Env,
      trial: optuna.Trial,
      n_eval_episodes: int = 5,
      eval_freq: int = 10000,
      deterministic: bool = True,
      verbose: int = 0
  ):
    """Configure the parent evaluation callback and attach the trial.

    Args:
      eval_env: Environment handed to ``EvalCallback`` for evaluation.
      trial: Optuna trial that receives the intermediate results.
      n_eval_episodes: Forwarded to ``EvalCallback``.
      eval_freq: Evaluate every ``eval_freq`` calls of the callback.
      deterministic: Forwarded to ``EvalCallback``.
      verbose: Forwarded to ``EvalCallback``.
    """
    super().__init__(
        eval_env=eval_env,
        n_eval_episodes=n_eval_episodes,
        eval_freq=eval_freq,
        deterministic=deterministic,
        verbose=verbose
    )
    self.trial = trial
    # Number of evaluations reported so far; used as the Optuna step index.
    self.eval_idx = 0
    # Set to True once the trial has requested pruning.
    self.is_pruned = False

  def _on_step(self) -> bool:
    """Evaluate on schedule, report to the trial and stop if pruned.

    Returns:
      ``False`` when the trial should be pruned (training stops),
      ``True`` otherwise.
    """
    evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
    if not evaluation_due:
      return True

    # Run the parent's evaluation; its result is read back from
    # ``last_mean_reward`` below.
    super()._on_step()
    self.eval_idx += 1
    self.trial.report(self.last_mean_reward, self.eval_idx)

    if not self.trial.should_prune():
      return True

    self.is_pruned = True
    return False

In [None]:
def objective(trial: optuna.Trial) -> float:
  """Train one A2C model with sampled hyperparameters and score it.

  Args:
    trial: Optuna trial that provides the hyperparameters and receives the
      intermediate evaluation results.

  Returns:
    The mean reward of the last evaluation, or NaN when training aborted
    because of an ``AssertionError`` or ``ValueError``.

  Raises:
    optuna.exceptions.TrialPruned: If the evaluation callback stopped
      training because the trial asked to be pruned.
  """
  # Fixed arguments first, then the sampled hyperparameters on top.
  kwargs = DEFAULT_HYPERPARAMS.copy()
  kwargs.update(sample_a2c_params(trial))

  model = A2C(**kwargs)

  # Separate environments dedicated to the periodic evaluations.
  eval_envs = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS)

  eval_callback = TrialEvalCallback(
    eval_env=eval_envs,
    trial=trial,
    n_eval_episodes=N_EVAL_EPISODES,
    eval_freq=EVAL_FREQ,
    deterministic=True,
    verbose=1
  )

  nan_encountered = False
  try:
    model.learn(N_TIMESTEPS, callback=eval_callback)
  except (AssertionError, ValueError) as e:
    # Randomly sampled hyperparameters can make training diverge to NaN.
    # This may surface as an AssertionError or as a ValueError (e.g. from
    # PyTorch's distribution argument validation). Catching both lets the
    # study continue instead of aborting on a single diverged trial.
    print(e)
    nan_encountered = True
  finally:
    # Always release the environments, whether training succeeded or not.
    model.env.close()
    eval_envs.close()

  if nan_encountered:
    return float("nan")
  if eval_callback.is_pruned:
    raise optuna.exceptions.TrialPruned()

  return eval_callback.last_mean_reward

In [None]:
# Limit PyTorch to a single thread for the duration of the search.
th.set_num_threads(1)

# TPE sampler and median pruner share the same number of startup trials.
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS,
    n_warmup_steps=N_EVALUATIONS // 3,
)
study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)

try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    # A manual interrupt ends the search early; the results gathered so far
    # are still summarised below.
    pass

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")

# Raw sampled parameters first, then the derived values stored as user attrs.
for heading, entries in (
    ("  Params: ", trial.params),
    ("  User attrs:", trial.user_attrs),
):
    print(heading)
    for key, value in entries.items():
        print(f"    {key}: {value}")

# Persist every trial for later inspection.
study.trials_dataframe().to_csv("study_results_a2c_cartpole.csv")

fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

for figure in (fig1, fig2):
    figure.show()

[I 2024-09-07 08:23:04,051] A new study created in memory with name: no-name-d405db12-7247-44f0-9c64-24d886fd0722


Eval num_timesteps=10000, episode_reward=9.30 +/- 0.64
Episode length: 9.30 +/- 0.64
New best mean reward!


[I 2024-09-07 08:23:40,393] Trial 0 finished with value: 9.7 and parameters: {'gamma': 0.013270456736615008, 'max_grad_norm': 1.876085544210284, 'exponent_n_steps': 7, 'learning_rate': 0.06457704378453519, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 9.7.


Eval num_timesteps=20000, episode_reward=9.70 +/- 0.46
Episode length: 9.70 +/- 0.46
New best mean reward!




Eval num_timesteps=10000, episode_reward=139.30 +/- 31.67
Episode length: 139.30 +/- 31.67
New best mean reward!
Eval num_timesteps=20000, episode_reward=110.20 +/- 56.44
Episode length: 110.20 +/- 56.44


[I 2024-09-07 08:24:13,521] Trial 1 finished with value: 110.2 and parameters: {'gamma': 0.08895572488120546, 'max_grad_norm': 0.38127500530463115, 'exponent_n_steps': 10, 'learning_rate': 2.938299583374232e-05, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 1 with value: 110.2.


Eval num_timesteps=10000, episode_reward=422.90 +/- 120.53
Episode length: 422.90 +/- 120.53
New best mean reward!
Eval num_timesteps=20000, episode_reward=367.90 +/- 133.94
Episode length: 367.90 +/- 133.94


[I 2024-09-07 08:24:44,741] Trial 2 finished with value: 367.9 and parameters: {'gamma': 0.00036886598275994977, 'max_grad_norm': 3.0024377495074352, 'exponent_n_steps': 9, 'learning_rate': 0.0015830486177545032, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 2 with value: 367.9.


Eval num_timesteps=10000, episode_reward=157.10 +/- 41.04
Episode length: 157.10 +/- 41.04
New best mean reward!


[I 2024-09-07 08:25:18,940] Trial 3 finished with value: 500.0 and parameters: {'gamma': 0.0077638455987499, 'max_grad_norm': 4.903937907385273, 'exponent_n_steps': 7, 'learning_rate': 0.004403423670865376, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=123.30 +/- 34.37
Episode length: 123.30 +/- 34.37
New best mean reward!
Eval num_timesteps=20000, episode_reward=129.10 +/- 35.12
Episode length: 129.10 +/- 35.12
New best mean reward!


[I 2024-09-07 08:25:51,189] Trial 4 finished with value: 129.1 and parameters: {'gamma': 0.051280879535807185, 'max_grad_norm': 2.9605320254790533, 'exponent_n_steps': 8, 'learning_rate': 2.09208299461036e-05, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.
[I 2024-09-07 08:26:13,044] Trial 5 pruned. 


Eval num_timesteps=10000, episode_reward=106.00 +/- 4.82
Episode length: 106.00 +/- 4.82
New best mean reward!


[I 2024-09-07 08:26:31,724] Trial 6 pruned. 


Eval num_timesteps=10000, episode_reward=9.50 +/- 0.81
Episode length: 9.50 +/- 0.81
New best mean reward!
Eval num_timesteps=10000, episode_reward=457.90 +/- 51.95
Episode length: 457.90 +/- 51.95
New best mean reward!


[I 2024-09-07 08:27:08,747] Trial 7 finished with value: 475.2 and parameters: {'gamma': 0.000135460990013694, 'max_grad_norm': 1.0795065094143843, 'exponent_n_steps': 5, 'learning_rate': 0.0008630005508430798, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.


Eval num_timesteps=20000, episode_reward=475.20 +/- 50.24
Episode length: 475.20 +/- 50.24
New best mean reward!


[I 2024-09-07 08:27:25,122] Trial 8 pruned. 


Eval num_timesteps=10000, episode_reward=110.90 +/- 130.51
Episode length: 110.90 +/- 130.51
New best mean reward!
Eval num_timesteps=10000, episode_reward=211.20 +/- 97.51
Episode length: 211.20 +/- 97.51
New best mean reward!


[I 2024-09-07 08:28:11,318] Trial 9 finished with value: 378.1 and parameters: {'gamma': 0.012915341503684086, 'max_grad_norm': 0.3223361371485566, 'exponent_n_steps': 3, 'learning_rate': 0.0004268717928057358, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.


Eval num_timesteps=20000, episode_reward=378.10 +/- 115.60
Episode length: 378.10 +/- 115.60
New best mean reward!


[I 2024-09-07 08:28:26,863] Trial 10 pruned. 


Eval num_timesteps=10000, episode_reward=140.30 +/- 62.01
Episode length: 140.30 +/- 62.01
New best mean reward!
Eval num_timesteps=10000, episode_reward=481.20 +/- 38.12
Episode length: 481.20 +/- 38.12
New best mean reward!


[I 2024-09-07 08:29:05,432] Trial 11 finished with value: 500.0 and parameters: {'gamma': 0.0002748046585590291, 'max_grad_norm': 0.8614157766373911, 'exponent_n_steps': 5, 'learning_rate': 0.0007131047461536208, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[I 2024-09-07 08:29:23,672] Trial 12 pruned. 


Eval num_timesteps=10000, episode_reward=10.10 +/- 0.54
Episode length: 10.10 +/- 0.54
New best mean reward!


[I 2024-09-07 08:29:40,758] Trial 13 pruned. 


Eval num_timesteps=10000, episode_reward=152.80 +/- 124.84
Episode length: 152.80 +/- 124.84
New best mean reward!


[I 2024-09-07 08:30:00,117] Trial 14 pruned. 


Eval num_timesteps=10000, episode_reward=80.90 +/- 48.88
Episode length: 80.90 +/- 48.88
New best mean reward!


[I 2024-09-07 08:30:16,599] Trial 15 pruned. 


Eval num_timesteps=10000, episode_reward=8.70 +/- 0.46
Episode length: 8.70 +/- 0.46
New best mean reward!


[I 2024-09-07 08:30:32,747] Trial 16 pruned. 


Eval num_timesteps=10000, episode_reward=112.90 +/- 73.45
Episode length: 112.90 +/- 73.45
New best mean reward!


[I 2024-09-07 08:30:52,169] Trial 17 pruned. 


Eval num_timesteps=10000, episode_reward=62.00 +/- 13.97
Episode length: 62.00 +/- 13.97
New best mean reward!


[I 2024-09-07 08:31:09,150] Trial 18 pruned. 


Eval num_timesteps=10000, episode_reward=9.30 +/- 0.64
Episode length: 9.30 +/- 0.64
New best mean reward!
Eval num_timesteps=10000, episode_reward=424.80 +/- 93.20
Episode length: 424.80 +/- 93.20
New best mean reward!


[I 2024-09-07 08:31:43,711] Trial 19 finished with value: 500.0 and parameters: {'gamma': 0.002342484703867948, 'max_grad_norm': 0.8324955728877629, 'exponent_n_steps': 7, 'learning_rate': 0.003062441686503965, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[I 2024-09-07 08:32:03,906] Trial 20 pruned. 


Eval num_timesteps=10000, episode_reward=9.10 +/- 0.83
Episode length: 9.10 +/- 0.83
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2024-09-07 08:32:38,728] Trial 21 finished with value: 500.0 and parameters: {'gamma': 0.0019846877672503576, 'max_grad_norm': 0.8796798829200836, 'exponent_n_steps': 7, 'learning_rate': 0.004305288400174624, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.
[I 2024-09-07 08:32:54,994] Trial 22 pruned. 


Eval num_timesteps=10000, episode_reward=145.70 +/- 103.80
Episode length: 145.70 +/- 103.80
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[I 2024-09-07 08:33:31,170] Trial 23 finished with value: 500.0 and parameters: {'gamma': 0.00018556101142094903, 'max_grad_norm': 0.7477532466902098, 'exponent_n_steps': 6, 'learning_rate': 0.0030183228850804975, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2024-09-07 08:33:47,435] Trial 24 pruned. 


Eval num_timesteps=10000, episode_reward=130.40 +/- 68.90
Episode length: 130.40 +/- 68.90
New best mean reward!


[I 2024-09-07 08:34:05,694] Trial 25 pruned. 


Eval num_timesteps=10000, episode_reward=374.20 +/- 23.36
Episode length: 374.20 +/- 23.36
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=158.80 +/- 5.67
Episode length: 158.80 +/- 5.67


[I 2024-09-07 08:34:37,516] Trial 26 finished with value: 158.8 and parameters: {'gamma': 0.008631954866350877, 'max_grad_norm': 0.45855397993033653, 'exponent_n_steps': 8, 'learning_rate': 0.012008633926028007, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.
[I 2024-09-07 08:34:53,072] Trial 27 pruned. 


Eval num_timesteps=10000, episode_reward=77.90 +/- 34.82
Episode length: 77.90 +/- 34.82
New best mean reward!
Eval num_timesteps=10000, episode_reward=440.20 +/- 98.67
Episode length: 440.20 +/- 98.67
New best mean reward!


[I 2024-09-07 08:35:28,293] Trial 28 finished with value: 500.0 and parameters: {'gamma': 0.00023701804425601224, 'max_grad_norm': 0.718722608380954, 'exponent_n_steps': 6, 'learning_rate': 0.0006401890223613247, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[I 2024-09-07 08:35:44,815] Trial 29 pruned. 


Eval num_timesteps=10000, episode_reward=9.70 +/- 0.90
Episode length: 9.70 +/- 0.90
New best mean reward!


[I 2024-09-07 08:36:02,319] Trial 30 pruned. 


Eval num_timesteps=10000, episode_reward=390.60 +/- 167.17
Episode length: 390.60 +/- 167.17
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[I 2024-09-07 08:36:37,165] Trial 31 finished with value: 500.0 and parameters: {'gamma': 0.0016188820195849945, 'max_grad_norm': 0.878520678804035, 'exponent_n_steps': 7, 'learning_rate': 0.00491866237662306, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.


Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2024-09-07 08:36:53,073] Trial 32 pruned. 


Eval num_timesteps=10000, episode_reward=129.30 +/- 7.16
Episode length: 129.30 +/- 7.16
New best mean reward!


[I 2024-09-07 08:37:09,948] Trial 33 pruned. 


Eval num_timesteps=10000, episode_reward=221.60 +/- 12.31
Episode length: 221.60 +/- 12.31
New best mean reward!


[I 2024-09-07 08:37:27,111] Trial 34 pruned. 


Eval num_timesteps=10000, episode_reward=242.70 +/- 13.40
Episode length: 242.70 +/- 13.40
New best mean reward!


[I 2024-09-07 08:37:42,434] Trial 35 pruned. 


Eval num_timesteps=10000, episode_reward=71.80 +/- 26.64
Episode length: 71.80 +/- 26.64
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=428.20 +/- 79.45
Episode length: 428.20 +/- 79.45


[I 2024-09-07 08:38:14,556] Trial 36 finished with value: 428.2 and parameters: {'gamma': 0.009145735594131654, 'max_grad_norm': 1.4340150367351903, 'exponent_n_steps': 9, 'learning_rate': 0.0023642271037146146, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 3 with value: 500.0.


Number of finished trials: 37
Best trial:
  Value: 500.0
  Params: 
    gamma: 0.0077638455987499
    max_grad_norm: 4.903937907385273
    exponent_n_steps: 7
    learning_rate: 0.004403423670865376
    net_arch: small
    activation_fn: tanh
  User attrs:
    gamma: 0.9922361544012501
    n_steps: 128
