# Optimize Doraemon pipeline hyperparameters with Optuna

In [1]:
import optuna
from stable_baselines3.common.evaluation import evaluate_policy
import gym
from doraemon.doraemon import DoraemonCallback
from env.custom_hopper import *
from PPO.ppo_utils import create_agent, train

# Basic RL pipeline settings.
# Number of Optuna trials passed to study.optimize.
n_trials = 50
# Multiplied by mean_timestep to form the training budget of each trial.
n_episodes = 1000
# Presumably the assumed mean episode length in timesteps -- TODO confirm.
mean_timestep = 100
# Number of episodes handed to evaluate_policy when scoring a trial.
n_eval_episodes = 10

# PPO hyperparameters held fixed while the Doraemon parameters are tuned
# (presumably the result of an earlier PPO tuning run -- TODO confirm).
optimized_clip_range = 0.19877024509129543
optimized_learning_rate = 0.0008
optimized_gamma = 0.992


def optimize_doraemon_call(clip_range, learning_rate, gamma,
                           epsilon, step, delta, alpha,
                           n_episodes, n_eval_episodes,
                           env_train="CustomHopper-source-v0",
                           env_test="CustomHopper-target-v0"):
    """Train an agent with the Doraemon callback and score it on the test env.

    Args:
        clip_range: Forwarded to ``create_agent``.
        learning_rate: Forwarded to ``create_agent``.
        gamma: Forwarded to ``create_agent``.
        epsilon: Forwarded to ``DoraemonCallback``.
        step: Forwarded to ``DoraemonCallback``.
        delta: Forwarded to ``DoraemonCallback``.
        alpha: Forwarded to ``DoraemonCallback``.
        n_episodes: Training budget, forwarded unchanged to ``train`` as
            ``total_timestep``. NOTE(review): despite the name this is used
            as a timestep count, not an episode count; the name is kept so
            existing callers keep working.
        n_eval_episodes: Number of evaluation episodes for ``evaluate_policy``.
        env_train: Environment id the agent is created (and trained) on.
        env_test: Environment id the trained agent is evaluated on.

    Returns:
        The mean reward reported by ``evaluate_policy`` on ``env_test``.
    """
    # Build a fresh agent on the training environment for this call.
    agent = create_agent(
        env=env_train,
        clip_range=clip_range,
        learning_rate=learning_rate,
        gamma=gamma,
        verbose=0
    )

    train(
        agent,
        total_timestep=n_episodes,
        callbacks=[
            DoraemonCallback(agent, epsilon=epsilon, step=step, delta=delta, alpha=alpha, verbose=0)
        ]
    )

    # Evaluate on the test env. The env is closed even when evaluation raises,
    # so repeated calls (one per trial, possibly in parallel) do not pile up
    # open simulator instances.
    test_env = gym.make(env_test)
    try:
        mean_reward, _ = evaluate_policy(agent, test_env, n_eval_episodes=n_eval_episodes)
    finally:
        test_env.close()

    return mean_reward


def objective(trial):
    """Optuna objective: sample Doraemon parameters and return the trial score.

    The four Doraemon parameters are drawn from ``trial`` (all with
    ``log=True``), while the PPO settings come from the module-level
    ``optimized_*`` values. The training budget handed on is
    ``n_episodes * mean_timestep``.

    Returns:
        Whatever ``optimize_doraemon_call`` returns for the sampled parameters.
    """
    # Dict literals evaluate their values top to bottom, so the parameters are
    # requested from the trial in the order epsilon, step, delta, alpha.
    doraemon_params = {
        "epsilon": trial.suggest_float("epsilon", 0.01, 0.1, log=True),
        "step": trial.suggest_float("step", 0.5, 10.0, log=True),
        "delta": trial.suggest_float("delta", 0.1, 0.9, log=True),
        "alpha": trial.suggest_float("alpha", 0.1, 0.9, log=True),
    }

    training_budget = n_episodes * mean_timestep

    return optimize_doraemon_call(
        optimized_clip_range,
        optimized_learning_rate,
        optimized_gamma,
        n_episodes=training_budget,
        n_eval_episodes=n_eval_episodes,
        **doraemon_params,
    )


# Create a study that maximizes the value returned by `objective`.
study = optuna.create_study(direction="maximize")

# Run the search: n_trials trials in total, up to 4 at a time.
study.optimize(
    objective,
    n_trials=n_trials,
    n_jobs=4,
    show_progress_bar=True,
)

# Report the best parameter set found and its score.
print("Best parameters:", study.best_params)
print("Best reward:", study.best_value)

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-06-16 22:40:09,392] A new study created in memory with name: no-name-e02b8361-1e2f-4b92-8e1a-bce4ebccf52f
Best trial: 3. Best value: 1330.03:   2%|▏         | 1/50 [06:16<5:07:14, 376.21s/it]

[I 2025-06-16 22:46:25,613] Trial 3 finished with value: 1330.0336747288704 and parameters: {'epsilon': 0.07394508925157602, 'step': 0.8723629186151334, 'delta': 0.8134725616069084, 'alpha': 0.6806439714612296}. Best is trial 3 with value: 1330.0336747288704.


Best trial: 3. Best value: 1330.03:   4%|▍         | 2/50 [06:17<2:04:43, 155.91s/it]

[I 2025-06-16 22:46:27,321] Trial 2 finished with value: 1117.0046644926072 and parameters: {'epsilon': 0.02079833549032097, 'step': 1.3204770764483063, 'delta': 0.2587489448333862, 'alpha': 0.8030002202147642}. Best is trial 3 with value: 1330.0336747288704.


Best trial: 1. Best value: 1339.04:   6%|▌         | 3/50 [06:20<1:07:14, 85.85s/it] 

[I 2025-06-16 22:46:29,794] Trial 1 finished with value: 1339.0429557442665 and parameters: {'epsilon': 0.03430705772092706, 'step': 4.1585931666335725, 'delta': 0.8331100284517845, 'alpha': 0.5190432930453589}. Best is trial 1 with value: 1339.0429557442665.
[I 2025-06-16 22:46:29,802] Trial 0 finished with value: 1150.0078242063523 and parameters: {'epsilon': 0.013972701753870104, 'step': 1.8564248560374783, 'delta': 0.17734435404874313, 'alpha': 0.38834987468674365}. Best is trial 1 with value: 1339.0429557442665.


Best trial: 1. Best value: 1339.04:  10%|█         | 5/50 [12:46<1:49:49, 146.43s/it]

[I 2025-06-16 22:52:55,533] Trial 4 finished with value: 996.6579333305359 and parameters: {'epsilon': 0.015313663836964636, 'step': 1.506193952389992, 'delta': 0.34916082719276637, 'alpha': 0.27183815609095463}. Best is trial 1 with value: 1339.0429557442665.


Best trial: 5. Best value: 1526.43:  12%|█▏        | 6/50 [12:51<1:17:41, 105.94s/it]

[I 2025-06-16 22:53:01,327] Trial 5 finished with value: 1526.4266810655595 and parameters: {'epsilon': 0.026055919937607135, 'step': 6.658365120710169, 'delta': 0.19441207703457447, 'alpha': 0.5777633427313199}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  14%|█▍        | 7/50 [12:53<54:07, 75.51s/it]   

[I 2025-06-16 22:53:02,869] Trial 6 finished with value: 927.1918229222298 and parameters: {'epsilon': 0.01053030296987819, 'step': 9.542261582529452, 'delta': 0.5771438636945382, 'alpha': 0.3780721835776021}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  16%|█▌        | 8/50 [12:57<38:14, 54.63s/it]

[I 2025-06-16 22:53:07,350] Trial 7 finished with value: 1478.8047057390213 and parameters: {'epsilon': 0.06465144633794895, 'step': 1.3792557584925333, 'delta': 0.3668696781589774, 'alpha': 0.5254050938358001}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  18%|█▊        | 9/50 [18:42<1:35:53, 140.34s/it]

[I 2025-06-16 22:58:51,774] Trial 8 finished with value: 1155.4317673206328 and parameters: {'epsilon': 0.06617216433829666, 'step': 4.740098012661771, 'delta': 0.24187715092519077, 'alpha': 0.3364635552556878}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  20%|██        | 10/50 [18:44<1:06:14, 99.36s/it]

[I 2025-06-16 22:58:54,160] Trial 9 finished with value: 720.3883848786354 and parameters: {'epsilon': 0.021584496414927674, 'step': 5.151134720365037, 'delta': 0.13153884720257972, 'alpha': 0.14700437346707804}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  22%|██▏       | 11/50 [18:49<46:18, 71.24s/it]  

[I 2025-06-16 22:58:59,131] Trial 10 finished with value: 1401.609149634838 and parameters: {'epsilon': 0.04597792253271396, 'step': 1.549203040900916, 'delta': 0.127894270686455, 'alpha': 0.5272680336842025}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  24%|██▍       | 12/50 [18:53<32:17, 50.99s/it]

[I 2025-06-16 22:59:02,541] Trial 11 finished with value: 1100.4141939759254 and parameters: {'epsilon': 0.06155909859408053, 'step': 4.612822445231096, 'delta': 0.5263426803672757, 'alpha': 0.12681569631210607}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  26%|██▌       | 13/50 [24:26<1:23:29, 135.38s/it]

[I 2025-06-16 23:04:35,803] Trial 12 finished with value: 1462.602402997017 and parameters: {'epsilon': 0.0319470332344826, 'step': 4.009430711997262, 'delta': 0.22104843353025952, 'alpha': 0.3618186199429894}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  28%|██▊       | 14/50 [24:30<57:36, 96.00s/it]   

[I 2025-06-16 23:04:39,592] Trial 13 finished with value: 1334.9715693950652 and parameters: {'epsilon': 0.041375341031173386, 'step': 9.242231972888701, 'delta': 0.10542606417364316, 'alpha': 0.20217853295920074}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  30%|███       | 15/50 [24:34<40:00, 68.57s/it]

[I 2025-06-16 23:04:44,017] Trial 14 finished with value: 1409.2195023417473 and parameters: {'epsilon': 0.04458138188505122, 'step': 0.6049426153638748, 'delta': 0.3956570531465145, 'alpha': 0.24461925447080182}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  32%|███▏      | 16/50 [24:40<28:14, 49.85s/it]

[I 2025-06-16 23:04:50,091] Trial 15 finished with value: 1038.324181842804 and parameters: {'epsilon': 0.03192549971337339, 'step': 0.573376754912968, 'delta': 0.37445675740549106, 'alpha': 0.21467554509556933}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  34%|███▍      | 17/50 [30:34<1:17:36, 141.11s/it]

[I 2025-06-16 23:10:44,394] Trial 16 finished with value: 1180.414913725853 and parameters: {'epsilon': 0.04425828180178186, 'step': 0.552504844191801, 'delta': 0.38226344606453583, 'alpha': 0.2429851749311963}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 5. Best value: 1526.43:  36%|███▌      | 18/50 [30:37<53:03, 99.48s/it]   

[I 2025-06-16 23:10:46,665] Trial 17 finished with value: 1052.3314852952958 and parameters: {'epsilon': 0.09652259736410188, 'step': 0.5398166439618594, 'delta': 0.36640031561028447, 'alpha': 0.551670518705415}. Best is trial 5 with value: 1526.4266810655595.


Best trial: 18. Best value: 1652.35:  38%|███▊      | 19/50 [30:44<37:10, 71.95s/it]

[I 2025-06-16 23:10:54,341] Trial 18 finished with value: 1652.352514833212 and parameters: {'epsilon': 0.09168764322520681, 'step': 2.502828879059888, 'delta': 0.17954957384916176, 'alpha': 0.5429804944757511}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  40%|████      | 20/50 [30:54<26:35, 53.18s/it]

[I 2025-06-16 23:11:03,706] Trial 19 finished with value: 1357.241687965393 and parameters: {'epsilon': 0.08665561608473685, 'step': 2.27703618039633, 'delta': 0.19312952358846638, 'alpha': 0.5645003547885004}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  42%|████▏     | 21/50 [37:02<1:11:18, 147.52s/it]

[I 2025-06-16 23:17:11,419] Trial 20 finished with value: 951.9882815599442 and parameters: {'epsilon': 0.022539143995754407, 'step': 3.050489802195125, 'delta': 0.1865972897330153, 'alpha': 0.5942896352783785}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  44%|████▍     | 22/50 [37:05<48:38, 104.22s/it]  

[I 2025-06-16 23:17:14,587] Trial 21 finished with value: 1511.106207537651 and parameters: {'epsilon': 0.022459810699717732, 'step': 2.791709986549783, 'delta': 0.18793336171494782, 'alpha': 0.8457475551934218}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  46%|████▌     | 23/50 [37:11<33:41, 74.89s/it] 

[I 2025-06-16 23:17:21,011] Trial 22 finished with value: 1053.8276039123534 and parameters: {'epsilon': 0.02342463329089713, 'step': 2.740555797180581, 'delta': 0.17647258989994002, 'alpha': 0.8102812483867149}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  48%|████▊     | 24/50 [37:17<23:32, 54.33s/it]

[I 2025-06-16 23:17:27,383] Trial 23 finished with value: 1523.466378325224 and parameters: {'epsilon': 0.023740385543440094, 'step': 3.025103174814063, 'delta': 0.1451904243073372, 'alpha': 0.8777403233408366}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  50%|█████     | 25/50 [43:48<1:04:39, 155.20s/it]

[I 2025-06-16 23:23:57,943] Trial 24 finished with value: 1132.6688778042794 and parameters: {'epsilon': 0.061029284057943674, 'step': 1.0219790397109347, 'delta': 0.1447858318677514, 'alpha': 0.8582383930762658}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  52%|█████▏    | 26/50 [43:53<44:03, 110.14s/it]  

[I 2025-06-16 23:24:02,949] Trial 25 finished with value: 1177.1104462385179 and parameters: {'epsilon': 0.025459466549463603, 'step': 7.332125307210577, 'delta': 0.14772678903324318, 'alpha': 0.8909406930088961}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  54%|█████▍    | 27/50 [44:02<30:36, 79.85s/it] 

[I 2025-06-16 23:24:12,112] Trial 26 finished with value: 913.4383805513382 and parameters: {'epsilon': 0.017197695724283132, 'step': 6.437149270625138, 'delta': 0.14435351525018966, 'alpha': 0.8851838450252654}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  56%|█████▌    | 28/50 [44:09<21:16, 58.03s/it]

[I 2025-06-16 23:24:19,224] Trial 27 finished with value: 1121.8796617567539 and parameters: {'epsilon': 0.01621054007114924, 'step': 6.94508455676247, 'delta': 0.14606778415094904, 'alpha': 0.43271426159531756}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  58%|█████▊    | 29/50 [51:26<1:00:03, 171.60s/it]

[I 2025-06-16 23:31:35,846] Trial 28 finished with value: 803.5847244501114 and parameters: {'epsilon': 0.026074801916971577, 'step': 6.06028869450143, 'delta': 0.11118126645334438, 'alpha': 0.4352349205114702}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  60%|██████    | 30/50 [51:29<40:21, 121.07s/it]  

[I 2025-06-16 23:31:39,017] Trial 29 finished with value: 632.8463603973389 and parameters: {'epsilon': 0.016841323279136396, 'step': 6.61388710377502, 'delta': 0.10375441195867711, 'alpha': 0.4322646488448229}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  62%|██████▏   | 31/50 [51:39<27:48, 87.80s/it] 

[I 2025-06-16 23:31:49,180] Trial 30 finished with value: 748.6894614100456 and parameters: {'epsilon': 0.027425588457734157, 'step': 3.408456257551214, 'delta': 0.1069953609699228, 'alpha': 0.4448195989157771}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  64%|██████▍   | 32/50 [51:49<19:20, 64.45s/it]

[I 2025-06-16 23:31:59,148] Trial 31 finished with value: 1559.3922566771507 and parameters: {'epsilon': 0.03708043136862962, 'step': 3.365883188324153, 'delta': 0.10851731100824664, 'alpha': 0.6777320437663199}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  66%|██████▌   | 33/50 [58:36<47:20, 167.10s/it]

[I 2025-06-16 23:38:45,762] Trial 32 finished with value: 1221.6006094932557 and parameters: {'epsilon': 0.0121135436419315, 'step': 2.030971420877432, 'delta': 0.2874664108516673, 'alpha': 0.6884274739042898}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  68%|██████▊   | 34/50 [58:42<31:42, 118.93s/it]

[I 2025-06-16 23:38:52,282] Trial 33 finished with value: 819.2337367653847 and parameters: {'epsilon': 0.010870288059199948, 'step': 3.426055836725045, 'delta': 0.2141406968732666, 'alpha': 0.6871300893433795}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  70%|███████   | 35/50 [58:52<21:32, 86.16s/it] 

[I 2025-06-16 23:39:01,980] Trial 34 finished with value: 806.2375757038593 and parameters: {'epsilon': 0.012775063861908903, 'step': 2.279961855423237, 'delta': 0.28791154326289764, 'alpha': 0.6882590567678178}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  72%|███████▏  | 36/50 [59:05<14:57, 64.11s/it]

[I 2025-06-16 23:39:14,656] Trial 35 finished with value: 1322.0194519758224 and parameters: {'epsilon': 0.019184412086748707, 'step': 2.0795492844641967, 'delta': 0.162566621818756, 'alpha': 0.6798362221507247}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  74%|███████▍  | 37/50 [1:05:19<34:01, 157.07s/it]

[I 2025-06-16 23:45:28,627] Trial 36 finished with value: 882.5656228423119 and parameters: {'epsilon': 0.03581660935720011, 'step': 2.3162971327987747, 'delta': 0.1658673590577146, 'alpha': 0.6831785309085049}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  76%|███████▌  | 38/50 [1:05:28<22:31, 112.61s/it]

[I 2025-06-16 23:45:37,496] Trial 37 finished with value: 1434.8681944847108 and parameters: {'epsilon': 0.03733377273830992, 'step': 2.116782769334317, 'delta': 0.16446772430922893, 'alpha': 0.6419840308537903}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  78%|███████▊  | 39/50 [1:05:36<14:54, 81.36s/it] 

[I 2025-06-16 23:45:45,949] Trial 38 finished with value: 945.1485940337182 and parameters: {'epsilon': 0.03591139164428566, 'step': 1.776490743050462, 'delta': 0.12546230758502935, 'alpha': 0.6273509272080006}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  80%|████████  | 40/50 [1:05:51<10:13, 61.31s/it]

[I 2025-06-16 23:46:00,475] Trial 39 finished with value: 1203.5106875419617 and parameters: {'epsilon': 0.037064248176890795, 'step': 1.6939522696825755, 'delta': 0.12223910717599633, 'alpha': 0.630522561300474}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  82%|████████▏ | 41/50 [1:12:17<23:49, 158.87s/it]

[I 2025-06-16 23:52:26,977] Trial 40 finished with value: 1027.6585668683051 and parameters: {'epsilon': 0.052877034327148915, 'step': 3.8296837687749847, 'delta': 0.12230527985108061, 'alpha': 0.6047101936852809}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  84%|████████▍ | 42/50 [1:12:26<15:11, 113.99s/it]

[I 2025-06-16 23:52:36,245] Trial 41 finished with value: 931.3426960229874 and parameters: {'epsilon': 0.049709889434572156, 'step': 1.7586095317242607, 'delta': 0.1224289439052908, 'alpha': 0.4712575528649134}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  86%|████████▌ | 43/50 [1:12:33<09:33, 81.89s/it] 

[I 2025-06-16 23:52:43,222] Trial 42 finished with value: 1007.2190884709358 and parameters: {'epsilon': 0.029146851495110426, 'step': 3.6767375417583676, 'delta': 0.11996788552033155, 'alpha': 0.7592187625030354}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  88%|████████▊ | 44/50 [1:12:50<06:14, 62.44s/it]

[I 2025-06-16 23:53:00,300] Trial 43 finished with value: 1035.1448810338975 and parameters: {'epsilon': 0.02828345416213954, 'step': 3.415057563084007, 'delta': 0.25315795861043866, 'alpha': 0.7453045686938501}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  90%|█████████ | 45/50 [1:19:42<13:56, 167.32s/it]

[I 2025-06-16 23:59:52,322] Trial 44 finished with value: 1461.3749068796635 and parameters: {'epsilon': 0.028981343605394925, 'step': 2.82713082043001, 'delta': 0.20570993972573307, 'alpha': 0.7695195769512277}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  92%|█████████▏| 46/50 [1:19:53<08:00, 120.17s/it]

[I 2025-06-17 00:00:02,489] Trial 45 finished with value: 683.6829125285149 and parameters: {'epsilon': 0.018999951672722545, 'step': 2.771593098761571, 'delta': 0.20880817194927798, 'alpha': 0.782098268619035}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  94%|█████████▍| 47/50 [1:20:01<04:19, 86.60s/it] 

[I 2025-06-17 00:00:10,755] Trial 46 finished with value: 858.2216099262238 and parameters: {'epsilon': 0.018205777271614525, 'step': 2.752717423533241, 'delta': 0.21415431726395423, 'alpha': 0.77289424830926}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  96%|█████████▌| 48/50 [1:20:12<02:08, 64.07s/it]

[I 2025-06-17 00:00:22,264] Trial 47 finished with value: 675.4586905121803 and parameters: {'epsilon': 0.018865125105137912, 'step': 2.752429791687192, 'delta': 0.20745852011466115, 'alpha': 0.48985325743538105}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35:  98%|█████████▊| 49/50 [1:22:16<01:21, 81.99s/it]

[I 2025-06-17 00:02:26,063] Trial 48 finished with value: 1142.5502455115318 and parameters: {'epsilon': 0.019647682154118915, 'step': 5.2859065324328345, 'delta': 0.2202191255211601, 'alpha': 0.311214966298559}. Best is trial 18 with value: 1652.352514833212.


Best trial: 18. Best value: 1652.35: 100%|██████████| 50/50 [1:22:17<00:00, 98.76s/it]

[I 2025-06-17 00:02:27,173] Trial 49 finished with value: 1226.3171590983868 and parameters: {'epsilon': 0.019703037167380637, 'step': 5.178305582279415, 'delta': 0.23309801375934935, 'alpha': 0.4983287516825577}. Best is trial 18 with value: 1652.352514833212.
Best parameters: {'epsilon': 0.09168764322520681, 'step': 2.502828879059888, 'delta': 0.17954957384916176, 'alpha': 0.5429804944757511}
Best reward: 1652.352514833212



