### CX Calibration with HPO under the new code architecture / workflow (DEC 2023)

In [11]:
import sys
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
import time
import yaml
import pickle
import optuna
module_path = os.path.abspath(os.path.join('/Users/lukasvoss/Documents/Master Wirtschaftsphysik/Masterarbeit Yale-NUS CQT/Quantum_Optimal_Control'))
if module_path not in sys.path:
    sys.path.append(module_path)

from quantumenvironment import QuantumEnvironment
from agent import Agent
from gate_level_abstraction import gate_q_env_config
from helper_functions import load_agent_from_yaml_file
from ppo import make_train_ppo
from qconfig import QEnvConfig

import logging
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s INFO %(message)s", # hardcoded INFO level
    datefmt="%Y-%m-%d %H:%M:%S",
    stream=sys.stdout,
)

In [None]:
type(gate_q_env_config)

In [None]:
gate_q_env_config.action_space

In [None]:
# from gymnasium.wrappers.rescale_action import RescaleAction

# min_action = 0
# max_action = 1
# wrapped_env = RescaleAction(q_env, min_action=min_action, max_action=max_action)

### Import HPO Params from YAML file

In [None]:
agent_config = load_agent_from_yaml_file('agent_config.yaml')

In [None]:
ppo_params, network_config, hpo_config = load_agent_from_yaml_file('agent_config.yaml')

In [None]:
ppo_params

In [None]:
network_config

In [None]:
hpo_config

In [None]:
q_env = QuantumEnvironment(gate_q_env_config)
q_env
q_env.parameters.params
q_env.circuit_truncations[0].draw('mpl')

In [15]:
class HyperparameterOptimizer:
    def __init__(
            self, 
            gate_q_env_config: QEnvConfig, 
            path_agent_config: str, 
            save_results_path: str, 
            log_progress: bool = True
        ):
        self.gate_q_env_config = gate_q_env_config
        self.q_env = QuantumEnvironment(self.gate_q_env_config)
        self.ppo_params, self.network_config, self.hpo_config = load_agent_from_yaml_file(path_agent_config)
        self.save_results_path = save_results_path
        self.log_progress = log_progress
        
    def objective(self, trial):
        # Fetch hyperparameters from the trial object
        hpo_config = self.hpo_config
        network_config = self.network_config
        ppo_params = self.ppo_params
        agent_config = {
        'N_UPDATES': trial.suggest_int('N_UPDATES', hpo_config['n_updates'][0], hpo_config['n_updates'][1]),
        'N_EPOCHS': trial.suggest_int('N_EPOCHS', hpo_config['n_epochs'][0], hpo_config['n_epochs'][1]),
        'MINIBATCH_SIZE': trial.suggest_categorical('MINIBATCH_SIZE', hpo_config['minibatch_size']),
        'BATCHSIZE_MULTIPLIER': trial.suggest_int('BATCHSIZE_MULTIPLIER', hpo_config['batchsize_multiplier'][0], hpo_config['batchsize_multiplier'][1]),
        'LR': trial.suggest_float('LR', hpo_config['learning_rate'][0], hpo_config['learning_rate'][1], log=True),
        'GAMMA': trial.suggest_float('GAMMA', hpo_config['gamma'][0], hpo_config['gamma'][1]),
        'GAE_LAMBDA': trial.suggest_float('GAE_LAMBDA', hpo_config['gae_lambda'][0], hpo_config['gae_lambda'][1]),
        'ENT_COEF': trial.suggest_float('ENT_COEF', hpo_config['ent_coef'][0], hpo_config['ent_coef'][1]),
        'V_COEF': trial.suggest_float('V_COEF', hpo_config['v_coef'][0], hpo_config['v_coef'][1]),
        'GRADIENT_CLIP': trial.suggest_float('GRADIENT_CLIP', hpo_config['max_grad_norm'][0], hpo_config['max_grad_norm'][1]),
        'CLIP_VALUE_COEF': trial.suggest_float('CLIP_VALUE_COEF', hpo_config['clip_value_coef'][0], hpo_config['clip_value_coef'][1]),
        'CLIP_RATIO': trial.suggest_float('CLIP_RATIO', hpo_config['clip_ratio'][0], hpo_config['clip_ratio'][1]),
        }

        # Allowing for a range of possible batchsize / minibatch_size combinations by ensuring that batchsize is a multiple of minibatch_size
        agent_config['BATCHSIZE'] = agent_config['MINIBATCH_SIZE'] * agent_config['BATCHSIZE_MULTIPLIER']
        
        agent_config['CLIP_VALUE_LOSS'] = hpo_config['clip_value_loss']

        # Add network-specific hyperparameters that are not part of HPO scope

        agent_config['OPTIMIZER'] = network_config['optimizer']
        agent_config['N_UNITS'] = network_config['n_units']
        agent_config['ACTIVATION'] = network_config['activation']
        agent_config['INCLUDE_CRITIC'] = network_config['include_critic']
        agent_config['NORMALIZE_ADVANTAGE'] = network_config['normalize_advantage']

        agent_config['RUN_NAME'] = ppo_params['run_name']

        self.q_env = QuantumEnvironment(self.gate_q_env_config)
        q_env = self.q_env
        q_env.batch_size = agent_config['BATCHSIZE'] # Overwrite the batch_size of the environment with the one from the agent_config

        train_fn = make_train_ppo(agent_config, q_env)
        training_results = train_fn(total_updates=agent_config['N_UPDATES'], print_debug=True, num_prints=50)

        # Save the action vector associated with this trial's fidelity for future retrieval
        trial.set_user_attr('action_vector', training_results['action_vector'])

        # Use a relevant metric from training_results as the return value
        last_ten_percent = int(0.1 * agent_config['N_UPDATES'])
        
        return training_results['avg_return'][-last_ten_percent]  # Return a metric to minimize or maximize

    def save_best_configuration(self):
        if self.best_trial is not None:
            best_config = {
                'parameters': self.best_trial.params,
                'action_vector': self.best_trial.user_attrs['action_vector']
            }

            # Extract directory from the path
            # directory = os.path.dirname(o,self.save_results_path)
            # # Create directory if it does not exist
            # if not os.path.exists(directory):
            #     os.makedirs(directory)

            pickle_file_name = os.path.join(self.save_results_path, f'reward_{round(self.best_trial.value, 6)}.pickle')
            logging.info('{}'.format(pickle_file_name))
            with open(pickle_file_name, 'wb') as handle:
                pickle.dump(best_config, handle, protocol=pickle.HIGHEST_PROTOCOL)
            print(f"Best configuration saved to {pickle_file_name}")
        else:
            print("No best trial data to save.")
    
    def logging_progress(self, study, start_time):
        logging.warning('---------------- FINISHED HPO ----------------')
        logging.warning('HPO completed in {} seconds.'.format(round(time.time() - start_time, 2)))
        logging.warning("Best trial:")
        logging.warning("-------------------------")
        logging.warning("  Value: {}".format(study.best_trial.value))
        logging.warning("  Parameters: ")
        for key, value in study.best_trial.params.items():
            logging.warning("    {}: {}".format(key, value))

        best_action_vector = study.best_trial.user_attrs['action_vector']
        logging.warning('The best action vector is {}'.format(best_action_vector))
    
    def optimize_hyperparameters(self):
        start_time = time.time()
        num_HPO_trials = self.hpo_config['num_trials']
        logging.warning('num_HPO_trials: {}'.format(num_HPO_trials))
        logging.warning('---------------- STARTING HPO ----------------')

        study = optuna.create_study(direction="maximize")
        study.optimize(self.objective, n_trials=num_HPO_trials)
        
        if self.log_progress:
            self.logging_progress(study, start_time)

        self.best_trial = study.best_trial
        self.save_best_configuration()
    

    @property
    def num_hpo_trials(self):
        return self.hpo_config.get('num_trials', 0)

In [16]:
current_working_directory = os.getcwd()
print("Current Working Directory:", current_working_directory)

Current Working Directory: /Users/lukasvoss/Documents/Master Wirtschaftsphysik/Masterarbeit Yale-NUS CQT/Quantum_Optimal_Control/gate_level_abstraction/hpo


In [17]:
path_agent_config = 'agent_config.yaml'
save_results_path = 'hpo_results'
optimizer = HyperparameterOptimizer(gate_q_env_config, path_agent_config, save_results_path)
optimizer.optimize_hyperparameters()

 75%|███████▍  | 50/67 [00:07<00:02,  6.66it/s]

Fidelity History: []
SparsePauliOp(['II', 'IZ', 'ZI', 'ZZ'],
              coeffs=[ 0.25+0.j, -0.25+0.j,  0.25+0.j, -0.25+0.j])


 76%|███████▌  | 51/67 [00:08<00:02,  6.33it/s]

mean tensor([ 0.0358,  0.0685, -0.1408, -0.0609, -0.0307, -0.1166, -0.0023])
Average return: 0.9708470901376642
DFE Rewards Mean: 0.9708470901376642
DFE Rewards standard dev 0.030232810960040574
Returns Mean: 4.8713217
Returns standard dev 3.1871753
Advantages Mean: 1.1966571
Advantages standard dev 3.1871753
Fidelity History: []
SparsePauliOp(['II', 'XX', 'YY', 'ZZ'],
              coeffs=[ 0.25+0.j,  0.25+0.j,  0.25+0.j, -0.25+0.j])


 78%|███████▊  | 52/67 [00:08<00:02,  6.35it/s]

mean tensor([ 0.0399,  0.0854, -0.1474, -0.0383, -0.0293, -0.0796, -0.0339])
Average return: 0.9608132418453466
DFE Rewards Mean: 0.9608132418453466
DFE Rewards standard dev 0.040541789155987135
Returns Mean: 4.501082
Returns standard dev 3.1407716
Advantages Mean: 0.36516443
Advantages standard dev 3.1407719
Fidelity History: []
SparsePauliOp(['II', 'IZ', 'ZI', 'ZZ'],
              coeffs=[0.25+0.j, 0.25+0.j, 0.25+0.j, 0.25+0.j])


 79%|███████▉  | 53/67 [00:08<00:02,  6.42it/s]

mean tensor([ 0.0273,  0.0734, -0.1154, -0.0238, -0.0052, -0.0639, -0.0223])
Average return: 0.979279196338172
DFE Rewards Mean: 0.979279196338172
DFE Rewards standard dev 0.027818928627310162
Returns Mean: 5.7910767
Returns standard dev 3.8715916
Advantages Mean: 1.916023
Advantages standard dev 3.8715916
Fidelity History: []
SparsePauliOp(['II', 'IY', 'XI', 'XY'],
              coeffs=[0.25+0.j, 0.25+0.j, 0.25+0.j, 0.25+0.j])


 81%|████████  | 54/67 [00:08<00:02,  6.48it/s]

mean tensor([ 0.0304,  0.0988, -0.1304, -0.0141, -0.0085, -0.0939, -0.0223])
Average return: 0.9552791256406263
DFE Rewards Mean: 0.9552791256406263
DFE Rewards standard dev 0.03514525591483814
Returns Mean: 4.0111156
Returns standard dev 2.6666675
Advantages Mean: -0.76036906
Advantages standard dev 2.6666677
Fidelity History: []
SparsePauliOp(['II', 'XY', 'YZ', 'ZX'],
              coeffs=[ 0.25+0.j,  0.25+0.j,  0.25+0.j, -0.25+0.j])


 82%|████████▏ | 55/67 [00:08<00:01,  6.50it/s]

mean tensor([ 0.0307,  0.1126, -0.1482, -0.0240,  0.0147, -0.0578,  0.0129])
Average return: 0.9725133292784167
DFE Rewards Mean: 0.9725133292784167
DFE Rewards standard dev 0.021175960876391002
Returns Mean: 4.563728
Returns standard dev 2.8883693
Advantages Mean: 0.023745475
Advantages standard dev 2.8883696
Fidelity History: []
SparsePauliOp(['II', 'IX', 'XI', 'XX'],
              coeffs=[0.25+0.j, 0.25+0.j, 0.25+0.j, 0.25+0.j])


 84%|████████▎ | 56/67 [00:08<00:01,  6.49it/s]

mean tensor([-0.0494,  0.0960, -0.1311, -0.0364,  0.0244, -0.0025,  0.0030])
Average return: 0.975607476399118
DFE Rewards Mean: 0.975607476399118
DFE Rewards standard dev 0.03368855785507707
Returns Mean: 5.6759205
Returns standard dev 3.8495462
Advantages Mean: 0.96070737
Advantages standard dev 3.8495462
Fidelity History: []
SparsePauliOp(['II', 'IZ', 'XI', 'XZ'],
              coeffs=[ 0.25+0.j, -0.25+0.j,  0.25+0.j, -0.25+0.j])


 85%|████████▌ | 57/67 [00:09<00:01,  6.54it/s]

mean tensor([ 0.0303,  0.1277,  0.0268,  0.0128,  0.0321,  0.0643, -0.0943])
Average return: 0.9734852798236404
DFE Rewards Mean: 0.9734852798236404
DFE Rewards standard dev 0.02600732056261439
Returns Mean: 5.3287716
Returns standard dev 3.8104749
Advantages Mean: 0.5466284
Advantages standard dev 3.810475
Fidelity History: []
SparsePauliOp(['II', 'IZ', 'ZI', 'ZZ'],
              coeffs=[ 0.25+0.j, -0.25+0.j, -0.25+0.j,  0.25+0.j])


 87%|████████▋ | 58/67 [00:09<00:01,  6.58it/s]

mean tensor([ 0.0261,  0.1431, -0.0147, -0.0112,  0.0493,  0.0516, -0.0847])
Average return: 0.9610550265916158
DFE Rewards Mean: 0.9610550265916158
DFE Rewards standard dev 0.03854370657479181
Returns Mean: 4.462771
Returns standard dev 3.0902038
Advantages Mean: -0.46956384
Advantages standard dev 3.0902035
Fidelity History: []
SparsePauliOp(['II', 'XY', 'YX', 'ZZ'],
              coeffs=[ 0.25+0.j,  0.25+0.j, -0.25+0.j, -0.25+0.j])


 88%|████████▊ | 59/67 [00:09<00:01,  6.59it/s]

mean tensor([ 0.0228,  0.1809, -0.0020, -0.0222,  0.0186,  0.0634, -0.0711])
Average return: 0.880691077384737
DFE Rewards Mean: 0.880691077384737
DFE Rewards standard dev 0.07900731416076825
Returns Mean: 2.5747485
Returns standard dev 1.7426993
Advantages Mean: -2.4190195
Advantages standard dev 1.7426994
Fidelity History: []
SparsePauliOp(['II', 'IX', 'XI', 'XX'],
              coeffs=[0.25+0.j, 0.25+0.j, 0.25+0.j, 0.25+0.j])


 90%|████████▉ | 60/67 [00:09<00:01,  6.56it/s]

mean tensor([ 0.0325,  0.1604, -0.0292, -0.0407,  0.0111,  0.0577, -0.0488])
Average return: 0.9472654283653918
DFE Rewards Mean: 0.9472654283653918
DFE Rewards standard dev 0.054698542720350685
Returns Mean: 4.337758
Returns standard dev 3.323619
Advantages Mean: -0.21431923
Advantages standard dev 3.323619
Fidelity History: []
SparsePauliOp(['II', 'IZ', 'ZI', 'ZZ'],
              coeffs=[0.25+0.j, 0.25+0.j, 0.25+0.j, 0.25+0.j])


 91%|█████████ | 61/67 [00:09<00:00,  6.60it/s]

mean tensor([ 0.0170,  0.1224, -0.0545, -0.0453,  0.0040,  0.0312, -0.0589])
Average return: 0.9535456779489421
DFE Rewards Mean: 0.9535456779489421
DFE Rewards standard dev 0.044185548536247725
Returns Mean: 4.5042276
Returns standard dev 3.4539268
Advantages Mean: 0.4885023
Advantages standard dev 3.4539266
Fidelity History: []
SparsePauliOp(['II', 'IZ', 'XI', 'XZ'],
              coeffs=[0.25+0.j, 0.25+0.j, 0.25+0.j, 0.25+0.j])


 93%|█████████▎| 62/67 [00:09<00:00,  6.62it/s]

mean tensor([ 0.0070,  0.1339, -0.0293, -0.0100,  0.0121,  0.0337, -0.0517])
Average return: 0.9830354924851415
DFE Rewards Mean: 0.9830354924851415
DFE Rewards standard dev 0.021637822144674638
Returns Mean: 5.8397827
Returns standard dev 3.556098
Advantages Mean: 1.4707433
Advantages standard dev 3.556098
Fidelity History: []
SparsePauliOp(['II', 'IX', 'XI', 'XX'],
              coeffs=[0.25+0.j, 0.25+0.j, 0.25+0.j, 0.25+0.j])


 94%|█████████▍| 63/67 [00:09<00:00,  6.35it/s]

mean tensor([ 0.0082,  0.1489, -0.0460, -0.0194, -0.0107,  0.0238, -0.0559])
Average return: 0.976409519830648
DFE Rewards Mean: 0.976409519830648
DFE Rewards standard dev 0.033497593150147005
Returns Mean: 5.5995097
Returns standard dev 3.6163237
Advantages Mean: 0.6678623
Advantages standard dev 3.6163242
Fidelity History: []
SparsePauliOp(['II', 'XX', 'YY', 'ZZ'],
              coeffs=[ 0.25+0.j,  0.25+0.j, -0.25+0.j,  0.25+0.j])


 96%|█████████▌| 64/67 [00:10<00:00,  6.45it/s]

mean tensor([-0.0293,  0.1370, -0.0378,  0.0024,  0.0143,  0.0326, -0.0225])
Average return: 0.9496948215866089
DFE Rewards Mean: 0.9496948215866089
DFE Rewards standard dev 0.04962113634212455
Returns Mean: 4.1823688
Returns standard dev 3.0738745
Advantages Mean: -0.9704109
Advantages standard dev 3.0738745
Fidelity History: []
SparsePauliOp(['II', 'IZ', 'ZI', 'ZZ'],
              coeffs=[ 0.25+0.j, -0.25+0.j, -0.25+0.j,  0.25+0.j])


 97%|█████████▋| 65/67 [00:10<00:00,  6.52it/s]

mean tensor([-0.0185,  0.0906, -0.0867, -0.0057,  0.0101,  0.0391, -0.0086])
Average return: 0.9793269724417883
DFE Rewards Mean: 0.9793269724417883
DFE Rewards standard dev 0.02828989468555505
Returns Mean: 6.1164618
Returns standard dev 4.1362576
Advantages Mean: 1.3727612
Advantages standard dev 4.1362576
Fidelity History: []
SparsePauliOp(['II', 'XY', 'YX', 'ZZ'],
              coeffs=[ 0.25+0.j,  0.25+0.j, -0.25+0.j, -0.25+0.j])


 99%|█████████▊| 66/67 [00:10<00:00,  6.56it/s]

mean tensor([-2.1119e-05,  1.0753e-01, -6.5102e-02, -1.1849e-02, -7.4037e-03,
         3.5504e-02, -1.6783e-02])
Average return: 0.976706157203956
DFE Rewards Mean: 0.976706157203956
DFE Rewards standard dev 0.027133439914832358
Returns Mean: 5.786959
Returns standard dev 4.010071
Advantages Mean: 0.45375866
Advantages standard dev 4.0100713
Fidelity History: []
SparsePauliOp(['II', 'IZ', 'XI', 'XZ'],
              coeffs=[0.25+0.j, 0.25+0.j, 0.25+0.j, 0.25+0.j])


100%|██████████| 67/67 [00:10<00:00,  6.33it/s]
[I 2023-12-22 13:30:35,071] Trial 0 finished with value: 0.9830354924851415 and parameters: {'N_UPDATES': 67, 'N_EPOCHS': 17, 'MINIBATCH_SIZE': 24, 'BATCHSIZE_MULTIPLIER': 9, 'LR': 0.0025641401686684593, 'GAMMA': 0.9826685030392163, 'GAE_LAMBDA': 0.9877962305036907, 'ENT_COEF': 0.00031484705636878425, 'V_COEF': 0.7135297842555248, 'GRADIENT_CLIP': 0.8557258195515357, 'CLIP_VALUE_COEF': 0.279946989279446, 'CLIP_RATIO': 0.17637343112413612}. Best is trial 0 with value: 0.9830354924851415.


mean tensor([-0.0016,  0.0736, -0.0735,  0.0160, -0.0100,  0.0486, -0.0016])
Average return: 0.9611420185251704
DFE Rewards Mean: 0.9611420185251704
DFE Rewards standard dev 0.04745229069815516
Returns Mean: 4.691361
Returns standard dev 3.1894784
Advantages Mean: -0.44823703
Advantages standard dev 3.1894786
Fidelity History: []
2023-12-22 13:30:35 INFO ---------------- FINISHED HPO ----------------
2023-12-22 13:30:35 INFO HPO completed in 10.65 seconds.
2023-12-22 13:30:35 INFO Best trial:
2023-12-22 13:30:35 INFO -------------------------
2023-12-22 13:30:35 INFO   Value: 0.9830354924851415
2023-12-22 13:30:35 INFO   Parameters: 
2023-12-22 13:30:35 INFO     N_UPDATES: 67
2023-12-22 13:30:35 INFO     N_EPOCHS: 17
2023-12-22 13:30:35 INFO     MINIBATCH_SIZE: 24
2023-12-22 13:30:35 INFO     BATCHSIZE_MULTIPLIER: 9
2023-12-22 13:30:35 INFO     LR: 0.0025641401686684593
2023-12-22 13:30:35 INFO     GAMMA: 0.9826685030392163
2023-12-22 13:30:35 INFO     GAE_LAMBDA: 0.9877962305036907
20

In [None]:
optimizer.q_env

In [None]:
def objective(trial):
    # Fetch hyperparameters from the trial object

    agent_config = {
        'N_UPDATES': trial.suggest_int('N_UPDATES', hpo_config['n_updates'][0], hpo_config['n_updates'][1]),
        'N_EPOCHS': trial.suggest_int('N_EPOCHS', hpo_config['n_epochs'][0], hpo_config['n_epochs'][1]),
        'MINIBATCH_SIZE': trial.suggest_categorical('MINIBATCH_SIZE', hpo_config['minibatch_size']),
        'BATCHSIZE_MULTIPLIER': trial.suggest_int('BATCHSIZE_MULTIPLIER', hpo_config['batchsize_multiplier'][0], hpo_config['batchsize_multiplier'][1]),
        'LR': trial.suggest_float('LR', hpo_config['learning_rate'][0], hpo_config['learning_rate'][1], log=True),
        'GAMMA': trial.suggest_float('GAMMA', hpo_config['gamma'][0], hpo_config['gamma'][1]),
        'GAE_LAMBDA': trial.suggest_float('GAE_LAMBDA', hpo_config['gae_lambda'][0], hpo_config['gae_lambda'][1]),
        'ENT_COEF': trial.suggest_float('ENT_COEF', hpo_config['ent_coef'][0], hpo_config['ent_coef'][1]),
        'V_COEF': trial.suggest_float('V_COEF', hpo_config['v_coef'][0], hpo_config['v_coef'][1]),
        'GRADIENT_CLIP': trial.suggest_float('GRADIENT_CLIP', hpo_config['max_grad_norm'][0], hpo_config['max_grad_norm'][1]),
        'CLIP_VALUE_COEF': trial.suggest_float('CLIP_VALUE_COEF', hpo_config['clip_value_coef'][0], hpo_config['clip_value_coef'][1]),
        'CLIP_RATIO': trial.suggest_float('CLIP_RATIO', hpo_config['clip_ratio'][0], hpo_config['clip_ratio'][1]),
    }

    # Allowing for a range of possible batchsize / minibatch_size combinations by ensuring that batchsize is a multiple of minibatch_size
    agent_config['BATCHSIZE'] = agent_config['MINIBATCH_SIZE'] * agent_config['BATCHSIZE_MULTIPLIER']
    
    agent_config['CLIP_VALUE_LOSS'] = hpo_config['clip_value_loss']

    # Add network-specific hyperparameters that are not part of HPO scope
    agent_config['OPTIMIZER'] = network_config['optimizer']
    agent_config['N_UNITS'] = network_config['n_units']
    agent_config['ACTIVATION'] = network_config['activation']
    agent_config['INCLUDE_CRITIC'] = network_config['include_critic']
    agent_config['NORMALIZE_ADVANTAGE'] = network_config['normalize_advantage']

    agent_config['RUN_NAME'] = ppo_params['run_name']


    q_env = QuantumEnvironment(gate_q_env_config)  # Initialize your environment
    q_env.batch_size = agent_config['BATCHSIZE'] # Overwrite the batch_size of the environment with the one from the agent_config

    train_fn = make_train_ppo(agent_config, q_env)
    training_results = train_fn(total_updates=agent_config['N_UPDATES'], print_debug=True, num_prints=50)

    # Save the action vector associated with this trial's fidelity for future retrieval
    trial.set_user_attr('action_vector', training_results['action_vector'])

    # Use a relevant metric from training_results as the return value
    last_ten_percent = int(0.1 * agent_config['N_UPDATES'])
    
    return training_results['avg_return'][-last_ten_percent]  # Return a metric to minimize or maximize

In [None]:
import optuna

num_HPO_trials = hpo_config['num_trials']
print('num_HPO_trials: ', num_HPO_trials)

def optimize_hyperparameters():
    study = optuna.create_study(direction="maximize")  # or 'minimize' depending on your metric
    study.optimize(objective, n_trials=num_HPO_trials)  # Number of hyperparameter configurations to try

    print("Best trial:")
    best_trial = study.best_trial

    print("  Value: ", best_trial.value)
    print("  Parameters: ")
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}")

    best_action_vector = best_trial.user_attrs['action_vector']
    print(f'The best action vector is: {best_action_vector}')

optimize_hyperparameters()