In [2]:
import numpy as np
import pandas as pd
from plotnine import (
    ggplot, aes, geom_density, geom_line, geom_point, 
    geom_violin, facet_grid, labs, theme, facet_wrap,
)
from rl4greencrab import greenCrabEnv, greenCrabSimplifiedEnv
from rl4greencrab import simulator, constAction, multiConstAction, constActionNatUnits, evaluate_agent

In [3]:
def evaluateConstAct(x):
    """Score a constant-action policy on ``greenCrabEnv`` and return the negated score.

    ``x`` is converted with ``np.array`` and used as the fixed action of a
    ``multiConstAction`` agent. The agent is scored through
    ``evaluate_agent(...).evaluate`` over 200 evaluation episodes with
    ``ray_remote=True``. The score is printed, and its negation is returned so
    that the function can be handed directly to a minimizer.

    Parameters
    ----------
    x : sequence of numbers
        Action passed to ``multiConstAction`` (the notebook calls this with
        three components).

    Returns
    -------
    The negated value returned by ``evaluate_agent(...).evaluate``.
    """
    # Common factor of the three trap ``*_pmax`` settings below.
    pmax_unit = 2.75e-5

    env = greenCrabEnv(
        config={
            'action_reward_scale': np.array([0.08, 0.08, 0.4]),
            'max_action': 3000,
            # 'env_stoch': 0.,
            'trapm_pmax': 10 * 0.1 * pmax_unit,  # 2.26e-6,
            'trapf_pmax': 10 * 0.03 * pmax_unit,  # 8.3e-7,
            'traps_pmax': 10 * pmax_unit,  # 2.75e-5,
            'loss_a': 0.2,
            'loss_b': 5,
            'loss_c': 5,
            'action_reward_exponent': 10,
        }
    )
    const_agent = multiConstAction(env=env, action=np.array(x))

    evaluator = evaluate_agent(agent=const_agent, ray_remote=True)
    score = evaluator.evaluate(n_eval_episodes=200)
    print(score)

    # Sign flip: the optimizer minimizes, while a larger score is better.
    return -score


In [4]:
from skopt import gp_minimize, gbrt_minimize 
from skopt.plots import plot_convergence, plot_objective

In [27]:
%%time
max_action = 3000
res = gp_minimize(evaluateConstAct, 3*[(0, max_action)], n_calls = 100, verbose=True)
res.x

Iteration No: 1 started. Evaluating function at random point.
-6.066474588086137e-05
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.6374
Function value obtained: 0.0001
Current minimum: 0.0001
Iteration No: 2 started. Evaluating function at random point.
-1.0205633300538646
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.4241
Function value obtained: 1.0206
Current minimum: 0.0001
Iteration No: 3 started. Evaluating function at random point.
-0.32111677376319536
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.3750
Function value obtained: 0.3211
Current minimum: 0.0001
Iteration No: 4 started. Evaluating function at random point.
-1.67474975740744
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.3850
Function value obtained: 1.6747
Current minimum: 0.0001
Iteration No: 5 started. Evaluating function at random point.
-0.06658162086041877
Iteration No: 5 ended. Evaluation done at random point.
Time ta

[477, 516, 644]

In [5]:
# Re-score the action vector that the optimizer run reported as ``res.x``.
evaluateConstAct([477, 516, 644])

2024-09-09 17:09:00,988	INFO worker.py:1781 -- Started a local Ray instance.


-2.849447960823816e-05


2.849447960823816e-05

[33m(raylet)[0m [2024-09-09 17:48:01,022 E 1000 1000] (raylet) node_manager.cc:3064: 2 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: a9efce47756f1d48ccb82e087801be1648bdb15fb6cde57495df9e8c, IP: 10.42.0.70) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 10.42.0.70`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
[33m(raylet)[0m 
[33m(raylet)[0m [2024-09-09 17:50:01,025 E 1000 1000] (raylet) node_manager

In [29]:
# Action vector reported by the optimizer run.
act = np.array([477, 516, 644])
# Affine rescale x / 1500 - 1, which maps 0 -> -1 and 3000 -> +1.
# NOTE(review): presumably 1500 is max_action / 2 — confirm before changing max_action.
act / 1500 - 1


array([-0.682     , -0.656     , -0.57066667])

In [6]:
# Score a second, non-integer action vector for comparison.
# NOTE(review): the origin of these values is not shown in this notebook — presumably another optimizer run.
evaluateConstAct([869.3405896605859, 582.8685243387607, 968.4643115701439])

-0.0005273480706757184


0.0005273480706757184

In [None]:
def sample_env_config(
    trial: "optuna.Trial",
    n_actions: int,
    n_envs: int,
    additional_args: dict,
) -> "Dict[str, Any]":
    """Build an environment config dict whose entries are registered with an Optuna trial.

    Every entry is drawn with ``trial.suggest_categorical(name, choices)``.
    Each parameter currently has exactly one choice, equal to the baseline
    value used by ``evaluateConstAct`` in this notebook, so the returned config
    is deterministic while still being recorded on the trial. Widen a
    parameter's search space by adding more choices for it.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to draw and record each parameter. Only its
        ``suggest_categorical(name, choices)`` method is used.
    n_actions, n_envs, additional_args
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    dict
        Config mapping with the keys ``action_reward_scale`` (a length-3
        ``np.ndarray``), ``max_action``, ``trapm_pmax``, ``trapf_pmax``,
        ``traps_pmax``, ``loss_a``, ``loss_b``, ``loss_c`` and
        ``action_reward_exponent``.

    Notes
    -----
    The annotations are quoted so that defining this function does not require
    ``optuna`` or ``typing`` names to be in scope.
    """

    def pinned(name, value):
        # Single-choice categorical: records the parameter on the trial while fixing its value.
        return trial.suggest_categorical(name, [value])

    # Optuna categorical choices must be scalars (None/bool/int/float/str), so
    # the array-valued setting is drawn one component at a time and reassembled.
    action_reward_scale = np.array([
        pinned('action_reward_scale_0', 0.08),
        pinned('action_reward_scale_1', 0.08),
        pinned('action_reward_scale_2', 0.4),
    ])
    max_action = pinned('max_action', 3000)
    # 'env_stoch': 0.,
    trapm_pmax = pinned('trapm_pmax', 10 * 0.1 * 2.75e-5)
    trapf_pmax = pinned('trapf_pmax', 10 * 0.03 * 2.75e-5)
    traps_pmax = pinned('traps_pmax', 10 * 2.75e-5)

    loss_a = pinned('loss_a', 0.2)
    loss_b = pinned('loss_b', 5)
    loss_c = pinned('loss_c', 5)

    action_reward_exponent = pinned('action_reward_exponent', 10)

    return {
        'action_reward_scale': action_reward_scale,
        'max_action': max_action,
        # 'env_stoch': 0.,
        'trapm_pmax': trapm_pmax,
        'trapf_pmax': trapf_pmax,
        'traps_pmax': traps_pmax,

        'loss_a': loss_a,
        'loss_b': loss_b,
        'loss_c': loss_c,

        'action_reward_exponent': action_reward_exponent,
    }

In [None]:
def env_objective(trial: "optuna.Trial") -> float:
    """Optuna objective: build the environments and train a TQC model.

    The visible body constructs ``greenCrabEnv`` and ``greenCrabSimplifiedEnv``
    from ``config``, creates a ``TQC`` model with fixed hyperparameters on
    ``vec_env``, and trains it for ``N_TIMESTEPS`` steps with ``eval_callback``.
    ``AssertionError`` / ``ValueError`` raised during training are printed and
    recorded in ``nan_encountered`` instead of propagating.

    NOTE(review): ``config``, ``vec_env``, ``TQC``, ``N_TIMESTEPS`` and
    ``eval_callback`` are not defined in the visible part of this notebook;
    they must exist in the notebook namespace before this function is called.
    NOTE(review): nothing is returned yet even though the annotation promises
    a ``float``, and ``trial``, ``env``, ``simplify_env`` and
    ``nan_encountered`` are not used after being set — the function looks
    unfinished.

    The ``trial`` annotation is quoted so that defining this function does not
    require ``optuna`` to be imported.
    """
    env = greenCrabEnv(config=config)
    simplify_env = greenCrabSimplifiedEnv(config=config)

    model = TQC(
        "MlpPolicy",
        vec_env,
        verbose=0,
        gamma=0.9999,
        learning_rate=0.020439420278073966,
        batch_size=16,
        buffer_size=10000,
        learning_starts=0,
        train_freq=16,
        tau=0.05,
        top_quantiles_to_drop_per_net=1,
    )

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except (AssertionError, ValueError) as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True

    