# Car Racing

In [1]:
# Not sure if needed !pip install swig
# !pip install 'gymnasium[box2d]'
# !pip install 'stable-baselines3[extra]'
# !pip install 'syne-tune[basic]'

In [6]:
import gymnasium as gym
import matplotlib.pyplot as plt
from syne_tune import Tuner
from syne_tune.backend import PythonBackend
from syne_tune.experiments import load_experiment
from syne_tune.config_space import loguniform, uniform, choice
from syne_tune.optimizer.baselines import ASHA
from syne_tune.stopping_criterion import StoppingCriterion

In [7]:
# Define the hyperparameter search space
config_space = {
    "learning_rate": loguniform(1e-8, 0.1),
    "tau":  loguniform(1e-8, 1),
    "gamma": uniform(0.9, 0.999),    
    "steps": 50 # TODO increase
}

In [17]:
# Define the tuning function
def train_hpo_model(learning_rate: float, tau: float, gamma: float, steps: int):
    # Worker imports
    import numpy as np
    from stable_baselines3.common.env_util import make_vec_env
    from stable_baselines3.common.evaluation import evaluate_policy
    from stable_baselines3.common.callbacks import BaseCallback
    from stable_baselines3.common.noise import NormalActionNoise
    from stable_baselines3 import TD3
    
    from syne_tune import Reporter

    # Create the vectorized environment
    env_id = "CarRacing-v2"
    vec_env = make_vec_env(env_id, n_envs=4)
    
    # Initialize the PPO agent with the given hyperparameters
    n_actions = vec_env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))  
    model = TD3("CnnPolicy", vec_env,  
                action_noise=action_noise,
                learning_rate=learning_rate,
                tau=tau,
                gamma=gamma,
                batch_size=32,
                verbose=1)

    report = Reporter()
    class WorkerCallback(BaseCallback):
        def _on_step(self) -> bool:
            # Log the mean reward
            mean_reward = sum(self.locals["rewards"]) / len(self.locals["rewards"])
            step = self.locals["num_collected_steps"]
            report(step=step, mean_reward=mean_reward, n_step=step + 1)
            return True 
    
    # Train the agent
    worker_callback = WorkerCallback()
    model.learn(total_timesteps=steps, callback=worker_callback)

In [23]:
metric = "mean_reward"
scheduler = ASHA(
    config_space,
    metric=metric,
    max_resource_attr="steps",
    resource_attr="n_step",
    mode="max",
)
trial_backend = PythonBackend(
    tune_function=train_hpo_model, config_space=config_space
)
stop_criterion = StoppingCriterion(
    max_wallclock_time=30, 
    max_num_trials_completed=1000
)
tuner = Tuner(
    trial_backend=trial_backend,
    scheduler=scheduler,
    stop_criterion=stop_criterion,
    n_workers=1,
    save_tuner=False,
    wait_trial_completion_when_stopping=True,
)

In [24]:
# Start hyperparameter tuning
tuner.run()



--------------------
Resource summary (last result is reported):
 trial_id    status  iter  learning_rate          tau    gamma  steps  step  mean_reward  n_step  worker-time
        0 Completed    13   3.162278e-05 1.000000e-04 0.949500     50     1         -0.1       2     0.354981
        1 Completed    13   2.842344e-03 1.168070e-01 0.992298     50     1         -0.1       2     0.350097
        2 Completed    13   3.567599e-02 2.212191e-03 0.954412     50     1         -0.1       2     0.349499
        3 Completed    13   9.283290e-06 4.610815e-04 0.980848     50     1         -0.1       2     0.362427
        4 Completed    13   1.781584e-08 1.169409e-03 0.923024     50     1         -0.1       2     0.369614
        5 Completed    13   1.568845e-07 3.019649e-07 0.934429     50     1         -0.1       2     0.341538
0 trials running, 6 finished (6 until the end), 30.22s wallclock-time

mean_reward: best -0.10000000149011612 for trial-id 0
--------------------


In [21]:
# Get results
tuner_path = tuner.tuner_path
tuning_experiment = load_experiment(tuner_path)
# tuning_experiment.results
tuning_experiment.best_config()


{'step': 1,
 'mean_reward': -0.1000000014901161,
 'n_step': 2,
 'trial_id': 0,
 'config_learning_rate': 3.162277660168375e-05,
 'config_tau': 9.999999999999992e-05,
 'config_gamma': 0.9495,
 'config_steps': 50}