# Car Racing

In [1]:
# # General
import platform
assert platform.python_version() == "3.10.14"
# !pip install 'gymnasium[box2d]'
# !pip install 'syne-tune[basic]'
# !pip install 'stable-baselines3[extra]'

In [2]:
import gymnasium as gym
import matplotlib.pyplot as plt
from syne_tune import Tuner
from syne_tune.backend import PythonBackend
from syne_tune.experiments import load_experiment
from syne_tune.config_space import loguniform, uniform, choice
from syne_tune.optimizer.baselines import ASHA
from syne_tune.stopping_criterion import StoppingCriterion

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml


sagemaker.config INFO - Not applying SDK defaults from location: /home/adavidho/.config/sagemaker/config.yaml


In [3]:
# Define the hyperparameter search space
config_space = {
    "learning_rate": loguniform(1e-8, 0.1),
    "tau":  loguniform(1e-8, 1),
    "gamma": uniform(0.9, 0.999),    
    "steps": 100000 # TODO increase
}

In [4]:
# Define the tuning function
def train_hpo_model(learning_rate: float, tau: float, gamma: float, steps: int):
    # Worker imports
    import numpy as np
    from stable_baselines3.common.env_util import make_vec_env
    from stable_baselines3.common.evaluation import evaluate_policy
    from stable_baselines3.common.callbacks import BaseCallback
    from stable_baselines3.common.noise import NormalActionNoise
    from stable_baselines3 import TD3
    
    from syne_tune import Reporter

    # Create the vectorized environment
    env_id = "CarRacing-v2"
    vec_env = make_vec_env(env_id, n_envs=4)
    
    # Initialize the PPO agent with the given hyperparameters
    n_actions = vec_env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))  
    model = TD3("CnnPolicy", vec_env,  
                action_noise=action_noise,
                learning_rate=learning_rate,
                tau=tau,
                gamma=gamma,
                batch_size=32,
                verbose=1)

    report = Reporter()
    class WorkerCallback(BaseCallback):
        def _on_step(self) -> bool:
            # Log the mean reward
            mean_reward = sum(self.locals["rewards"]) / len(self.locals["rewards"])
            step = self.locals["num_collected_steps"]
            report(step=step, mean_reward=mean_reward)
            return True 
    
    # Train the agent
    worker_callback = WorkerCallback()
    model.learn(total_timesteps=steps, callback=worker_callback)

In [5]:
metric = "mean_reward"
scheduler = ASHA(
    config_space,
    metric=metric,
    max_resource_attr="steps",
    resource_attr="step",
    mode="max",
)
trial_backend = PythonBackend(
    tune_function=train_hpo_model, config_space=config_space
)
stop_criterion = StoppingCriterion(
    max_wallclock_time=1800, 
)
tuner = Tuner(
    trial_backend=trial_backend,
    scheduler=scheduler,
    stop_criterion=stop_criterion,
    n_workers=8,
    save_tuner=False,
    wait_trial_completion_when_stopping=True,
)

In [6]:
# Start hyperparameter tuning
tuner.run()

--------------------
Resource summary (last result is reported):
 trial_id    status  iter  learning_rate          tau    gamma  steps  step  mean_reward  worker-time
        0 Completed 25000   3.162278e-05 1.000000e-04 0.949500 100000     1    -0.100000  1534.029914
        1 Completed 25000   2.608652e-02 4.459980e-06 0.994957 100000     1    -0.100000  1483.139701
        2   Stopped    26   9.665928e-02 4.096052e-02 0.985687 100000     1     0.744595     1.622826
        3   Stopped    26   2.024949e-07 1.190156e-06 0.974795 100000     1    -0.100000     1.625575
        4 Completed 25000   9.361879e-07 2.460509e-03 0.950723 100000     1    -0.100000  1616.216955
        5 Completed 25000   3.572324e-07 4.242431e-04 0.929951 100000     1    -0.100000  1535.470439
        6   Stopped    26   2.245840e-02 4.473014e-08 0.937820 100000     1    -0.100000     1.642087
        7   Stopped    26   1.140124e-04 1.346816e-07 0.983634 100000     1    -0.100000     1.640366
        8 Complet

In [7]:
# Get results
tuner_path = tuner.tuner_path
tuning_experiment = load_experiment(tuner_path)
tuning_experiment.results.to_csv("tuning_results.csv")
tuning_experiment.best_config()


{'step': 1,
 'mean_reward': 7.686551213264465,
 'trial_id': 30,
 'config_learning_rate': 0.0006870666329035,
 'config_tau': 1.1701360615400289e-08,
 'config_gamma': 0.967562143466215,
 'config_steps': 100000}