# Car Racing

In [None]:
# # General
import platform

# Fail fast unless the interpreter is exactly Python 3.10.14 (presumably the
# release this notebook was developed and run against).
assert platform.python_version_tuple() == ("3", "10", "14")

# One-time dependency installation; uncomment and run in a fresh environment.
# !pip install 'gymnasium[box2d]'
# !pip install 'syne-tune[basic]'
# !pip install 'stable-baselines3[extra]'

In [None]:
import gymnasium as gym
import matplotlib.pyplot as plt
from syne_tune import Tuner
from syne_tune.backend import PythonBackend
from syne_tune.experiments import load_experiment
from syne_tune.config_space import loguniform, uniform, choice
from syne_tune.optimizer.baselines import ASHA
from syne_tune.stopping_criterion import StoppingCriterion

In [None]:
# Hyperparameter search space handed to the scheduler and the trial backend.
# The first three entries are sampled per trial and forwarded to TD3 by
# `train_hpo_model`; `steps` is a fixed value (not searched) that sets the
# training length and is referenced by the scheduler as `max_resource_attr`.
config_space = dict(
    learning_rate=loguniform(1e-8, 0.1),  # log-uniform over [1e-8, 0.1]
    tau=loguniform(1e-8, 1),  # log-uniform over [1e-8, 1]
    gamma=uniform(0.9, 0.999),  # uniform over [0.9, 0.999]
    steps=1000000,  # TODO increase
)

In [None]:
# Define the tuning function
def train_hpo_model(learning_rate: float, tau: float, gamma: float, steps: int):
    """Train a TD3 agent on CarRacing-v2 and report progress to Syne Tune.

    This is the tune function executed for every trial. It builds a single
    (non-vectorized) CarRacing environment with domain randomization, trains
    a TD3 agent with a CNN policy on the GPU, and reports the reward after
    every environment step so the scheduler can compare and stop trials.

    Args:
        learning_rate: Learning rate passed to TD3.
        tau: Value passed to TD3 as ``tau``.
        gamma: Discount factor passed to TD3.
        steps: Total number of environment timesteps to train for.

    Raises:
        AssertionError: If no CUDA device is available.
    """
    # Worker imports: kept inside the function so the function is
    # self-contained when it is handed to the PythonBackend as tune function.
    import gymnasium as gym
    import numpy as np
    from stable_baselines3.common.callbacks import BaseCallback
    from stable_baselines3.common.noise import NormalActionNoise
    from stable_baselines3 import TD3

    from syne_tune import Reporter
    import torch

    # Training is pinned to device="cuda" below, so fail early without a GPU.
    assert torch.cuda.is_available()

    # Create the (single) environment.
    # TODO why didn't you use vec env? no multi processing
    # (e.g. stable_baselines3.common.env_util.make_vec_env(env_id, n_envs=4))
    env_id = "CarRacing-v2"
    env = gym.make(env_id, domain_randomize=True)

    try:
        # Gaussian exploration noise with one component per action dimension.
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(
            mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)
        )

        # Initialize the TD3 agent with the given hyperparameters
        model = TD3(
            "CnnPolicy",
            env,
            action_noise=action_noise,
            learning_rate=learning_rate,
            tau=tau,
            gamma=gamma,
            batch_size=64,  # Tested 32, 64 and finally 256 which failed after a few hours
            verbose=1,
            device="cuda",
        )

        report = Reporter()

        class WorkerCallback(BaseCallback):
            """Forward the reward of every training step to Syne Tune."""

            def _on_step(self) -> bool:
                # Mean over the rewards of the latest step (one entry per env).
                rewards = self.locals["rewards"]
                # Cast to a built-in float so the reported value is a plain
                # Python number rather than a NumPy scalar.
                mean_reward = float(sum(rewards) / len(rewards))
                # Report the cumulative timestep as the resource. The local
                # `num_collected_steps` only counts steps of the current
                # rollout in SB3's collection loop and restarts each rollout,
                # so it never grows towards `steps` as the scheduler's
                # `resource_attr` requires.
                # NOTE(review): this reports once per environment step; consider
                # reporting less often if reporting overhead becomes an issue.
                report(step=self.num_timesteps, mean_reward=mean_reward)
                return True  # returning False would abort training

        # Train the agent
        worker_callback = WorkerCallback()
        model.learn(total_timesteps=steps, callback=worker_callback, log_interval=5)
    finally:
        # Release the environment even if model construction or training fails.
        env.close()

In [None]:
# Name of the metric reported by `train_hpo_model`; it is maximized.
metric = "mean_reward"

# Wall-clock budget for the whole tuning run: 61200 s = 17 hours.
# NOTE(review): the earlier comment here said "2 hours" / "1 day", which does
# not match this value. History: a first run used 4 days and got stuck (see
# below), leaving too little time, so the budget was reduced.
stop_criterion = StoppingCriterion(max_wallclock_time=17 * 60 * 60)

# Runs `train_hpo_model` as the tune function for each trial.
trial_backend = PythonBackend(
    tune_function=train_hpo_model,
    config_space=config_space,
    rotate_gpus=True,
)

# ASHA scheduler: `step` is the resource reported by each trial and
# `steps` (from the config space) is the maximum resource per trial.
scheduler = ASHA(
    config_space,
    mode="max",
    metric=metric,
    resource_attr="step",
    max_resource_attr="steps",
)

tuner = Tuner(
    trial_backend=trial_backend,
    scheduler=scheduler,
    stop_criterion=stop_criterion,
    n_workers=8,
    save_tuner=False,
    # Do not wait for running trials when stopping: there was a problem with
    # termination, where the tuner waits until all running jobs are finished,
    # which may take a long time with 1M steps.
    wait_trial_completion_when_stopping=False,
)

In [None]:
# Start hyperparameter tuning
# Launches the experiment with the `tuner` object built in the previous cell.
tuner.run()

In [None]:
# Get results
# Path exposed by the tuner object; used below to load the experiment.
tuner_path = tuner.tuner_path
# Load the recorded experiment from that path.
tuning_experiment = load_experiment(tuner_path)
# Write the results (presumably a pandas DataFrame — TODO confirm) to a CSV
# file in the current working directory.
tuning_experiment.results.to_csv("tuning_results.csv")
# Last expression of the cell, so the notebook displays the best configuration.
tuning_experiment.best_config()
