# Soft Actor-Critic (SAC)

# Hyperparameter optimization for the reward function using Ray Tune

In [None]:
# trained in reward_training.py

In [None]:
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
from os import path
from scipy.integrate import solve_ivp

import gym_gyroscope_env
import spinup
import torch
from functools import partial
import argparse

from custom_functions.custom_functions import env_fn
from custom_functions.custom_functions import create_env
from custom_functions.custom_functions import load_agent
from custom_functions.custom_functions import test_agent
from custom_functions.custom_functions import plot_test
from custom_functions.custom_functions import evaluate_control

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.suggest.bayesopt import BayesOptSearch
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.schedulers import AsyncHyperBandScheduler

In [None]:
def train_reward(config, *, times_test=10, steps=1):
    """Train a SAC agent on the gyroscope env and report its test score to Ray Tune.

    For each of ``steps`` rounds this trains an agent with
    ``spinup.sac_pytorch`` using the hyperparameters in ``config``, reloads
    the saved agent from the ``sac_reward_opt`` output directory, evaluates
    it with ``test_agent`` from a fixed initial state, plots the rollout and
    reports the resulting score under the metric name ``score``.

    Args:
        config: Mapping providing the keys ``gamma``, ``polyak``, ``lr``,
            ``alpha`` and ``batch_size``; each is forwarded to
            ``spinup.sac_pytorch``.
        times_test: Value handed to ``test_agent`` and ``plot_test`` as
            ``t_end``.
        steps: Number of train / evaluate / report rounds to run.

    ``times_test`` and ``steps`` are keyword-only so the function keeps a
    single positional parameter (``config``), which is how Ray Tune calls a
    function trainable.
    """
    # Env function
    env_name = 'GyroscopeEnv-v1'   # GyroscopeRealEnv-v0, GyroscopeEnv-v1, GyroscopeIntegralEnv-v1

    # Directory the trained agent is saved to and later reloaded from.
    output_dir = 'sac_reward_opt'

    for _ in range(steps):
        simu_args = {
            'dt': 0.05,
            'ep_len': 100,
            'seed': 2
        }

        reward_func = 'Normalized with bonus'
        reward_args = {
            'k': 1,
            'qx2': 1,
            'qx4': 1,
            'pu1': 0,
            'pu2': 0,
            'bound': 0.05,
            'bonus': 2
        }

        env_fn_ = partial(env_fn, env_name, simu_args = simu_args, reward_func = reward_func, reward_args = reward_args)
        print(env_fn_)

        # Baseline 0 training
        spinup.sac_pytorch(env_fn_,
                           ac_kwargs= dict(hidden_sizes=[128,32], activation=torch.nn.ReLU),
                           seed=0,
                           steps_per_epoch=1500,
                           epochs=500,
                           replay_size=1000000,
                           gamma=config["gamma"],
                           polyak=config["polyak"],
                           lr=config["lr"],
                           alpha=config["alpha"],   # Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.)
                           batch_size=config["batch_size"],
                           start_steps=10000,
                           update_after=1000,
                           update_every=50,
                           num_test_episodes=10,
                           # Keep the training episode cap equal to the simulated episode length.
                           max_ep_len=simu_args['ep_len'],
                           logger_kwargs=dict(output_dir=output_dir, exp_name='sac_reward_opt')
                           )

        # Test parameters
        # NOTE(review): the last three entries look like 45 deg and -60 deg in
        # radians and 200 rpm in rad/s -- confirm against the env's state layout.
        init_state = np.array([0,0,0,0,45/180*np.pi,-60/180*np.pi,200/60*2*np.pi])
        env = create_env(env_name,state=init_state)
        print(env)

        agent = load_agent(output_dir)
        t_end = times_test   # number of test steps

        score, state_record, _obs_record, action_record, _reward_record = test_agent(env,agent,t_end)
        plot_test(state_record, action_record, t_end, 4)   # show every test run for a clearer picture

        # Report by keyword: the scheduler and the CLI reporter look the metric
        # up by the name "score"; a positional value is not stored under it.
        tune.report(score=score)

    print("Finished Training")

In [None]:
import argparse

# Search budget and per-trial resources for the Ray Tune run.
num_samples = 10       # number of hyperparameter configurations to sample
max_num_epochs = 10    # upper bound (max_t) handed to the ASHA scheduler
gpus_per_trial = 0

# Search space: one entry per SAC hyperparameter that train_reward reads from its config.
config = {
    "gamma": tune.loguniform(1e-4, 1),
    "polyak": tune.loguniform(1e-4, 1),
    "lr": tune.loguniform(1e-4, 1e-1),
    "alpha": tune.loguniform(1e-4, 100),
    # NOTE(review): 516 is probably a typo for 512 -- confirm before changing the search space.
    "batch_size": tune.choice([64, 128, 516, 1024]),
}

# The metric and mode are set on the scheduler, so they are not repeated in tune.run().
scheduler = ASHAScheduler(
    metric="score",
    mode="max",
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2)

reporter = CLIReporter(
    parameter_columns=["gamma", "polyak", "lr", "alpha", "batch_size"],
    metric_columns=["score"]
)

# train_reward is passed directly; wrapping it in partial() without bound
# arguments added nothing.
analysis = tune.run(
    train_reward,
    resources_per_trial={"cpu": 8, "gpu": gpus_per_trial},
    config=config,
    num_samples=num_samples,
    scheduler=scheduler,
    progress_reporter=reporter
)

# tune.run() was not given a default metric/mode, so the `best_config`
# property cannot be used here; name the metric and mode explicitly.
print("Best hyperparameters found were: ",
      analysis.get_best_config(metric="score", mode="max"))


In [None]:
# Single manual run with fixed hyperparameters: train_reward is called
# directly instead of through tune.run().
config = dict(
    gamma=0.5,
    polyak=0.1,
    lr=0.05,
    alpha=0.5,
    batch_size=1024,
)

# times_test and steps are passed by keyword, so train_reward must accept them.
train_reward(config, times_test=10, steps=1)
