# Soft Actor-Critic (SAC)

# Model training with quadratic, exponential, and other reward functions on GyroscopeEnv-v1

In [1]:
# trained in reward_training.py

In [2]:
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
from os import path
from scipy.integrate import solve_ivp

import gym_gyroscope_env
import spinup
import torch
from functools import partial

from custom_functions.custom_functions import env_fn 
from custom_functions.custom_functions import create_env
from custom_functions.custom_functions import load_agent
from custom_functions.custom_functions import test_agent
from custom_functions.custom_functions import plot_test
from custom_functions.custom_functions import evaluate_control

In /home/xiongyan/anaconda3/envs/spinningup/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/xiongyan/anaconda3/envs/spinningup/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/xiongyan/anaconda3/envs/spinningup/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /home/xiongyan/anaconda3/envs/spinningup/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor rele

#### PE reward

In [3]:
# --- Environment factory for the 'PE' reward experiment ---------------------
env_name = 'GyroscopeEnv-v1'
simu_args = dict(dt=0.05, ep_len=100, seed=2)
reward_func = 'PE'
reward_args = dict(
    qx1=1, qx2=0.2, qx3=1, qx4=0.2,
    pu1=0.1, pu2=0.1,
    p=0.1, e=40,
)
# Pre-bind the environment name and the simulation/reward settings so the
# resulting callable can be handed to the trainer as-is.
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# --- Baseline 0 training -----------------------------------------------------
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=2000,
    replay_size=1000000,
    gamma=0.95,
    polyak=0.995,
    lr=1.25e-3,
    # Entropy regularization coefficient. (Equivalent to inverse of reward
    # scale in the original SAC paper.)
    alpha=0.05,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    # Same value as the 'ep_len' entry given to the environment (100).
    max_ep_len=simu_args['ep_len'],
    # Output directory and experiment name share the same run identifier.
    logger_kwargs={'output_dir': 'sac_pe_opt003', 'exp_name': 'sac_pe_opt003'},
)

[32;1mLogging data to sac_pe_opt003/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            128,
            32
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.05,
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7f7fe3b7e840>, 'GyroscopeEnv-v1', simu_args={'dt': 0.05, 'ep_len': 100, 'seed': 2}, reward_func='PE', reward_args={'qx1': 1, 'qx2': 0.2, 'qx3': 1, 'qx4': 0.2, 'pu1': 0.1, 'pu2': 0.1, 'p': 0.1, 'e': 40})",
    "epochs":	200,
    "exp_name":	"sac_pe_opt003",
    "gamma":	0.95,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f7fe3b8f198>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_pe_opt003",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"sac_pe_opt003",
            "output_file":	{
                "<_io.TextIOWrapper name='sac_pe_opt003/

---------------------------------------
|             Epoch |               7 |
|      AverageEpRet |            -486 |
|          StdEpRet |            6.93 |
|          MaxEpRet |            -475 |
|          MinEpRet |            -495 |
|  AverageTestEpRet |            -481 |
|      StdTestEpRet |            7.71 |
|      MaxTestEpRet |            -472 |
|      MinTestEpRet |            -495 |
|             EpLen |             100 |
|         TestEpLen |             100 |
| TotalEnvInteracts |        1.05e+04 |
|     AverageQ1Vals |           -86.8 |
|         StdQ1Vals |            1.65 |
|         MaxQ1Vals |           -81.5 |
|         MinQ1Vals |           -93.6 |
|     AverageQ2Vals |           -86.8 |
|         StdQ2Vals |            1.65 |
|         MaxQ2Vals |           -81.7 |
|         MinQ2Vals |           -93.8 |
|      AverageLogPi |          -0.789 |
|          StdLogPi |           0.831 |
|          MaxLogPi |            3.32 |
|          MinLogPi |             -10 |


---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |            -462 |
|          StdEpRet |            18.2 |
|          MaxEpRet |            -422 |
|          MinEpRet |            -495 |
|  AverageTestEpRet |            -429 |
|      StdTestEpRet |            30.4 |
|      MaxTestEpRet |            -345 |
|      MinTestEpRet |            -462 |
|             EpLen |             100 |
|         TestEpLen |             100 |
| TotalEnvInteracts |        2.25e+04 |
|     AverageQ1Vals |           -96.8 |
|         StdQ1Vals |            1.58 |
|         MaxQ1Vals |           -91.9 |
|         MinQ1Vals |            -103 |
|     AverageQ2Vals |           -96.8 |
|         StdQ2Vals |            1.56 |
|         MaxQ2Vals |           -91.8 |
|         MinQ2Vals |            -103 |
|      AverageLogPi |          -0.494 |
|          StdLogPi |             1.1 |
|          MaxLogPi |            5.69 |
|          MinLogPi |           -7.68 |


KeyboardInterrupt: 