# Soft Actor-Critic (SAC)

# Model training with quadratic, exponential, and other reward functions on Env-v1

In [4]:
# trained in reward_training.py

In [5]:
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
from os import path
from scipy.integrate import solve_ivp

import gym_gyroscope_env
import spinup
import torch
from functools import partial

from custom_functions.custom_functions import env_fn 
from custom_functions.custom_functions import create_env
from custom_functions.custom_functions import load_agent
from custom_functions.custom_functions import test_agent
from custom_functions.custom_functions import plot_test
from custom_functions.custom_functions import evaluate_control

#### Quadratic reward

In [3]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Quadratic' reward settings to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Quadratic'
reward_args = {'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_q'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_q', 'exp_name': 'sac_q'},
)

[32;1mLogging data to sac_q/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            128,
            32
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.5,
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7f13b575e268>, 'GyroscopeEnv-v1', simu_args={'dt': 0.05, 'ep_len': 100, 'seed': 2}, reward_func='Quadratic', reward_args={'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0})",
    "epochs":	500,
    "exp_name":	"sac_q",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f13b575fac8>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_q",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"sac_q",
            "output_file":	{
                "<_io.TextIOWrapper name='sac_q/progress.txt' mode='w' encoding='UTF-8'>":	{
               

#### Absolute reward

In [6]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Absolute' reward settings to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Absolute'
reward_args = {'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_a'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_a', 'exp_name': 'sac_a'},
)

[32;1mLogging data to sac_a/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            128,
            32
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.5,
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7f13b575e268>, 'GyroscopeEnv-v1', simu_args={'dt': 0.05, 'ep_len': 100, 'seed': 2}, reward_func='Absolute', reward_args={'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0})",
    "epochs":	500,
    "exp_name":	"sac_a",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f13b4e20c50>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_a",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"sac_a",
            "output_file":	{
                "<_io.TextIOWrapper name='sac_a/progress.txt' mode='w' encoding='UTF-8'>":	{
                

#### Normalized reward

In [7]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Normalized' reward settings to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Normalized'
reward_args = {'k': 1, 'qx2': 1, 'qx4': 1, 'pu1': 0, 'pu2': 0}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_n'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_n', 'exp_name': 'sac_n'},
)

# Parameters used in the previous project (kept for reference only):
# startsteps_b = 20000
# pilr_b = 0.001
# qlr_b = 0.001

[32;1mLogging data to sac_n/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            128,
            32
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.5,
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7f13b575e268>, 'GyroscopeEnv-v1', simu_args={'dt': 0.05, 'ep_len': 100, 'seed': 2}, reward_func='Normalized', reward_args={'k': 1, 'qx2': 1, 'qx4': 1, 'pu1': 0, 'pu2': 0})",
    "epochs":	500,
    "exp_name":	"sac_n",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f13b4e20f60>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_n",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"sac_n",
            "output_file":	{
                "<_io.TextIOWrapper name='sac_n/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode"

#### Quadratic reward with ending penalty

In [8]:
# Environment factory: binds 'GyroscopeEnv-v1' and the
# 'Quadratic with ending penalty' reward settings to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Quadratic with ending penalty'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'sx1': 100, 'sx3': 100, 'end_horizon': 0,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_q_ep'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_q_ep', 'exp_name': 'sac_q_ep'},
)

[32;1mLogging data to sac_q_ep/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            128,
            32
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.5,
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7f13b575e268>, 'GyroscopeEnv-v1', simu_args={'dt': 0.05, 'ep_len': 100, 'seed': 2}, reward_func='Quadratic with ending penalty', reward_args={'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0, 'sx1': 100, 'sx3': 100, 'end_horizon': 0})",
    "epochs":	500,
    "exp_name":	"sac_q_ep",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f13b4709128>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_q_ep",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"sac_q_ep",
            "output_file":	{
                "<_io.TextIOWrappe

#### Quadratic reward with penalty 

In [9]:
# Environment factory: binds 'GyroscopeEnv-v1' and the
# 'Quadratic with penalty' reward settings to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Quadratic with penalty'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'bound': 0.2, 'penalty': 10,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_q_p'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_q_p', 'exp_name': 'sac_q_p'},
)

[32;1mLogging data to sac_q_p/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            128,
            32
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.5,
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7f13b575e268>, 'GyroscopeEnv-v1', simu_args={'dt': 0.05, 'ep_len': 100, 'seed': 2}, reward_func='Quadratic with penalty', reward_args={'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0, 'bound': 0.2, 'penalty': 10})",
    "epochs":	500,
    "exp_name":	"sac_q_p",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f13b46e00f0>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_q_p",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"sac_q_p",
            "output_file":	{
                "<_io.TextIOWrapper name='sac_q_p/progress

#### Quadratic reward with exponential term

In [10]:
# Environment factory: binds 'GyroscopeEnv-v1' and the
# 'Quadratic with exponential' reward settings to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Quadratic with exponential'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'eax1': 10, 'ebx1': 10, 'eax3': 10, 'ebx3': 10,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_q_e'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_q_e', 'exp_name': 'sac_q_e'},
)

[32;1mLogging data to sac_q_e/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            128,
            32
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.5,
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7f13b575e268>, 'GyroscopeEnv-v1', simu_args={'dt': 0.05, 'ep_len': 100, 'seed': 2}, reward_func='Quadratic with exponential', reward_args={'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0, 'eax1': 10, 'ebx1': 10, 'eax3': 10, 'ebx3': 10})",
    "epochs":	500,
    "exp_name":	"sac_q_e",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f13b46cf898>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_q_e",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"sac_q_e",
            "output_file":	{
                "<_io.TextIOWrapper

#### Quadratic reward with bonus

In [11]:
# Environment factory: binds 'GyroscopeEnv-v1' and the
# 'Quadratic with bonus' reward settings to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Quadratic with bonus'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'bound': 0.05, 'bonus': 2,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_q_b'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_q_b', 'exp_name': 'sac_q_b'},
)

[32;1mLogging data to sac_q_b/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            128,
            32
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.5,
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7f13b575e268>, 'GyroscopeEnv-v1', simu_args={'dt': 0.05, 'ep_len': 100, 'seed': 2}, reward_func='Quadratic with bonus', reward_args={'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0, 'bound': 0.05, 'bonus': 2})",
    "epochs":	500,
    "exp_name":	"sac_q_b",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f13b46f75f8>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_q_b",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"sac_q_b",
            "output_file":	{
                "<_io.TextIOWrapper name='sac_q_b/progress.txt

#### Normalized reward with bonus

In [12]:
# Environment factory: binds 'GyroscopeEnv-v1' and the
# 'Normalized with bonus' reward settings to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Normalized with bonus'
reward_args = {
    'k': 1, 'qx2': 1, 'qx4': 1, 'pu1': 0, 'pu2': 0,
    'bound': 0.05, 'bonus': 2,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_n_b'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_n_b', 'exp_name': 'sac_n_b'},
)

[32;1mLogging data to sac_n_b/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            128,
            32
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.5,
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7f13b575e268>, 'GyroscopeEnv-v1', simu_args={'dt': 0.05, 'ep_len': 100, 'seed': 2}, reward_func='Normalized with bonus', reward_args={'k': 1, 'qx2': 1, 'qx4': 1, 'pu1': 0, 'pu2': 0, 'bound': 0.05, 'bonus': 2})",
    "epochs":	500,
    "exp_name":	"sac_n_b",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f13b4e209b0>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_n_b",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"sac_n_b",
            "output_file":	{
                "<_io.TextIOWrapper name='sac_n_b/progress.txt' mode='w' 

#### Sparse reward

In [13]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Sparse' reward settings to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Sparse'
reward_args = {'bx': 0.05, 'rx': 1, 'bv': 0, 'rv': 0, 'bu': 0, 'ru': 0}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_s'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_s', 'exp_name': 'sac_s'},
)

[32;1mLogging data to sac_s/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"ReLU",
        "hidden_sizes":	[
            128,
            32
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "alpha":	0.5,
    "batch_size":	100,
    "env_fn":	"functools.partial(<function env_fn at 0x7f13b575e268>, 'GyroscopeEnv-v1', simu_args={'dt': 0.05, 'ep_len': 100, 'seed': 2}, reward_func='Sparse', reward_args={'bx': 0.05, 'rx': 1, 'bv': 0, 'rv': 0, 'bu': 0, 'ru': 0})",
    "epochs":	500,
    "exp_name":	"sac_s",
    "gamma":	0.995,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f13b4708518>":	{
            "epoch_dict":	{},
            "exp_name":	"sac_s",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"sac_s",
            "output_file":	{
                "<_io.TextIOWrapper name='sac_s/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "

#### PE reward

In [None]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'PE' reward
# settings (p = 0.1, e = 40) to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'PE'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'p': 0.1, 'e': 40,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_p01e40'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_p01e40', 'exp_name': 'sac_p01e40'},
)

In [None]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'PE' reward
# settings (p = 0.1, e = 80) to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'PE'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'p': 0.1, 'e': 80,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# SAC training run; logger output directory: 'sac_p01e80'.
spinup.sac_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    lr=0.0025,
    # Entropy regularization coefficient (equivalent to the inverse of the
    # reward scale in the original SAC paper).
    alpha=0.5,
    batch_size=100,
    start_steps=10000,
    update_after=1000,
    update_every=50,
    num_test_episodes=10,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'sac_p01e80', 'exp_name': 'sac_p01e80'},
)

#### Power function reward

In [None]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Power' reward
# settings (p = 0.05) to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Power'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'p': 0.05,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# DDPG training run; logger output directory: 'ddpg_p005'.
# NOTE(review): this cell calls ddpg_pytorch although the notebook is titled
# SAC -- confirm DDPG is the intended algorithm here.
spinup.ddpg_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    pi_lr=0.0025,
    q_lr=0.0025,
    batch_size=100,
    start_steps=10000,
    act_noise=0.1,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'ddpg_p005', 'exp_name': 'ddpg_p005'},
)

In [None]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Power' reward
# settings (p = 0.1) to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Power'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'p': 0.1,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# DDPG training run; logger output directory: 'ddpg_p01'.
# NOTE(review): this cell calls ddpg_pytorch although the notebook is titled
# SAC -- confirm DDPG is the intended algorithm here.
spinup.ddpg_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    pi_lr=0.0025,
    q_lr=0.0025,
    batch_size=100,
    start_steps=10000,
    act_noise=0.1,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'ddpg_p01', 'exp_name': 'ddpg_p01'},
)

In [None]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Power' reward
# settings (p = 0.5) to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Power'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'p': 0.5,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# DDPG training run; logger output directory: 'ddpg_p05'.
# NOTE(review): this cell calls ddpg_pytorch although the notebook is titled
# SAC -- confirm DDPG is the intended algorithm here.
spinup.ddpg_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    pi_lr=0.0025,
    q_lr=0.0025,
    batch_size=100,
    start_steps=10000,
    act_noise=0.1,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'ddpg_p05', 'exp_name': 'ddpg_p05'},
)

#### Exponential reward

In [None]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Exponential' reward
# settings (e = 10) to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Exponential'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'e': 10,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# DDPG training run; logger output directory: 'ddpg_e10'.
# NOTE(review): this cell calls ddpg_pytorch although the notebook is titled
# SAC -- confirm DDPG is the intended algorithm here.
spinup.ddpg_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    pi_lr=0.0025,
    q_lr=0.0025,
    batch_size=100,
    start_steps=10000,
    act_noise=0.1,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'ddpg_e10', 'exp_name': 'ddpg_e10'},
)

In [None]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Exponential' reward
# settings (e = 20) to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Exponential'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'e': 20,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# DDPG training run; logger output directory: 'ddpg_e20'.
# NOTE(review): this cell calls ddpg_pytorch although the notebook is titled
# SAC -- confirm DDPG is the intended algorithm here.
spinup.ddpg_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    pi_lr=0.0025,
    q_lr=0.0025,
    batch_size=100,
    start_steps=10000,
    act_noise=0.1,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'ddpg_e20', 'exp_name': 'ddpg_e20'},
)

In [None]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Exponential' reward
# settings (e = 40) to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Exponential'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'e': 40,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# DDPG training run; logger output directory: 'ddpg_e40'.
# NOTE(review): this cell calls ddpg_pytorch although the notebook is titled
# SAC -- confirm DDPG is the intended algorithm here.
spinup.ddpg_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.995,
    polyak=0.995,
    pi_lr=0.0025,
    q_lr=0.0025,
    batch_size=100,
    start_steps=10000,
    act_noise=0.1,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'ddpg_e40', 'exp_name': 'ddpg_e40'},
)


#### Further study on p = 0.05: why it doesn't learn

In [None]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Power' reward
# settings (p = 0.05) to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Power'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'p': 0.05,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# DDPG training run with gamma = 0.95; logger output directory:
# 'ddpg_p005_gamma095'.
# NOTE(review): this cell calls ddpg_pytorch although the notebook is titled
# SAC -- confirm DDPG is the intended algorithm here.
spinup.ddpg_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=0.95,
    polyak=0.995,
    pi_lr=0.0025,
    q_lr=0.0025,
    batch_size=100,
    start_steps=10000,
    act_noise=0.1,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'ddpg_p005_gamma095', 'exp_name': 'ddpg_p005_gamma095'},
)

In [None]:
# Environment factory: binds 'GyroscopeEnv-v1' and the 'Power' reward
# settings (p = 0.05) to env_fn.
env_name = 'GyroscopeEnv-v1'
simu_args = {'dt': 0.05, 'ep_len': 100, 'seed': 2}
reward_func = 'Power'
reward_args = {
    'qx1': 1, 'qx2': 0, 'qx3': 1, 'qx4': 0, 'pu1': 0, 'pu2': 0,
    'p': 0.05,
}
env_fn_ = partial(
    env_fn,
    env_name,
    simu_args=simu_args,
    reward_func=reward_func,
    reward_args=reward_args,
)

# DDPG training run with gamma = 1; logger output directory:
# 'ddpg_p005_gamma1'.
# NOTE(review): this cell calls ddpg_pytorch although the notebook is titled
# SAC -- confirm DDPG is the intended algorithm here.
spinup.ddpg_pytorch(
    env_fn_,
    ac_kwargs={'hidden_sizes': [128, 32], 'activation': torch.nn.ReLU},
    seed=0,
    steps_per_epoch=1500,
    epochs=500,
    replay_size=1000000,
    gamma=1,
    polyak=0.995,
    pi_lr=0.0025,
    q_lr=0.0025,
    batch_size=100,
    start_steps=10000,
    act_noise=0.1,
    max_ep_len=100,
    logger_kwargs={'output_dir': 'ddpg_p005_gamma1', 'exp_name': 'ddpg_p005_gamma1'},
)