### Direct Environment and Policy Search on the MSD


Instantiate the MSD environment and some wrappers.
The first wrapper one-hot-encodes the actions, the second normalizes the state,
and the last adds a time dependency to the state.

In [1]:
from system.MSD.MSDSystem import MSDSystem
from system.Wrappers.OneHotEncodingWrapper import OHEWrapper
from system.Wrappers.StateScaleWrapper import StateScaleWrapper
from system.Wrappers.StationarySystemWrapper import StationarySystemWrapper

# Settings of the mass-spring-damper (MSD) system, gathered in one mapping so
# that each value can be inspected or tuned before the system is built.
msd_config = {
    "horizon": 100,
    "equilibrium": 0.2,
    "actions_value": [-0.3, -0.1, 0.0, 0.1, 0.3],
    "target_parameters_reward": [0.5, -0.3, 0.1],
    "cost_omega_zeta": [0.5, 0.5],
    "accuracy": 1.0,
    "actions_discretization": 0.05,
    "position_interval": [0.198, 0.202],
    "speed_interval": [-0.01, 0.01],
    "feasible_set": {
        "omega_interval": [0.1, 1.5],
        "zeta_interval": [0.1, 1.5],
        "phi_interval": [-2.0, 2.0],
    },
}

# Wrappers are applied innermost-first: one-hot encoding of the actions,
# then state scaling, then the stationary-system wrapper.
# NOTE(review): `loc[0]` equals the 0.2 equilibrium, so the scaling presumably
# centres the position on it -- confirm against StateScaleWrapper.
# NOTE(review): the name `sys` is also the name of a standard-library module;
# it is kept because the following cells refer to the environment as `sys`.
sys = StationarySystemWrapper(
    StateScaleWrapper(
        OHEWrapper(MSDSystem(**msd_config)),
        loc=[0.2, 0],
        scale=[0.005, 0.02],
    )
)


Instantiate a deterministic agent depending on two models (parametric functions).
The first model stands for the system parameters and the second is a parametric control policy.

In [2]:
from agent.trainable.DESGA.DSSAAgent import DSSAAgent
from model.investment.DeterministicParameterModel import DeterministicParameterModel
from model.decision.CategoricalDistModel import CategoricalDistModel

# Step 1: pair the investment model (system parameters) with the operation
# model (control policy) inside a DSSA agent.
agent_builder = DSSAAgent(
    InvestmentModel=DeterministicParameterModel,
    OperationModel=CategoricalDistModel,
)

# Step 2: initialize both models against the wrapped environment. The
# investment model takes no extra options here.
# NOTE(review): "layers": (64,) is presumably the hidden-layer layout of the
# operation model -- confirm against CategoricalDistModel.
agent = agent_builder.initialize(
    env=sys,
    investment_pol={},
    operation_pol={"layers": (64,)},
)


Instantiate a log for the learning curves.

In [3]:
import os

from utils import get_version
from algo.joint.LoggerDESGA import LoggerDESGA

# Root directory for experiment logs and the base name of this experiment.
logdir = "./experiments"
model_name = "msd-env"

# `get_version` supplies the version suffix for this run.
# NOTE(review): presumably the next unused version under `logdir` -- confirm
# against utils.get_version.
v = get_version(model_name, logdir)

# Each run logs to "<logdir>/<model_name>-v<version>".
run_name = f"{model_name}-v{v}"
log_path = os.path.join(logdir, run_name)
logger = LoggerDESGA(log_path)


Instantiate an algorithm that learns the parameters of the agent's models so as to maximize the return in the environment.

In [4]:
from algo.joint.DESGA import ReinforceDESGA

# Hyperparameters of the training run, kept together so they are easy to tune.
training_settings = {
    "nb_iterations": 500,
    "optimizer_parameters": {"lr": 0.005},
    "batch_size": 64,
    "mc_samples": 64,
    "system_fit": True,
}

# Bind the learning algorithm to the environment and the agent, then
# configure it; training itself is started separately with `algo.fit`.
algo = ReinforceDESGA(env=sys, agent=agent)
algo.initialize(**training_settings)

Fit the agent with the algorithm.

In [5]:
# Run the training with the settings given to `algo.initialize`; the logger is
# handed to `fit` so the learning curves can be recorded during the run.
algo.fit(logger)


Simulate the policy in the environment and print the expected return.

In [6]:
from runner.TrajectoriesSampler import TrajectoriesSampler

# Number of trajectories sampled to evaluate the trained agent.
nb_eval_trajectories = 100

sampler = TrajectoriesSampler(sys, agent)

# `sample` returns five items; only the third one (the reward batch) is
# needed to evaluate the return.
sampled = sampler.sample(nb_eval_trajectories)
_, _, reward_batch, _, _ = sampled

average_return = sampler.cumulative_reward(reward_batch)
print("Average cumulative reward : ", average_return)




Average cumulative reward :  99.9478530883789
