In [1]:
from stable_baselines3.sac.policies import MlpPolicy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import SAC
import logging
import gym
import numpy as np
import os

# Modelica libraries required by the model, each given as the path to its
# package.mo file. Paths are relative to the location of this notebook.
libs = [
    "../../OpenIPSL-1.5.0/OpenIPSL/package.mo",
    "../resources/KundurSMIB/package.mo",  # KundurSMIB modified to have voltage control
]

# Early sanity check: report every package.mo path that is not an existing file.
# This only prints a message and never aborts (the original author notes that
# DymolaInterface() performs its own check as well).
for lib in libs:
    if os.path.isfile(lib):
        continue
    print(f"Cannot find the library {lib}")

# Modelica model to simulate, written as Library.Model
mo_name = "KundurSMIB.SMIB_vref"
# Python location of the RL environment class, written as module:ClassName
env_entry_point = 'examples:DymSMIBEnv'

# Tunable settings for the environment
v_ref = 1               # reference voltage
time_step = 1           # time delta in seconds
positive_reward = 1
negative_reward = -100  # penalize the RL agent for is_done
log_level = logging.DEBUG

# Keyword arguments handed to the model-specific environment class.
# mo_name and libs are passed on to the DymolaBaseEnv class.
config = dict(
    mo_name=mo_name,
    libs=libs,
    v_ref=v_ref,
    time_step=time_step,
    positive_reward=positive_reward,
    negative_reward=negative_reward,
    log_level=log_level,
)

# Expose the model-specific environment class through the OpenAI gym registry
# so that it can later be instantiated by its id.
from gym.envs.registration import register

env_name = "MicrogridEnv-v0"

# `config` is forwarded as keyword arguments to the class named by `env_entry_point`.
register(id=env_name, entry_point=env_entry_point, kwargs=config)

In [2]:
# Instantiate the registered environment. Construction runs an initial simulation
# step whose printed result must look like [True, [...]]; anything else means the
# simulation is broken.
# TODO: add error handling/warnings for failed simulations (i.e. a result of [False, [...]])
env = gym.make(id=env_name)

[True, [0.9415191647391123]]




In [3]:
# Baseline rollout: hold the control voltage at 1.0 pu (the "do-nothing" case)
# and record reward statistics. The min/max/average rewards are collected so the
# reward can be normalized later (to improve training); they can be safely
# ignored for now.
n_steps = 10  # number of environment steps, i.e. seconds at the current time step

min_reward = np.inf
max_reward = -np.inf
avg_reward = 0
obs = env.reset()

for _ in range(n_steps):
    action = [1.0]  # control voltage = 1.0 pu
    obs, reward, done, info = env.step(action)
    if done:
        # keep `obs` in sync with the freshly reset environment
        obs = env.reset()

    # Weight each reward by 1/n_steps so the sum is the true mean over the
    # rollout (the previous 1/30 weight did not match the 10-step loop).
    avg_reward += reward / n_steps
    min_reward = min(min_reward, reward)
    max_reward = max(max_reward, reward)

[True, [0.9415191647391123]]
[True, [0.7838888263120665]]
[True, [0.5778502347409977]]
[True, [0.37689214331774173]]
[True, [-0.4102310273435199]]
[True, [-0.39768415229741216]]
[True, [0.26086190725956715]]
[True, [0.2306681763146432]]
[True, [-0.3630533068700842]]
[True, [-0.20653164435361318]]
[True, [-0.3770097767658315]]


In [None]:
# Start from a freshly reset environment
obs = env.reset()

# Smoke test with a random agent. It should confirm that:
#    (1) the simulation runs while an input value is being controlled and changed
#    (2) the simulation results differ from the do-nothing or rule-based controller
for _ in range(10):
    # single-element action drawn uniformly between 1.0 and 2.0
    action = [np.random.uniform(low=1.0, high=2.0)]
    obs, reward, done, info = env.step(action)
    if not done:
        continue
    env.reset()

In [None]:
# Create a stable-baselines3 Soft Actor-Critic agent on the environment;
# training metrics are written under the "tensorboard_logs" directory.
model = SAC(MlpPolicy, env, verbose=1, tensorboard_log="tensorboard_logs")

# Run a very short training period, only to verify that the pipeline works
print("Training the model...")
obs = env.reset()
model.learn(total_timesteps=20, tb_log_name="microgrid")

# Run a short test period with the trained policy, accumulating the total reward
print("Testing the model...")
obs = env.reset()
rl_reward = 0
for _ in range(10):
    action, _state = model.predict(obs)
    obs, reward, done, info = env.step(action)
    rl_reward += reward
    if done:
        # Use the observation returned by reset(); otherwise the next predict()
        # call would be fed the terminal observation of the finished episode.
        obs = env.reset()