In [1]:
#installing libraries
!pip install grid2op
!pip install l2rpn-baselines
!pip install stable_baselines3
!pip install lightsim2grid



## 2 Create an environment, and train a first policy

In this section we quickly show :

- how to create the gym environment, which is an instance from `Grid2opEnvWrapper` defined above
- how to train a PPO policy using stable baselines3

This part, for stable baselines is really small.

In [2]:
from stable_baselines3 import PPO
from grid2op import make
from grid2op.gym_compat import GymEnv
from gymnasium import Env
from gymnasium.utils.env_checker import check_env
from gymnasium.spaces import Discrete, MultiDiscrete, Box
import grid2op
import json
from grid2op.gym_compat import GymEnv, BoxGymObsSpace, DiscreteActSpace, BoxGymActSpace, MultiDiscreteActSpace


try:
    from lightsim2grid import LightSimBackend
    bk_cls = LightSimBackend
except ImportError as exc:
    print(f"Error: {exc} when importing faster LightSimBackend")
    from grid2op.Backend import PandaPowerBackend
    bk_cls = PandaPowerBackend

env_name = 'l2rpn_case14_sandbox'
env = make(env_name, backend=bk_cls())
gym_env = GymEnv(env)

In [3]:
obs = env.reset()

  and should_run_async(code)


In [4]:
gym_env.observation_space

  and should_run_async(code)


Dict('_shunt_bus': Box(-2147483648, 2147483647, (1,), int32), '_shunt_p': Box(-inf, inf, (1,), float32), '_shunt_q': Box(-inf, inf, (1,), float32), '_shunt_v': Box(-inf, inf, (1,), float32), 'a_ex': Box(0.0, inf, (20,), float32), 'a_or': Box(0.0, inf, (20,), float32), 'actual_dispatch': Box([-140. -120.  -70.  -70.  -40. -100.], [140. 120.  70.  70.  40. 100.], (6,), float32), 'attention_budget': Box(0.0, inf, (1,), float32), 'current_step': Box(-2147483648, 2147483647, (1,), int32), 'curtailment': Box(0.0, 1.0, (6,), float32), 'curtailment_limit': Box(0.0, 1.0, (6,), float32), 'curtailment_limit_effective': Box(0.0, 1.0, (6,), float32), 'day': Discrete(32), 'day_of_week': Discrete(8), 'delta_time': Box(0.0, inf, (1,), float32), 'duration_next_maintenance': Box(-1, 2147483647, (20,), int32), 'gen_margin_down': Box(0.0, [ 5. 10.  0.  0.  0. 15.], (6,), float32), 'gen_margin_up': Box(0.0, [ 5. 10.  0.  0.  0. 15.], (6,), float32), 'gen_p': Box(-162.01, [302.01    282.01    232.01001 232.

In [5]:
# creating action space and observation space

gym_env.action_space = DiscreteActSpace(env.action_space,
                                        attr_to_keep=["set_bus" , "set_line_status_simple"])
gym_env.observation_space = BoxGymObsSpace(env.observation_space,
                                           attr_to_keep=["rho", "gen_p", "gen_q", "gen_v", "line_status", "load_p", "load_q", "load_v", "time_before_cooldown_sub", "time_before_cooldown_line"])


policy_kwargs = dict(net_arch=[dict(pi=[500, 1000, 500], vf=[500,1000,500])])

sb3_algo1 = PPO(env=gym_env,
               learning_rate=5e-5,
               policy="MlpPolicy",
               policy_kwargs=policy_kwargs,
               n_steps=512,
               batch_size=16,
               n_epochs=10,
               clip_range=0.1,
               gae_lambda=0.95,
               max_grad_norm=0.5,
               verbose=True,
               )




Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [6]:
gym_env.action_space

  and should_run_async(code)


Discrete(219)

In [7]:
gym_env.reset()

(array([ 81.4       ,  79.3       ,   5.3       ,   0.        ,
          0.        ,  82.24667   ,  19.496038  ,  71.34023   ,
         24.368923  ,  24.368923  ,  24.01807   , -17.27466   ,
        142.1       , 142.1       ,  22.        ,  22.        ,
         13.200001  , 142.1       ,   1.        ,   1.        ,
          1.        ,   1.        ,   1.        ,   1.        ,
          1.        ,   1.        ,   1.        ,   1.        ,
          1.        ,   1.        ,   1.        ,   1.        ,
          1.        ,   1.        ,   1.        ,   1.        ,
          1.        ,   1.        ,  21.9       ,  85.8       ,
         44.3       ,   6.9       ,  11.9       ,  28.5       ,
          8.8       ,   3.5       ,   5.4       ,  12.6       ,
         14.4       ,  15.4       ,  59.7       ,  30.8       ,
          4.8       ,   8.3       ,  19.4       ,   6.1       ,
          2.4       ,   3.9       ,   8.8       ,  10.5       ,
        142.1       , 142.1       , 138.

In [8]:
sb3_algo1.learn(total_timesteps=10000)

  and should_run_async(code)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.86     |
|    ep_rew_mean     | 173      |
| time/              |          |
|    fps             | 15       |
|    iterations      | 1        |
|    time_elapsed    | 33       |
|    total_timesteps | 512      |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 3.75         |
|    ep_rew_mean          | 166          |
| time/                   |              |
|    fps                  | 12           |
|    iterations           | 2            |
|    time_elapsed         | 84           |
|    total_timesteps      | 1024         |
| train/                  |              |
|    approx_kl            | 0.0040791333 |
|    clip_fraction        | 0.261        |
|    clip_range           | 0.1          |
|    entropy_loss         | -5.39        |
|    explained_variance   | 0.000253     |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x79bb577b04c0>

## 3 Evaluate the trained agent

This notebook is a simple quick introduction for stable baselines only. So we don't really recall everything that has been said previously.

Please consult the section `0) Recommended initial steps` of the notebook [11_IntegrationWithExistingRLFrameworks](./11_IntegrationWithExistingRLFrameworks.ipynb) for more information.

**TLD;DR** grid2op offers the possibility to test your agent on scenarios / episodes different from the one it has been trained. We greatly encourage you to use this functionality.

There are two main ways to evaluate your agent:

- you stay in the "gymnasium" world (see [here](#31-staying-in-the-gymnasium-ecosystem) ) and you evaluate your policy directly just like you would any other gymnasium compatible environment. Simple, easy but without support for some grid2op features
- you "get back" to the "grid2op" world (detailed [here](#32-using-the-grid2op-ecosystem)) by "converting" your NN policy into something that is able to output grid2op like action. This introduces yet again a "wrapper" but you can benefit from all grid2op features, such as the `Runner` to save an inspect what your policy has done.

<font color='red'> We show here just a simple examples to "get easily started". For much better working agents, you can have a look at l2rpn-baselines code. There you have classes that maps the environment, the agents etc. to grid2op directly (you don't have to copy paste any wrapper).</font>



### 3.1 staying in the gymnasium ecosystem

You can do pretty much what you want, but you have to do it yourself, or use any of the "Wrappers" available in gymnasium https://gymnasium.farama.org/main/api/wrappers/ (*eg* https://gymnasium.farama.org/main/api/wrappers/misc_wrappers/#gymnasium.wrappers.RecordEpisodeStatistics) or in your RL framework.

For the sake of simplicity, we show how to do things "manually" even though we do not recommend to do it like that.

In [15]:
nb_episode_test = 4
seeds_test_env = (200,500,1000,2000)    # same size as nb_episode_test
seeds_test_agent = (103, 114, 145, 149)  # same size as nb_episode_test
ts_ep_test =  (0, 1,2,4)       # same size as nb_episode_test
ep_infos = {}  # information that will be saved


for ep_test_num in range(nb_episode_test):
    init_obs, init_infos = gym_env.reset(seed=seeds_test_env[ep_test_num],
                                         options={"time serie id": ts_ep_test[ep_test_num]})
    sb3_algo1.set_random_seed(seeds_test_agent[ep_test_num])
    done = False
    cum_reward = 0
    step_survived = 0
    obs = init_obs
    while not done:
        act, _states = sb3_algo1.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = gym_env.step(act)
        step_survived += 1
        cum_reward += float(reward)
        done = terminated or truncated
    ep_infos[ep_test_num] = {"time serie id": ts_ep_test[ep_test_num],
                             "time serie folder": gym_env.init_env.chronics_handler.get_id(),
                             "env seed": seeds_test_env[ep_test_num],
                             "agent seed": seeds_test_agent[ep_test_num],
                             "steps survived": step_survived,
                             "cum reward": cum_reward}

print(json.dumps(ep_infos, indent=4))

{
    "0": {
        "time serie id": 0,
        "time serie folder": "/root/data_grid2op/l2rpn_case14_sandbox/chronics/0000",
        "env seed": 200,
        "agent seed": 200,
        "steps survived": 1089,
        "cum reward": 69109.22798919678
    },
    "1": {
        "time serie id": 1,
        "time serie folder": "/root/data_grid2op/l2rpn_case14_sandbox/chronics/0001",
        "env seed": 500,
        "agent seed": 200,
        "steps survived": 806,
        "cum reward": 51495.505378723145
    },
    "2": {
        "time serie id": 2,
        "time serie folder": "/root/data_grid2op/l2rpn_case14_sandbox/chronics/0002",
        "env seed": 1000,
        "agent seed": 200,
        "steps survived": 395,
        "cum reward": 27859.206676483154
    },
    "3": {
        "time serie id": 4,
        "time serie folder": "/root/data_grid2op/l2rpn_case14_sandbox/chronics/0004",
        "env seed": 2000,
        "agent seed": 200,
        "steps survived": 802,
        "cum reward"

In [19]:
# normalised observation

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

vec_env = DummyVecEnv([lambda: gym_env])
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True)


policy_kwargs = dict(net_arch=[dict(pi=[500, 1000, 500], vf=[500,1000,500])])

nn_model = PPO(env=vec_env,
               learning_rate=5e-5,
               policy="MlpPolicy",
               policy_kwargs=policy_kwargs , #{"net_arch": [100, 100, 100]},
               n_steps=512,
               batch_size=16,
               verbose=True,
               )
nn_model.learn(total_timesteps=10000)

Using cpu device




----------------------------
| time/              |     |
|    fps             | 17  |
|    iterations      | 1   |
|    time_elapsed    | 28  |
|    total_timesteps | 512 |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 14          |
|    iterations           | 2           |
|    time_elapsed         | 68          |
|    total_timesteps      | 1024        |
| train/                  |             |
|    approx_kl            | 0.026200553 |
|    clip_fraction        | 0.345       |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.38       |
|    explained_variance   | -0.208      |
|    learning_rate        | 5e-05       |
|    loss                 | 0.0442      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0936     |
|    value_loss           | 1           |
-----------------------------------------
-----------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x79bb55887460>

In [26]:
nb_episode_test = 4
seeds_test_env = (200,500,1000,2000)    # same size as nb_episode_test
seeds_test_agent = (103, 114, 145, 149)  # same size as nb_episode_test
ts_ep_test =  (0, 1,2,3)       # same size as nb_episode_test
ep_infos = {}  # information that will be saved


for ep_test_num in range(nb_episode_test):
    init_obs, init_infos = gym_env.reset(seed=seeds_test_env[ep_test_num],
                                         options={"time serie id": ts_ep_test[ep_test_num]})
    nn_model.set_random_seed(seeds_test_agent[ep_test_num])
    done = False
    cum_reward = 0
    step_survived = 0
    obs = init_obs
    while not done:
        act, _states = nn_model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = gym_env.step(act)
        step_survived += 1
        cum_reward += float(reward)
        done = terminated or truncated
    ep_infos[ep_test_num] = {"time serie id": ts_ep_test[ep_test_num],
                             "time serie folder": gym_env.init_env.chronics_handler.get_id(),
                             "env seed": seeds_test_env[ep_test_num],
                             "agent seed": seeds_test_agent[ep_test_num],
                             "steps survived": step_survived,
                             "cum reward": cum_reward}

print(json.dumps(ep_infos, indent=4))

{
    "0": {
        "time serie id": 0,
        "time serie folder": "/root/data_grid2op/l2rpn_case14_sandbox/chronics/0000",
        "env seed": 200,
        "agent seed": 103,
        "steps survived": 1091,
        "cum reward": 69236.51405715942
    },
    "1": {
        "time serie id": 1,
        "time serie folder": "/root/data_grid2op/l2rpn_case14_sandbox/chronics/0001",
        "env seed": 500,
        "agent seed": 114,
        "steps survived": 807,
        "cum reward": 51564.479511260986
    },
    "2": {
        "time serie id": 2,
        "time serie folder": "/root/data_grid2op/l2rpn_case14_sandbox/chronics/0002",
        "env seed": 1000,
        "agent seed": 145,
        "steps survived": 3001,
        "cum reward": 191833.17527008057
    },
    "3": {
        "time serie id": 3,
        "time serie folder": "/root/data_grid2op/l2rpn_case14_sandbox/chronics/0003",
        "env seed": 2000,
        "agent seed": 149,
        "steps survived": 3,
        "cum reward":