# DQN Baseline with the Entire Observation and Action Space

In [None]:
!pip install gymnasium Grid2Op lightsim2grid wandb stable-baselines3[extra] sb3-contrib

### Observation Space

In [None]:
# Names of the observation attributes. Per the original note, this list covers
# the entire observation space. It is kept for reference only: nothing in the
# visible code reads `keys_array`.

keys_array = [
    'a_ex', 'a_or', 'active_alert', 'actual_dispatch', 'alert_duration', 'attack_under_alert',
    'attention_budget', 'current_step', 'curtailment', 'curtailment_limit',
    'curtailment_limit_effective', 'curtailment_limit_mw', 'curtailment_mw', 'day',
    'day_of_week', 'delta_time', 'duration_next_maintenance', 'gen_margin_down',
    'gen_margin_up', 'gen_p', 'gen_p_before_curtail', 'gen_q', 'gen_theta', 'gen_v',
    'hour_of_day', 'is_alarm_illegal', 'last_alarm', 'line_status', 'load_p', 'load_q',
    'load_theta', 'load_v', 'max_step', 'minute_of_hour', 'month', 'p_ex', 'p_or',
    'prod_p', 'prod_q', 'prod_v', 'q_ex', 'q_or', 'rho', 'storage_charge',
    'storage_power', 'storage_power_target', 'storage_theta', 'target_dispatch',
    'thermal_limit', 'theta_ex', 'theta_or', 'time_before_cooldown_line',
    'time_before_cooldown_sub', 'time_next_maintenance', 'time_since_last_alarm',
    'time_since_last_alert', 'time_since_last_attack', 'timestep_overflow', 'topo_vect',
    'total_number_of_alert', 'v_ex', 'v_or', 'was_alarm_used_after_game_over',
    'was_alert_used_after_attack', 'year'
]




### Environment - Grid2Op

In [None]:
import gymnasium as gym
from gymnasium.spaces import Discrete, MultiDiscrete, Box

import grid2op
from grid2op import gym_compat
from grid2op.Parameters import Parameters
from grid2op.Action import PlayableAction
from grid2op.Observation import CompleteObservation
from grid2op.Reward import L2RPNReward, N1Reward, CombinedScaledReward
from grid2op.gym_compat import DiscreteActSpace, BoxGymObsSpace

from lightsim2grid import LightSimBackend
from collections import Counter



# Gymnasium environment wrapper around Grid2Op environment
class Gym2OpEnv(gym.Env):
    """Gymnasium wrapper around the Grid2Op ``l2rpn_case14_sandbox`` environment.

    The wrapper exposes the full observation space as a ``BoxGymObsSpace`` and
    the full action space as a ``DiscreteActSpace``. It also counts how often
    each action is passed to :meth:`step` so the action distribution of an
    agent can be inspected afterwards.
    """

    def __init__(self):
        super().__init__()

        self._backend = LightSimBackend()
        self._env_name = "l2rpn_case14_sandbox"  # DO NOT CHANGE

        action_class = PlayableAction
        observation_class = CompleteObservation
        reward_class = CombinedScaledReward  # Setup further below

        # DO NOT CHANGE Parameters
        # See https://grid2op.readthedocs.io/en/latest/parameters.html
        p = Parameters()
        p.MAX_SUB_CHANGED = 4  # Up to 4 substations can be reconfigured each timestep
        p.MAX_LINE_STATUS_CHANGED = 4  # Up to 4 powerline statuses can be changed each timestep

        # Make grid2op env
        self._g2op_env = grid2op.make(
            self._env_name, backend=self._backend, test=False,
            action_class=action_class, observation_class=observation_class,
            reward_class=reward_class, param=p
        )

        ##########
        # REWARD #
        ##########
        # NOTE: This reward should not be modified when evaluating RL agent
        # See https://grid2op.readthedocs.io/en/latest/reward.html
        cr = self._g2op_env.get_reward_instance()
        cr.addReward("N1", N1Reward(), 1.0)
        cr.addReward("L2RPN", L2RPNReward(), 1.0)
        # reward = N1 + L2RPN
        cr.initialize(self._g2op_env)
        ##########

        self._gym_env = gym_compat.GymEnv(self._g2op_env)

        self.setup_observations()
        self.setup_actions()

        # Expose the converted spaces through the standard Gymnasium attributes.
        self.observation_space = self._gym_env.observation_space
        self.action_space = self._gym_env.action_space

        # Number of times each action has been passed to step().
        self.action_counter = Counter()

    # The two setup methods below follow the Grid2Op "getting started" notebooks.
    # This one shows how to convert the observation and action spaces for use
    # with gymnasium:
    #   https://github.com/rte-france/Grid2Op/blob/c71a2dfb824dae7115394266e02cc673c8633a0e/getting_started/11_IntegrationWithExistingRLFrameworks.ipynb
    # These explain the observation and action spaces themselves:
    #   https://github.com/rte-france/Grid2Op/blob/c71a2dfb824dae7115394266e02cc673c8633a0e/getting_started/02_Observation.ipynb
    #   https://github.com/rte-france/Grid2Op/blob/c71a2dfb824dae7115394266e02cc673c8633a0e/getting_started/03_Action.ipynb

    def setup_observations(self):
        """Replace the observation space with a Box over the entire observation."""
        # Close the space being replaced before installing the new one.
        self._gym_env.observation_space.close()
        self._gym_env.observation_space = BoxGymObsSpace(self._g2op_env.observation_space)

    def setup_actions(self):
        """Replace the action space with a Discrete space over the entire action space."""
        self._gym_env.action_space = DiscreteActSpace(self._g2op_env.action_space)

    @staticmethod
    def _action_key(action):
        """Return a hashable key identifying ``action`` for the counter.

        Array-like actions (anything exposing ``tolist()``, e.g. NumPy arrays
        and scalars) are converted to native Python values first. A 0-d array
        becomes a plain scalar and a 1-d array becomes a tuple. This avoids
        depending on a module-level ``numpy`` import and handles the 0-d
        arrays that ``tuple()`` cannot iterate over.
        """
        to_list = getattr(action, "tolist", None)
        if callable(to_list):
            action = to_list()
        if isinstance(action, list):
            return tuple(action)
        return action

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment.

        Args:
            seed: Optional seed forwarded to the wrapped gym environment.
            options: Optional reset options forwarded to the wrapped gym
                environment (defaults to ``None``, as before).

        Returns:
            Whatever the wrapped environment's ``reset`` returns; callers in
            this file unpack it as ``(obs, info)``.
        """
        return self._gym_env.reset(seed=seed, options=options)

    def step(self, action):
        """Record ``action`` in the counter and forward it to the wrapped environment.

        Returns:
            Whatever the wrapped environment's ``step`` returns; callers in
            this file unpack it as
            ``(obs, reward, terminated, truncated, info)``.
        """
        self.action_counter[self._action_key(action)] += 1
        return self._gym_env.step(action)

    def get_action_distribution(self):
        """Return the relative frequency of every action seen since the last counter reset.

        Returns:
            A dict mapping action key to its share of all counted steps. The
            dict is empty when no step has been counted (avoids dividing by
            zero).
        """
        total = sum(self.action_counter.values())
        if total == 0:
            return {}
        return {action: count / total for action, count in self.action_counter.items()}

    def reset_action_counter(self):
        """Forget all action counts recorded so far."""
        self.action_counter = Counter()

    def render(self):
        """Render by delegating to the wrapped gym environment."""
        return self._gym_env.render()

    def close(self):
        """Close the wrapped gym environment; calling it again is a no-op."""
        gym_env = getattr(self, "_gym_env", None)
        if gym_env is not None:
            gym_env.close()
            # Drop the reference so a repeated close() does not touch a closed env.
            self._gym_env = None


def main():
    """Run a random agent in ``Gym2OpEnv`` for at most 100 steps and print a trace.

    Prints the observation and action spaces, every transition, whether each
    sampled action was valid, and a final summary with the return and the
    number of steps. The environment is closed when the run ends, including
    when an error interrupts it.
    """
    # Random agent interacting in environment #

    max_steps = 100

    env = Gym2OpEnv()

    try:
        print("#####################")
        print("# OBSERVATION SPACE #")
        print("#####################")
        print(env.observation_space)
        print("#####################\n")

        print("#####################")
        print("#   ACTION SPACE    #")
        print("#####################")
        print(env.action_space)
        print("#####################\n\n")

        curr_step = 0
        curr_return = 0

        is_done = False
        obs, info = env.reset()
        print(f"step = {curr_step} (reset):")
        print(f"\t obs = {obs}")
        print(f"\t info = {info}\n\n")

        while not is_done and curr_step < max_steps:
            action = env.action_space.sample()
            obs, reward, terminated, truncated, info = env.step(action)

            curr_step += 1
            curr_return += reward
            is_done = terminated or truncated

            print(f"step = {curr_step}: ")
            print(f"\t obs = {obs}")
            print(f"\t reward = {reward}")
            print(f"\t terminated = {terminated}")
            print(f"\t truncated = {truncated}")
            print(f"\t info = {info}")

            # Some actions are invalid (see: https://grid2op.readthedocs.io/en/latest/action.html#illegal-vs-ambiguous)
            # Invalid actions are replaced with 'do nothing' action
            is_action_valid = not (info["is_illegal"] or info["is_ambiguous"])
            print(f"\t is action valid = {is_action_valid}")
            if not is_action_valid:
                print(f"\t\t reason = {info['exception']}")
            print("\n")

        print("###########")
        print("# SUMMARY #")
        print("###########")
        print(f"return = {curr_return}")
        print(f"total steps = {curr_step}")
        print("###########")
    finally:
        # Release the environment even if the rollout fails part-way.
        env.close()


# Entry point: run main() only when this code executes as the main module.
if __name__ == "__main__":
    main()


In [15]:
# Analyze the action distribution after training
def analyze_action_distribution(env, model, episodes = 1000):
    """Roll out ``model`` deterministically and report how often each action was taken.

    Args:
        env: Environment to roll out in. ``reset()`` must return
            ``(obs, info)`` and ``step()`` must return
            ``(obs, reward, terminated, truncated, info)``. The environment,
            or the one reachable through its ``unwrapped`` attribute, must
            provide ``reset_action_counter()`` and
            ``get_action_distribution()``.
        model: Object with ``predict(obs, deterministic=True)`` returning
            ``(action, state)``.
        episodes: Number of complete episodes to run.

    Returns:
        The result of ``get_action_distribution()`` after all episodes.
    """
    # The counter helpers live on the base environment. Going through
    # `unwrapped` works even when `env` is wrapped (e.g. by Monitor) and the
    # wrapper does not forward unknown attributes.
    base_env = getattr(env, "unwrapped", env)
    base_env.reset_action_counter()

    for _ in range(episodes):
        obs, _info = env.reset()
        done = False
        while not done:
            action, _states = model.predict(obs, deterministic = True)
            # step() yields five values; an episode ends on either flag.
            obs, _reward, terminated, truncated, _info = env.step(action)
            done = terminated or truncated

    return base_env.get_action_distribution()

In [None]:
import gymnasium as gym
import grid2op
from stable_baselines3.common.monitor import Monitor
import wandb
import sys
sys.path.insert(0,"./") # this makes it possible to import the wrapper since the env.py is in it's own folder
#from provided_wrapper.env import Gym2OpEnv
from stable_baselines3 import DQN
from wandb.integration.sb3 import WandbCallback
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from sb3_contrib import QRDQN
import numpy as np


def main():
    """Train a DQN baseline on ``Gym2OpEnv``, report its action distribution, then roll it out.

    Training metrics are logged to Weights & Biases. After training, the
    greedy policy's action distribution is printed and the trained agent is
    run for 1000 rendered steps. The wandb run and the environment are
    released even if an error occurs.
    """
    env = Gym2OpEnv()
    env = Monitor(env)

    try:
        #Connecting to weights and biases to log the results of the model
        run = wandb.init(
            project="RL_project",
            name = "DQN_Baseline ",
            sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
            monitor_gym=True,  # auto-upload the videos of agents playing the game
            save_code=False,  # optional
        )

        try:
            #Model - standard DQN from stable baselines 3
            model = DQN("MlpPolicy", env, verbose =1, tensorboard_log="./dqn_grid2op_tensorboard/")

            #Training the model
            model.learn(total_timesteps = 100000, log_interval = 10, callback=WandbCallback(gradient_save_freq=100, model_save_path=f"models/{run.id}",verbose=2), progress_bar=True)

            action_distribution = analyze_action_distribution(env, model)
            print("Action Distribution:")
            for action, freq in sorted(action_distribution.items(), key=lambda x: x[1], reverse=True):
                print(f"Action {action}: {freq * 100:.2f}%")
        finally:
            # Always finish the wandb run so it is not left open after a failure.
            run.finish()

        #Run for a few episodes with trained agent
        obs, _ = env.reset()

        for _ in range(1000):
            action, _states = model.predict(obs, deterministic = True)
            # Gymnasium's step() returns five values; the episode is over when
            # either `terminated` or `truncated` is set.
            obs, reward, terminated, truncated, info = env.step(action)
            env.render()
            if terminated or truncated:
                # reset() returns (obs, info); keep only the observation.
                obs, _ = env.reset()
    finally:
        env.close()


# Entry point: run the training main() only when this code executes as the main module.
if __name__ == "__main__":
    main()