# QR-DQN - Improvement 2

In [None]:
!pip install gymnasium Grid2Op lightsim2grid wandb stable-baselines3[extra] sb3-contrib

## Observation Space

In [None]:
# Observation attributes fed to the agent (no storage-related attributes are
# listed). Original author's note: this selection performed best so far.
attr_to_keep_no_storage = [
    'a_ex',
    'a_or',
    'active_alert',
    'actual_dispatch',
    'alert_duration',
    'attention_budget',
    'current_step',
    'curtailment',
    'curtailment_limit',
    'curtailment_limit_effective',
    'curtailment_limit_mw',
    'day',
    'day_of_week',
    'delta_time',
    'duration_next_maintenance',
    'gen_margin_down',
    'gen_margin_up',
    'gen_p',
    'gen_p_before_curtail',
    'gen_q',
    'gen_theta',
    'gen_v',
    'hour_of_day',
    'last_alarm',
    'line_status',
    'load_p',
    'load_q',
    'load_theta',
    'load_v',
    'max_step',
    'minute_of_hour',
    'month',
    'p_ex',
    'p_or',
    'prod_p',
    'prod_q',
    'prod_v',
    'q_ex',
    'q_or',
    'rho',
    'target_dispatch',
    'thermal_limit',
    'theta_ex',
    'theta_or',
    'time_before_cooldown_line',
    'time_before_cooldown_sub',
    'time_next_maintenance',
    'timestep_overflow',
    'topo_vect',
    'v_ex',
    'v_or',
    'year',
]

## Environment - Grid2Op

In [None]:
import gymnasium as gym
from gymnasium.spaces import Discrete, MultiDiscrete, Box

import grid2op
from grid2op import gym_compat
from grid2op.Parameters import Parameters
from grid2op.Action import PlayableAction
from grid2op.Observation import CompleteObservation
from grid2op.Reward import L2RPNReward, N1Reward, CombinedScaledReward
from grid2op.gym_compat import DiscreteActSpace, BoxGymObsSpace

from lightsim2grid import LightSimBackend
import numpy as np


# Gymnasium environment wrapper around Grid2Op environment
class Gym2OpEnv(gym.Env):
    """Gymnasium environment wrapping the Grid2Op ``l2rpn_case14_sandbox`` grid.

    The wrapped Grid2Op environment is converted with ``gym_compat.GymEnv``;
    its observation space is replaced by a ``BoxGymObsSpace`` restricted to
    ``attr_to_keep`` and its action space by a ``DiscreteActSpace``.

    Args:
        attr_to_keep: Names of the Grid2Op observation attributes to expose
            in the Box observation space.
    """

    def __init__(self, attr_to_keep):
        super().__init__()

        # Must be stored before setup_observations() runs, because that method
        # builds the observation space from it.
        self.attr_to_keep = attr_to_keep

        self._backend = LightSimBackend()
        self._env_name = "l2rpn_case14_sandbox"  # DO NOT CHANGE

        action_class = PlayableAction
        observation_class = CompleteObservation
        reward_class = CombinedScaledReward  # Setup further below

        # DO NOT CHANGE Parameters
        # See https://grid2op.readthedocs.io/en/latest/parameters.html
        p = Parameters()
        p.MAX_SUB_CHANGED = 4  # Up to 4 substations can be reconfigured each timestep
        p.MAX_LINE_STATUS_CHANGED = 4  # Up to 4 powerline statuses can be changed each timestep

        # Make grid2op env
        self._g2op_env = grid2op.make(
            self._env_name, backend=self._backend, test=False,
            action_class=action_class, observation_class=observation_class,
            reward_class=reward_class, param=p
        )

        ##########
        # REWARD #
        ##########
        # NOTE: This reward should not be modified when evaluating RL agent
        # See https://grid2op.readthedocs.io/en/latest/reward.html
        cr = self._g2op_env.get_reward_instance()
        cr.addReward("N1", N1Reward(), 1.0)
        cr.addReward("L2RPN", L2RPNReward(), 1.0)
        # reward = N1 + L2RPN
        cr.initialize(self._g2op_env)
        ##########

        self._gym_env = gym_compat.GymEnv(self._g2op_env)

        self.setup_observations()
        self.setup_actions()

        self.observation_space = self._gym_env.observation_space
        self.action_space = self._gym_env.action_space

    # The two setup methods below are based on the Grid2Op "getting started"
    # notebooks. This one shows how to convert the observation and action
    # spaces for use with gymnasium:
    #   https://github.com/rte-france/Grid2Op/blob/c71a2dfb824dae7115394266e02cc673c8633a0e/getting_started/11_IntegrationWithExistingRLFrameworks.ipynb
    # These explain the observation and action spaces themselves:
    #   https://github.com/rte-france/Grid2Op/blob/c71a2dfb824dae7115394266e02cc673c8633a0e/getting_started/02_Observation.ipynb
    #   https://github.com/rte-france/Grid2Op/blob/c71a2dfb824dae7115394266e02cc673c8633a0e/getting_started/03_Action.ipynb

    def setup_observations(self):
        """Replace the observation space with a Box limited to ``self.attr_to_keep``."""
        # Close the default observation space before replacing it.
        self._gym_env.observation_space.close()
        # Use the list given to the constructor rather than a module-level
        # global, so each instance honours its own attribute selection.
        self._gym_env.observation_space = BoxGymObsSpace(
            self._g2op_env.observation_space,
            attr_to_keep=self.attr_to_keep,
        )

    def setup_actions(self):
        """Replace the action space with a discrete space over the Grid2Op action space."""
        # Using the full action space for the QR-DQN distribution.
        self._gym_env.action_space = DiscreteActSpace(self._g2op_env.action_space)

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment.

        Args:
            seed: Optional seed forwarded to the wrapped environment.
            options: Optional reset options forwarded to the wrapped
                environment (defaults to ``None``, as before).

        Returns:
            Whatever the wrapped environment's ``reset`` returns; callers in
            this file unpack it as ``(obs, info)``.
        """
        return self._gym_env.reset(seed=seed, options=options)

    def step(self, action):
        """Forward ``action`` to the wrapped environment and return its step result."""
        return self._gym_env.step(action)

    def render(self):
        """Render through the wrapped environment and return its result."""
        return self._gym_env.render()

    def close(self):
        """Close the wrapped gym environment so its resources are released."""
        self._gym_env.close()


def main():
    """Run a random agent in the environment for at most 100 steps.

    Prints the observation and action spaces, every transition, whether each
    sampled action was valid, and a final summary of return and step count.
    """
    # Random agent interacting in environment #

    max_steps = 100

    # The previously referenced `attr_to_keep_3` is not defined in this
    # notebook; use the attribute list that is.
    env = Gym2OpEnv(attr_to_keep_no_storage)

    try:
        print("#####################")
        print("# OBSERVATION SPACE #")
        print("#####################")
        print(env.observation_space)
        print("#####################\n")

        print("#####################")
        print("#   ACTION SPACE    #")
        print("#####################")
        print(env.action_space)
        print("#####################\n\n")

        curr_step = 0
        curr_return = 0

        is_done = False
        obs, info = env.reset()
        print(f"step = {curr_step} (reset):")
        print(f"\t obs = {obs}")
        print(f"\t info = {info}\n\n")

        while not is_done and curr_step < max_steps:
            action = env.action_space.sample()
            obs, reward, terminated, truncated, info = env.step(action)

            curr_step += 1
            curr_return += reward
            is_done = terminated or truncated

            print(f"step = {curr_step}: ")
            print(f"\t obs = {obs}")
            print(f"\t reward = {reward}")
            print(f"\t terminated = {terminated}")
            print(f"\t truncated = {truncated}")
            print(f"\t info = {info}")

            # Some actions are invalid (see: https://grid2op.readthedocs.io/en/latest/action.html#illegal-vs-ambiguous)
            # Invalid actions are replaced with 'do nothing' action
            is_action_valid = not (info["is_illegal"] or info["is_ambiguous"])
            print(f"\t is action valid = {is_action_valid}")
            if not is_action_valid:
                print(f"\t\t reason = {info['exception']}")
            print("\n")

        print("###########")
        print("# SUMMARY #")
        print("###########")
        print(f"return = {curr_return}")
        print(f"total steps = {curr_step}")
        print("###########")
    finally:
        # Always release the environment, even if a step raises.
        env.close()


if __name__ == "__main__":
    main()


In [None]:
from stable_baselines3.common.callbacks import BaseCallback
from collections import defaultdict
import numpy as np
# Custom Action Tracker Callback
class ActionTrackerCallback(BaseCallback):
    """Callback that counts how often each action is taken during training.

    Counts are stored in ``self.action_counter``, keyed by a tuple of the
    action values read from ``self.locals['actions']`` at each step.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.action_counter = defaultdict(int)

    def _on_step(self):
        """Record the action(s) of the current step; always continue training."""
        action = self.locals['actions']
        # Flatten to a tuple of plain Python scalars so the key is hashable
        # for any input form (scalar, list, 0-d, 1-d or N-d array). Plain ints
        # hash and compare equal to the NumPy integers they came from.
        action_key = tuple(np.asarray(action).ravel().tolist())
        self.action_counter[action_key] += 1
        return True

    def _on_training_end(self):
        """Print the action distribution, most frequent first."""
        print("Training completed! Action distribution:")
        ranked = sorted(self.action_counter.items(), key=lambda x: x[1], reverse=True)
        for action, freq in ranked:
            print(f"Action: {action}, Frequency: {freq}")

    def get_action_distribution(self):
        """Return a plain ``dict`` copy mapping action tuple to its count."""
        return dict(self.action_counter)

In [None]:
def evaluate_model(model, env, num_episodes=10):
    """Run ``model`` deterministically in ``env`` and summarise the episodes.

    Args:
        model: Object with ``predict(obs, deterministic=True)`` returning
            ``(action, state)``.
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to run.

    Returns:
        An 8-tuple ``(mean_reward, std_reward, mean_length, std_length,
        mean_n1, std_n1, mean_l2rpn, std_l2rpn)`` computed over the episodes.
        The N1 and L2RPN values are summed per episode from
        ``info['rewards']['N1']`` and ``info['rewards']['L2RPN']``, counting 0
        for any step where the key is absent.
    """
    all_episode_rewards = []
    all_episode_lengths = []
    all_n1_rewards = []
    all_l2rpn_rewards = []

    for _ in range(num_episodes):
        obs, _ = env.reset()
        done = False
        episode_reward = 0
        episode_length = 0
        n1_reward = 0
        l2rpn_reward = 0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            # An episode ends on termination OR truncation; ignoring the
            # truncation flag would keep stepping a finished environment.
            done = terminated or truncated
            episode_reward += reward
            episode_length += 1

            # Extract individual rewards
            component_rewards = info.get('rewards', {})
            n1_reward += component_rewards.get('N1', 0)
            l2rpn_reward += component_rewards.get('L2RPN', 0)

        all_episode_rewards.append(episode_reward)
        all_episode_lengths.append(episode_length)
        all_n1_rewards.append(n1_reward)
        all_l2rpn_rewards.append(l2rpn_reward)

    mean_reward = np.mean(all_episode_rewards)
    std_reward = np.std(all_episode_rewards)
    mean_length = np.mean(all_episode_lengths)
    std_length = np.std(all_episode_lengths)
    mean_n1 = np.mean(all_n1_rewards)
    std_n1 = np.std(all_n1_rewards)
    mean_l2rpn = np.mean(all_l2rpn_rewards)
    std_l2rpn = np.std(all_l2rpn_rewards)

    return mean_reward, std_reward, mean_length, std_length, mean_n1, std_n1, mean_l2rpn, std_l2rpn

In [None]:
import gymnasium as gym
import grid2op
from stable_baselines3.common.monitor import Monitor
import wandb
import sys
sys.path.insert(0,"./") # this makes it possible to import the wrapper since the env.py is in it's own folder
#from provided_wrapper.env import Gym2OpEnv
from stable_baselines3 import DQN
from wandb.integration.sb3 import WandbCallback
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from sb3_contrib import QRDQN
import numpy as np


def main():
    """Train a QR-DQN agent on the Grid2Op environment, evaluate it, and roll it out.

    Steps: build the monitored environment, start a Weights & Biases run,
    train QR-DQN while tracking the action distribution, evaluate over 50
    episodes and log the results, then run the trained policy for 1000 steps
    with rendering.
    """
    # Initialize environment and monitor
    env = Gym2OpEnv(attr_to_keep_no_storage)
    env = Monitor(env)

    try:
        # Initialize Wandb
        # NOTE(review): the run name reads "DQN Improvemen 1" while this
        # notebook is titled "QR-DQN - Improvement 2" — confirm the intended label.
        run = wandb.init(
            project="RL_project",
            name="DQN Improvemen 1",
            sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
            monitor_gym=True,  # auto-upload the videos of agents playing the game
            save_code=False,  # optional
        )

        # Initialize QR-DQN model
        policy_kwargs = dict(n_quantiles=8)
        model2 = QRDQN(
            "MlpPolicy",
            env,
            device=device,
            policy_kwargs=policy_kwargs,
            tensorboard_log="./dqn_grid2op_tensorboard/",
        )
        # Initialize custom callback to track actions
        action_callback = ActionTrackerCallback()

        # Training the model with Wandb and custom action tracking callback
        model2.learn(
            total_timesteps=1000000,
            log_interval=10,
            callback=[
                WandbCallback(gradient_save_freq=100, model_save_path=f"models/{run.id}", verbose=1),
                action_callback,
            ],
            progress_bar=True
        )

        # After training, print the action distribution
        action_distribution = action_callback.get_action_distribution()

        print("\nFinal Action Distribution:")
        for action, freq in sorted(action_distribution.items(), key=lambda x: x[1], reverse=True):
            print(f"Action: {action}, Frequency: {freq}")

        # Evaluating the trained model
        print("\nEvaluating the trained model")
        (
            mean_reward, std_reward,
            mean_length, std_length,
            mean_n1, std_n1,
            mean_l2rpn, std_l2rpn,
        ) = evaluate_model(model2, env, num_episodes=50)
        wandb.log({
            "eval/mean_total_reward": mean_reward,
            "eval/std_total_reward": std_reward,
            "eval/mean_episode_length": mean_length,
            "eval/std_episode_length": std_length,
            "eval/mean_N1_reward": mean_n1,
            "eval/std_N1_reward": std_n1,
            "eval/mean_L2RPN_reward": mean_l2rpn,
            "eval/std_L2RPN_reward": std_l2rpn
        })

        print(f"\nEvaluation Results:")
        print(f"Mean Total Reward: {mean_reward:.2f} +/- {std_reward:.2f}")
        print(f"Mean Episode Length: {mean_length:.2f} +/- {std_length:.2f}")
        print(f"Mean N1 Reward: {mean_n1:.2f} +/- {std_n1:.2f}")
        print(f"Mean L2RPN Reward: {mean_l2rpn:.2f} +/- {std_l2rpn:.2f}")
        run.finish()

        # Test the trained model
        obs, _ = env.reset()  # Get initial observation and info
        for _ in range(1000):
            # Predict action using only the observation part of the returned tuple
            action, _states = model2.predict(obs, deterministic=True)
            obs, _, terminated, truncated, _ = env.step(action)
            env.render()
            # Reset on either end-of-episode signal; stepping on after a
            # truncation would otherwise hit an environment that needs reset.
            if terminated or truncated:
                obs, _ = env.reset()
    finally:
        # Release the environment even if training or evaluation raises.
        env.close()


if __name__ == "__main__":
    main()