In [13]:
import os
import json
import math
import numpy as np
import tensorflow as tf
import torch

import grid2op
from d3qn.adversary import D3QN_Opponent
from grid2op.Agent import DoNothingAgent
from grid2op.Action import TopologyChangeAndDispatchAction
from grid2op.Reward import CombinedScaledReward, L2RPNSandBoxScore, L2RPNReward, GameplayReward
from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as cfg

from kaist_agent.Kaist import Kaist

In [14]:
# Upper bound on time steps; not referenced by the code visible in this notebook.
# NOTE(review): presumably 7 days x 288 steps per day -- confirm against the chronics.
MAX_TIMESTEP = 288 * 7

def train_adversary(env, agent, opponent, num_pre_training_steps, n_iter, save_path, log_path):
    """Train ``opponent`` to attack the grid while ``agent`` operates it.

    Each outer iteration ("step") is one opponent decision: the opponent either
    does nothing or plays an attack action, then the agent acts in the
    environment for as long as ``opponent.remaining_time >= 0``. The transition
    stored in the opponent's replay buffer uses the *negated* reward of the
    last environment step of the iteration (what is good for the agent is bad
    for the opponent).

    Parameters
    ----------
    env : environment exposing ``reset()`` and ``step(action)`` returning
        ``(obs, reward, done, info)``.
    agent : defending agent exposing ``reset(obs)`` and ``act(obs, reward, done)``.
    opponent : adversary being trained; its replay buffer, networks, frame
        stacks and bookkeeping attributes are read and mutated here.
    num_pre_training_steps : number of initial steps during which the opponent
        never attacks; raised to ``batch_size * num_frames`` if smaller.
    n_iter : number of training steps performed after pre-training.
    save_path : directory (created if missing) that receives ``<name>.h5``.
    log_path : parent directory of the TensorFlow summary logs.

    Returns
    -------
    None. The model is saved every 2000 steps and once more at the end.
    """
    # Make sure we can fill the experience buffer
    num_pre_training_steps = max(num_pre_training_steps,
                                 opponent.batch_size * opponent.num_frames)

    # Loop vars
    num_training_steps = n_iter
    num_steps = num_pre_training_steps + num_training_steps
    step = 0
    alive_steps = 0
    total_reward = 0
    # Bound up front so the buffer write below cannot hit an unbound name when
    # an iteration performs no environment step at all.
    reward = 0.0
    done = True
    print(f"Total number of steps: {num_steps}")

    # Create file system related vars
    logpath = os.path.join(log_path, opponent.name)
    os.makedirs(save_path, exist_ok=True)
    modelpath = os.path.join(save_path, opponent.name + ".h5")
    opponent.tf_writer = tf.summary.create_file_writer(logpath, name=opponent.name)
    opponent._save_hyperparameters(save_path, env, num_steps)

    while step < num_steps:
        # Init first time or new episode
        if done:
            new_obs = env.reset() # This shouldn't raise
            agent.reset(new_obs)
            opponent.reset(new_obs)
            # A fresh episode is running again; without this the flag would
            # stay True until the next env.step call.
            done = False
        if cfg.VERBOSE and step % 1000 == 0:
            print("Step [{}] -- Random [{}]".format(step, opponent.epsilon))

        # Save current observation to stacking buffer
        opponent._save_current_frame(opponent.state)

        # Execute attack if allowed (never during pre-training)
        if step <= num_pre_training_steps:
            opponent.remaining_time = 0
            attack, a = opponent._do_nothing, 0
        else:
            attack, a = opponent.attack(new_obs)

        if a != 0:
            print(f'ATTACK step {step}: disconnected {a}')
            attack_obs, opp_reward, done, info = env.step(attack)
            if info["is_illegal"] or info["is_ambiguous"] or \
               info["is_dispatching_illegal"] or info["is_illegal_reco"]:
                if cfg.VERBOSE:
                    print(attack, info)
            new_obs = attack_obs
            # If the attack ends the episode, this is the reward of the
            # transition; otherwise the agent loop below overwrites it.
            reward = opp_reward
            opponent.tell_attack_continues(None, None, None, None)

        # Let the agent respond while the attack lasts. Skipped entirely when
        # the attack already terminated the episode: a finished environment
        # must not be stepped again before reset().
        while not done and opponent.remaining_time >= 0:
            new_obs.time_before_cooldown_line[opponent.attack_line] = opponent.remaining_time
            response = agent.act(new_obs, None, None)
            new_obs, reward, done, info = env.step(response)
            opponent.remaining_time -= 1

        # Save new observation to stacking buffer
        new_state = opponent.convert_obs(new_obs)
        opponent._save_next_frame(new_state)

        # Save to experience buffer. The terminal flag is the one returned by
        # the environment in this iteration (opponent.done is not updated by
        # this loop, so it cannot be used here).
        if len(opponent.frames2) == opponent.num_frames:
            opponent.per_buffer.add(np.array(opponent.frames),
                                    a, -1 * reward,
                                    np.array(opponent.frames2),
                                    done)

        total_reward += reward

        # Perform training when we have enough experience in buffer
        if step >= num_pre_training_steps:
            training_step = step - num_pre_training_steps
            # Decay chance of random action
            opponent.epsilon = opponent._adaptive_epsilon_decay(training_step)

            # Perform training at given frequency
            if step % cfg.UPDATE_FREQ == 0 and \
               len(opponent.per_buffer) >= opponent.batch_size:
                # Perform training
                opponent._batch_train(training_step, step)

                if cfg.UPDATE_TARGET_SOFT_TAU > 0.0:
                    tau = cfg.UPDATE_TARGET_SOFT_TAU
                    # Update target network towards primary network
                    opponent.policy_net.update_target_soft(opponent.target_net.model, tau)

            # Every UPDATE_TARGET_HARD_FREQ trainings, update target completely
            if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \
               step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0:
                opponent.policy_net.update_target_hard(opponent.target_net.model)

        if done:
            # Episode bookkeeping, from the opponent's point of view
            opponent.epoch_rewards.append(-1 * total_reward)
            opponent.epoch_alive.append(alive_steps)
            if cfg.VERBOSE and step > num_pre_training_steps:
                print("step {}: Survived [{}] steps".format(step, alive_steps))
                print("Total reward of agent [{}]".format(total_reward))
            alive_steps = 0
            total_reward = 0
        else:
            alive_steps += 1

        ######## After Each Step #######
        if step > 0 and step % 2000 == 0: # save network every 2000 iters
            opponent.save(modelpath)
        step += 1
        # Make new obs the current obs
        opponent.obs = new_obs
        opponent.state = new_state

    # Save model after all steps
    opponent.save(modelpath)

In [15]:
# ---------------------------------------------------------------------------
# Run settings (plain values, gathered up front)
# ---------------------------------------------------------------------------
env_name = 'l2rpn_wcci_2020'
agent_name = "kaist"
opponent_name = "D3QN_kaist"

num_pre_training_steps = 256
n_iter = 10000
learning_rate = 1e-4
initial_epsilon = 0.99
final_epsilon = 0.01
decay_epsilon = 10000
attack_period = 20

save_path = "kaist_agent_D3QN_opponent_{}_{}".format(attack_period, n_iter)
log_path = "tf_logs_D3QN"

# Power lines the opponent is allowed to attack.
lines = [
    '0_4_2', '10_11_11', '11_12_13', '12_13_14', '12_16_20', '13_14_15',
    '13_15_16', '14_16_17', '14_35_53', '15_16_21', '16_17_22', '16_18_23',
    '16_21_27', '16_21_28', '16_33_48', '16_33_49', '16_35_54', '17_24_33',
    '18_19_24', '18_25_35', '19_20_25', '1_10_12', '1_3_3', '1_4_4',
    '20_21_26', '21_22_29', '21_23_30', '21_26_36', '22_23_31', '22_26_39',
    '23_24_32', '23_25_34', '23_26_37', '23_26_38', '26_27_40', '26_28_41',
    '26_30_56', '27_28_42', '27_29_43', '28_29_44', '28_31_57', '29_33_50',
    '29_34_51', '2_3_0', '2_4_1', '30_31_45', '31_32_47', '32_33_58',
    '33_34_52', '4_5_55', '4_6_5', '4_7_6', '5_32_46', '6_7_7',
    '7_8_8', '7_9_9', '8_9_10', '9_16_18', '9_16_19',
]

# ---------------------------------------------------------------------------
# Environment (default action class; TopologyChangeAndDispatchAction was
# tried and is currently disabled)
# ---------------------------------------------------------------------------
env = grid2op.make(env_name, reward_class=CombinedScaledReward)

# ---------------------------------------------------------------------------
# Agent: load the Kaist parameters and the normalisation statistics
# ---------------------------------------------------------------------------
data_dir = os.path.join('kaist_agent/data')
with open(os.path.join(data_dir, 'param.json'), 'r', encoding='utf-8') as f:
    param = json.load(f)
print(param)

state_mean = torch.load(os.path.join(data_dir, 'mean.pt'), map_location=param['device']).cpu()
state_std = torch.load(os.path.join(data_dir, 'std.pt'), map_location=param['device']).cpu()
# Near-zero standard deviations are replaced by 1.
state_std = state_std.masked_fill(state_std < 1e-5, 1.)
# Entries past the first 20 observation attributes get mean 0 / std 1.
feature_cutoff = sum(env.observation_space.shape[:20])
state_mean[0, feature_cutoff:] = 0
state_std[0, feature_cutoff:] = 1

agent = Kaist(env, state_mean, state_std, name=agent_name, **param)
agent.sim_trial = 0
agent.load_model(data_dir)

# ---------------------------------------------------------------------------
# Opponent
# ---------------------------------------------------------------------------
opponent = D3QN_Opponent(
    env.action_space,
    env.observation_space,
    lines_attacked=lines,
    attack_period=attack_period,
    name=opponent_name,
    is_training=True,
    learning_rate=learning_rate,
    initial_epsilon=initial_epsilon,
    final_epsilon=final_epsilon,
    decay_epsilon=decay_epsilon,
)

# ---------------------------------------------------------------------------
# Training reward: register the terms on the environment's combined reward.
# Disabled terms previously tried: "overflow" (CloseToOverflowReward) and
# "recolines" (LinesReconnectedReward).
# ---------------------------------------------------------------------------
cr = env._reward_helper.template_reward
cr.addReward("game", GameplayReward(), 1.0)
cr.addReward("l2rpn", L2RPNReward(), 2.0/float(env.n_line))
# Initialize custom rewards
cr.initialize(env)
# Set reward range to something manageable
cr.set_range(-1.0, 1.0)

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
train_adversary(env, agent, opponent, num_pre_training_steps, n_iter, save_path, log_path)

{'head_number': 8, 'n_history': 12, 'state_dim': 128, 'dropout': 0.0, 'sim_trial': 15, 'threshold': 0.35, 'max_low_len': 19, 'danger': 0.9, 'mask': 3, 'mask_hi': 19, 'use_order': True, 'device': 'cpu'}
O: 72 S: 128 A: 108 (19)
['2_3_0' '2_4_1' '0_4_2' '1_3_3' '1_4_4' '4_6_5' '4_7_6' '6_7_7' '7_8_8'
 '7_9_9' '8_9_10' '10_11_11' '1_10_12' '11_12_13' '12_13_14' '13_14_15'
 '13_15_16' '14_16_17' '9_16_18' '9_16_19' '12_16_20' '15_16_21'
 '16_17_22' '16_18_23' '18_19_24' '19_20_25' '20_21_26' '16_21_27'
 '16_21_28' '21_22_29' '21_23_30' '22_23_31' '23_24_32' '17_24_33'
 '23_25_34' '18_25_35' '21_26_36' '23_26_37' '23_26_38' '22_26_39'
 '26_27_40' '26_28_41' '27_28_42' '27_29_43' '28_29_44' '30_31_45'
 '5_32_46' '31_32_47' '16_33_48' '16_33_49' '29_33_50' '29_34_51'
 '33_34_52' '14_35_53' '16_35_54' '4_5_55' '26_30_56' '28_31_57'
 '32_33_58']
Total number of steps: 10256
Step [0] -- Random [0.99]
ATTACK step 257: disconnected 17
ATTACK step 258: disconnected 17
ATTACK step 259: disconnected 


Mean of empty slice.


invalid value encountered in double_scalars



loss = 93657.44
ATTACK step 309: disconnected 44
step 309: Survived [309] steps
Total reward of agent [269.82234835624695]
ATTACK step 313: disconnected 44
step 313: Survived [3] steps
Total reward of agent [1.5816912651062012]
ATTACK step 317: disconnected 40
ATTACK step 318: disconnected 47
ATTACK step 319: disconnected 40
ATTACK step 320: disconnected 47
ATTACK step 321: disconnected 40
ATTACK step 322: disconnected 47
ATTACK step 323: disconnected 40
ATTACK step 324: disconnected 44
step 324: Survived [10] steps
Total reward of agent [7.66756796836853]
ATTACK step 328: disconnected 44
step 328: Survived [3] steps
Total reward of agent [1.5476787090301514]
ATTACK step 332: disconnected 47
ATTACK step 333: disconnected 47
ATTACK step 334: disconnected 47
ATTACK step 335: disconnected 47
ATTACK step 336: disconnected 47
loss = 230619.36
ATTACK step 337: disconnected 47
ATTACK step 338: disconnected 47
ATTACK step 339: disconnected 47
ATTACK step 341: disconnected 47
ATTACK step 342: d

ATTACK step 1136: disconnected 45
ATTACK step 1137: disconnected 45
step 1137: Survived [4] steps
Total reward of agent [2.450249195098877]
ATTACK step 1141: disconnected 45
ATTACK step 1142: disconnected 45
step 1142: Survived [4] steps
Total reward of agent [2.504326343536377]
ATTACK step 1146: disconnected 45
ATTACK step 1147: disconnected 45
step 1147: Survived [4] steps
Total reward of agent [2.446071982383728]
ATTACK step 1151: disconnected 45
ATTACK step 1152: disconnected 45
step 1152: Survived [4] steps
Total reward of agent [2.484434723854065]
ATTACK step 1156: disconnected 45
ATTACK step 1157: disconnected 45
step 1157: Survived [4] steps
Total reward of agent [2.5183258056640625]
ATTACK step 1161: disconnected 45
ATTACK step 1162: disconnected 45
step 1162: Survived [4] steps
Total reward of agent [2.4634045362472534]
ATTACK step 1166: disconnected 45
ATTACK step 1167: disconnected 45
step 1167: Survived [4] steps
Total reward of agent [2.484995722770691]
ATTACK step 1171: 

ATTACK step 1423: disconnected 45
ATTACK step 1424: disconnected 45
step 1424: Survived [4] steps
Total reward of agent [1.5717024207115173]
ATTACK step 1428: disconnected 45
ATTACK step 1429: disconnected 45
step 1429: Survived [4] steps
Total reward of agent [2.4289162158966064]
ATTACK step 1433: disconnected 45
ATTACK step 1434: disconnected 45
step 1434: Survived [4] steps
Total reward of agent [2.432889938354492]
ATTACK step 1438: disconnected 45
ATTACK step 1439: disconnected 45
step 1439: Survived [4] steps
Total reward of agent [2.4178311824798584]
ATTACK step 1443: disconnected 45
ATTACK step 1444: disconnected 45
step 1444: Survived [4] steps
Total reward of agent [2.4381682872772217]
ATTACK step 1448: disconnected 45
ATTACK step 1449: disconnected 45
step 1449: Survived [4] steps
Total reward of agent [2.459296941757202]
ATTACK step 1453: disconnected 45
ATTACK step 1454: disconnected 45
step 1454: Survived [4] steps
Total reward of agent [2.4718363285064697]
loss = 684.8647

ATTACK step 1709: disconnected 45
step 1709: Survived [4] steps
Total reward of agent [2.412395715713501]
ATTACK step 1713: disconnected 45
ATTACK step 1714: disconnected 45
step 1714: Survived [4] steps
Total reward of agent [2.4504783153533936]
ATTACK step 1718: disconnected 45
ATTACK step 1719: disconnected 45
step 1719: Survived [4] steps
Total reward of agent [2.4935693740844727]
ATTACK step 1723: disconnected 45
ATTACK step 1724: disconnected 45
step 1724: Survived [4] steps
Total reward of agent [2.463309645652771]
ATTACK step 1728: disconnected 45
ATTACK step 1729: disconnected 45
step 1729: Survived [4] steps
Total reward of agent [2.4466822147369385]
ATTACK step 1733: disconnected 45
ATTACK step 1734: disconnected 45
step 1734: Survived [4] steps
Total reward of agent [2.397442579269409]
loss = 1249.6381
ATTACK step 1738: disconnected 45
ATTACK step 1739: disconnected 45
step 1739: Survived [4] steps
Total reward of agent [2.4387524127960205]
ATTACK step 1743: disconnected 45

KeyboardInterrupt: 

In [10]:
from grid2op import make
from grid2op.Runner import Runner
from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward

In [11]:
# Evaluation settings read by the evaluation cell.

# How much to run
nb_episode = 10    # scenarios to evaluate
max_iter = 150     # step cap per scenario
nb_process = 1     # worker processes (cores) used by the runner

# Where results go and how they are reported
log_path = './logs-evals'
verbose = True
save_gif = False

In [12]:
# Evaluate the Kaist agent alone (no opponent) with the grid2op Runner.
# Reads nb_episode, log_path, nb_process, max_iter, verbose and save_gif
# from the evaluation settings cell.
env_name = 'l2rpn_wcci_2020'
env = make(env_name, reward_class=L2RPNSandBoxScore,
           other_rewards={
               "reward": L2RPNReward
           })

# Agent: same loading procedure as in the training cell
agent_name = "kaist"
data_dir = os.path.join('kaist_agent/data')
with open(os.path.join(data_dir, 'param.json'), 'r', encoding='utf-8') as f:
    param = json.load(f)

state_mean = torch.load(os.path.join(data_dir, 'mean.pt'), map_location=param['device']).cpu()
state_std = torch.load(os.path.join(data_dir, 'std.pt'), map_location=param['device']).cpu()
# Near-zero standard deviations are replaced by 1.
state_std = state_std.masked_fill(state_std<1e-5, 1.)
# Entries past the first 20 observation attributes get mean 0 / std 1.
state_mean[0, sum(env.observation_space.shape[:20]):] = 0
state_std[0, sum(env.observation_space.shape[:20]):] = 1
agent = Kaist(env, state_mean, state_std, name=agent_name, **param)
agent.sim_trial = 0
agent.load_model(data_dir)

# The runner is given the already-built agent instance, hence agentClass=None.
runner_params = env.get_params_for_runner()
runner_params["verbose"] = False
runner = Runner(**runner_params, agentClass=None, agentInstance=agent)

# Use the configured step cap instead of a duplicated literal so that changing
# max_iter in the settings cell actually takes effect.
res = runner.run(path_save=log_path, nb_episode=nb_episode, nb_process=nb_process, max_iter=max_iter)
if verbose:
    print("Evaluation summary:")
    for _, chron_name, cum_reward, nb_time_step, max_ts in res:
        msg_tmp = "chronics at: {}".format(chron_name)
        msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward)
        msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step,
                                                        max_ts)
        print(msg_tmp)

if save_gif:
    # Imported here because no visible cell imports it; without this,
    # enabling save_gif raised NameError.
    from l2rpn_baselines.utils.save_log_gif import save_log_gif
    save_log_gif(log_path, res)

O: 72 S: 128 A: 108 (19)
Evaluation summary:
chronics at: Scenario_april_000	total reward: 49896.312500	time steps: 150/150
chronics at: Scenario_april_001	total reward: 53077.667969	time steps: 150/150
chronics at: Scenario_april_002	total reward: 43475.250000	time steps: 150/150
chronics at: Scenario_april_003	total reward: 50524.660156	time steps: 150/150
chronics at: Scenario_april_004	total reward: 55378.964844	time steps: 150/150
chronics at: Scenario_april_005	total reward: 44330.468750	time steps: 150/150
chronics at: Scenario_april_006	total reward: 50290.003906	time steps: 150/150
chronics at: Scenario_april_007	total reward: 62669.691406	time steps: 150/150
chronics at: Scenario_april_008	total reward: 53732.957031	time steps: 150/150
chronics at: Scenario_april_009	total reward: 49542.824219	time steps: 150/150
