In [7]:
import os
import json
import math
import numpy as np
import tensorflow as tf
import torch

import grid2op
from d3qn.adversary import D3QN_Opponent
from grid2op.Agent import DoNothingAgent
from grid2op.Action import TopologyChangeAndDispatchAction
from grid2op.Reward import CombinedScaledReward, L2RPNSandBoxScore, L2RPNReward, GameplayReward
from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as cfg

from kaist_agent.Kaist import Kaist

In [8]:
MAX_TIMESTEP = 7 * 288

def train_adversary(env, agent, opponent, num_pre_training_steps, n_iter, save_path, log_path):
    # Make sure we can fill the experience buffer
    if num_pre_training_steps < opponent.batch_size * opponent.num_frames:
        num_pre_training_steps = opponent.batch_size * opponent.num_frames
        
    # Loop vars
    num_training_steps = n_iter
    num_steps = num_pre_training_steps + num_training_steps
    step = 0
    alive_steps = 0
    total_reward = 0
    done = True
    print(f"Total number of steps: {num_steps}")

    # Create file system related vars
    logpath = os.path.join(log_path, opponent.name)
    os.makedirs(save_path, exist_ok=True)
    modelpath = os.path.join(save_path, opponent.name + ".h5")
    opponent.tf_writer = tf.summary.create_file_writer(logpath, name=opponent.name)
    opponent._save_hyperparameters(save_path, env, num_steps)
    
    while step < num_steps:
        # Init first time or new episode
        if done:
            new_obs = env.reset() # This shouldn't raise
            agent.reset(new_obs)
            opponent.reset(new_obs)
            done = False
        if cfg.VERBOSE and step % 1000 == 0:
            print("Step [{}] -- Random [{}]".format(step, opponent.epsilon))

        # Save current observation to stacking buffer
        opponent._save_current_frame(opponent.state)

        # Execute attack if allowed
        if step <= num_pre_training_steps:
            opponent.remaining_time = 0
            attack, a = opponent._do_nothing, 0
        else:
            attack, a = opponent.attack(new_obs)

        if a != 0:
#             print(f'ATTACK step {step}: disconnected {a}')
            attack_obs, opp_reward, done, info = env.step(attack)
            if info["is_illegal"] or info["is_ambiguous"] or \
               info["is_dispatching_illegal"] or info["is_illegal_reco"]:
                if cfg.VERBOSE:
                    print(attack, info)
            new_obs = attack_obs
            opponent.tell_attack_continues(None, None, None, None)

        while opponent.remaining_time >= 0 and not done:
            new_obs.time_before_cooldown_line[opponent.attack_line] = opponent.remaining_time
            response = agent.act(new_obs, None, None)
            new_obs, reward, done, info = env.step(response)
            opponent.remaining_time -= 1
            total_reward += reward
            alive_steps += 1
        
        # Save new observation to stacking buffer
        new_state = opponent.convert_obs(new_obs)
        opponent._save_next_frame(new_state)

        # Save to experience buffer
        if len(opponent.frames2) == opponent.num_frames:
            opponent.per_buffer.add(np.array(opponent.frames),
                                a, -1 * reward,
                                np.array(opponent.frames2),
                                opponent.done)


        # Perform training when we have enough experience in buffer
        if step >= num_pre_training_steps:
            training_step = step - num_pre_training_steps
            # Decay chance of random action
            opponent.epsilon = opponent._adaptive_epsilon_decay(training_step)

            # Perform training at given frequency
            if step % cfg.UPDATE_FREQ == 0 and \
               len(opponent.per_buffer) >= opponent.batch_size:
                # Perform training
                opponent._batch_train(training_step, step)

                if cfg.UPDATE_TARGET_SOFT_TAU > 0.0:
                    tau = cfg.UPDATE_TARGET_SOFT_TAU
                    # Update target network towards primary network
                    opponent.policy_net.update_target_soft(opponent.target_net.model, tau)

            # Every UPDATE_TARGET_HARD_FREQ trainings, update target completely
            if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \
               step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0:
                opponent.policy_net.update_target_hard(opponent.target_net.model)
        
        if done:
            opponent.epoch_rewards.append(-1 * total_reward)
            opponent.epoch_alive.append(alive_steps)
            if cfg.VERBOSE and step > num_pre_training_steps:
                print("step {}: Agent survived [{}] steps with reward {}".format(step, alive_steps, total_reward))
            alive_steps = 0
            total_reward = 0         
        else:
            alive_steps += 1
            
        ######## After Each Step #######
        if step > 0 and step % 2000 == 0: # save network every 5000 iters
            opponent.save(modelpath)
        step += 1
        # Make new obs the current obs
        opponent.obs = new_obs
        opponent.state = new_state

    # Save model after all steps
    opponent.save(modelpath)

In [9]:
env_name = 'l2rpn_wcci_2020'
env = grid2op.make(env_name, reward_class=CombinedScaledReward)

# Agent 
agent_name = "kaist"
data_dir = os.path.join('kaist_agent/data')
with open(os.path.join(data_dir, 'param.json'), 'r', encoding='utf-8') as f:
    param = json.load(f)
print(param)
state_mean = torch.load(os.path.join(data_dir, 'mean.pt'), map_location=param['device']).cpu()
state_std = torch.load(os.path.join(data_dir, 'std.pt'), map_location=param['device']).cpu()
state_std = state_std.masked_fill(state_std<1e-5, 1.)
state_mean[0, sum(env.observation_space.shape[:20]):] = 0
state_std[0, sum(env.observation_space.shape[:20]):] = 1
agent = Kaist(env, state_mean, state_std, name=agent_name, **param)
agent.sim_trial = 0
agent.load_model(data_dir)

# Opponent 
opponent_name = "D3QN_kaist"
num_pre_training_steps = 256
learning_rate = 5e-5
initial_epsilon = 0.99
final_epsilon = 0.01
decay_epsilon = 5000
attack_period = 20
lines = ['0_4_2', '10_11_11', '11_12_13', '12_13_14', '12_16_20', 
            '13_14_15', '13_15_16', '14_16_17', '14_35_53', '15_16_21', 
            '16_17_22', '16_18_23', '16_21_27', '16_21_28', '16_33_48', 
            '16_33_49', '16_35_54', '17_24_33', '18_19_24', '18_25_35', 
            '19_20_25', '1_10_12', '1_3_3', '1_4_4', '20_21_26', 
            '21_22_29', '21_23_30', '21_26_36', '22_23_31', '22_26_39', 
            '23_24_32', '23_25_34', '23_26_37', '23_26_38', '26_27_40', 
            '26_28_41', '26_30_56', '27_28_42', '27_29_43', '28_29_44', 
            '28_31_57', '29_33_50', '29_34_51', '2_3_0', '2_4_1', 
            '30_31_45', '31_32_47', '32_33_58', '33_34_52', '4_5_55', 
            '4_6_5', '4_7_6', '5_32_46', '6_7_7', '7_8_8', 
            '7_9_9', '8_9_10', '9_16_18', '9_16_19']

opponent = D3QN_Opponent(env.action_space, env.observation_space, lines_attacked=lines, attack_period=attack_period,
            name=opponent_name, is_training=True, learning_rate=learning_rate,
            initial_epsilon=initial_epsilon, final_epsilon=final_epsilon, decay_epsilon=decay_epsilon)

{'head_number': 8, 'n_history': 12, 'state_dim': 128, 'dropout': 0.0, 'sim_trial': 15, 'threshold': 0.35, 'max_low_len': 19, 'danger': 0.9, 'mask': 3, 'mask_hi': 19, 'use_order': True, 'device': 'cpu'}
O: 72 S: 128 A: 108 (19)
['2_3_0' '2_4_1' '0_4_2' '1_3_3' '1_4_4' '4_6_5' '4_7_6' '6_7_7' '7_8_8'
 '7_9_9' '8_9_10' '10_11_11' '1_10_12' '11_12_13' '12_13_14' '13_14_15'
 '13_15_16' '14_16_17' '9_16_18' '9_16_19' '12_16_20' '15_16_21'
 '16_17_22' '16_18_23' '18_19_24' '19_20_25' '20_21_26' '16_21_27'
 '16_21_28' '21_22_29' '21_23_30' '22_23_31' '23_24_32' '17_24_33'
 '23_25_34' '18_25_35' '21_26_36' '23_26_37' '23_26_38' '22_26_39'
 '26_27_40' '26_28_41' '27_28_42' '27_29_43' '28_29_44' '30_31_45'
 '5_32_46' '31_32_47' '16_33_48' '16_33_49' '29_33_50' '29_34_51'
 '33_34_52' '14_35_53' '16_35_54' '4_5_55' '26_30_56' '28_31_57'
 '32_33_58']


In [10]:
# Training
n_iter = 5000
# Register custom reward for training
cr = env._reward_helper.template_reward
#cr.addReward("overflow", CloseToOverflowReward(), 1.0)
cr.addReward("game", GameplayReward(), 1.0)
#cr.addReward("recolines", LinesReconnectedReward(), 1.0)
cr.addReward("l2rpn", L2RPNReward(), 2.0/float(env.n_line))
# Initialize custom rewards
cr.initialize(env)
# Set reward range to something managable
cr.set_range(-1.0, 1.0)

save_path = "kaist_agent_D3QN_opponent_{}_{}".format(attack_period, n_iter)
log_path="tf_logs_D3QN"

train_adversary(env, agent, opponent, num_pre_training_steps, n_iter, save_path, log_path)

Total number of steps: 5256
Step [0] -- Random [0.99]
step 266: Agent survived [622] steps with reward 305.9651598930359
step 272: Agent survived [41] steps with reward 25.828663289546967
step 278: Agent survived [44] steps with reward 27.575585186481476
loss = 109489.51
step 287: Agent survived [68] steps with reward 43.43529826402664
step 302: Agent survived [149] steps with reward 91.74529415369034
step 313: Agent survived [95] steps with reward 67.10616558790207
step 318: Agent survived [23] steps with reward 16.47074830532074
step 329: Agent survived [104] steps with reward 74.5729193687439
loss = 43257.875
step 337: Agent survived [59] steps with reward 42.027507066726685
step 347: Agent survived [83] steps with reward 55.785299479961395
step 360: Agent survived [125] steps with reward 85.89805418252945
step 370: Agent survived [83] steps with reward 61.21436274051666
step 380: Agent survived [86] steps with reward 55.96154886484146
step 386: Agent survived [41] steps with reward

step 1124: Agent survived [11] steps with reward 6.774989366531372
step 1146: Agent survived [236] steps with reward 166.80284082889557
step 1153: Agent survived [47] steps with reward 32.25837832689285
step 1160: Agent survived [47] steps with reward 32.12467420101166
step 1168: Agent survived [62] steps with reward 45.271926164627075
loss = 5933.5986
step 1180: Agent survived [116] steps with reward 85.06474888324738
step 1186: Agent survived [32] steps with reward 21.31094402074814
step 1191: Agent survived [29] steps with reward 18.872675716876984
step 1209: Agent survived [182] steps with reward 131.97413992881775
step 1213: Agent survived [14] steps with reward 6.118500232696533
step 1218: Agent survived [26] steps with reward 14.799134612083435
step 1228: Agent survived [86] steps with reward 62.307415187358856
loss = 13107.638
step 1234: Agent survived [38] steps with reward 26.613938808441162
step 1242: Agent survived [62] steps with reward 43.31673884391785
step 1247: Agent s

step 2057: Agent survived [119] steps with reward 92.7317344546318
step 2068: Agent survived [101] steps with reward 73.76946115493774
loss = 34156.945
step 2074: Agent survived [35] steps with reward 25.109918475151062
step 2083: Agent survived [74] steps with reward 57.56542372703552
step 2094: Agent survived [101] steps with reward 72.20702522993088
step 2099: Agent survived [23] steps with reward 13.899448573589325
step 2108: Agent survived [77] steps with reward 52.176638305187225
step 2116: Agent survived [62] steps with reward 42.98801392316818
loss = 18487.361
step 2130: Agent survived [134] steps with reward 100.60043799877167
step 2141: Agent survived [98] steps with reward 72.81356489658356
step 2149: Agent survived [59] steps with reward 44.62063026428223
step 2171: Agent survived [233] steps with reward 169.5452790260315
loss = 22827.75
step 2186: Agent survived [146] steps with reward 109.46653825044632
step 2190: Agent survived [14] steps with reward 7.380891799926758
st

step 3128: Agent survived [26] steps with reward 16.000807523727417
loss = 9350.234
step 3138: Agent survived [86] steps with reward 61.45883822441101
step 3142: Agent survived [14] steps with reward 5.959876656532288
step 3151: Agent survived [74] steps with reward 55.20153284072876
step 3162: Agent survived [98] steps with reward 70.09688293933868
step 3170: Agent survived [62] steps with reward 39.8563197851181
step 3179: Agent survived [71] steps with reward 51.99316471815109
step 3189: Agent survived [89] steps with reward 64.47385013103485
loss = 9232.734
step 3198: Agent survived [74] steps with reward 52.73938947916031
step 3218: Agent survived [203] steps with reward 146.93529379367828
step 3224: Agent survived [41] steps with reward 29.38334596157074
step 3229: Agent survived [23] steps with reward 16.131144285202026
step 3237: Agent survived [65] steps with reward 45.44643718004227
step 3247: Agent survived [86] steps with reward 63.619648575782776
loss = 8446.254
step 3252:

step 4085: Agent survived [47] steps with reward 32.88774049282074
loss = 8290.16
step 4096: Agent survived [98] steps with reward 71.51432198286057
step 4108: Agent survived [113] steps with reward 82.7127189040184
step 4122: Agent survived [131] steps with reward 96.03501015901566
step 4130: Agent survived [59] steps with reward 43.0027619600296
step 4137: Agent survived [59] steps with reward 39.96928060054779
loss = 5910.961
step 4145: Agent survived [56] steps with reward 39.191299736499786
step 4150: Agent survived [26] steps with reward 17.006691694259644
step 4165: Agent survived [143] steps with reward 101.86745148897171
step 4172: Agent survived [53] steps with reward 36.90305668115616
step 4179: Agent survived [50] steps with reward 31.474450409412384
step 4185: Agent survived [38] steps with reward 24.02320009469986
step 4189: Agent survived [14] steps with reward 7.4701114892959595
loss = 2857.7742
step 4204: Agent survived [153] steps with reward 104.26078885793686
step 4

step 4887: Agent survived [29] steps with reward 17.47747230529785
step 4891: Agent survived [15] steps with reward 6.047279417514801
step 4896: Agent survived [22] steps with reward 14.496838927268982
step 4900: Agent survived [14] steps with reward 8.51778507232666
step 4906: Agent survived [38] steps with reward 21.525287330150604
step 4911: Agent survived [26] steps with reward 15.73215264081955
step 4917: Agent survived [38] steps with reward 25.544349551200867
step 4922: Agent survived [26] steps with reward 17.757930517196655
loss = 57.7219
step 4929: Agent survived [59] steps with reward 32.85921984910965
step 4935: Agent survived [32] steps with reward 16.76658171415329
step 4945: Agent survived [92] steps with reward 53.086640655994415
step 4951: Agent survived [29] steps with reward 16.098089039325714
step 4955: Agent survived [14] steps with reward 8.458327770233154
step 4960: Agent survived [29] steps with reward 13.203466534614563
step 4965: Agent survived [23] steps with

In [11]:
np.insert(np.array([True, True, True]), 0, False, axis=0)

array([False,  True,  True,  True])

In [None]:
from grid2op import make
from grid2op.Runner import Runner
from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward

In [None]:
nb_episode = 10 # number of episodes to evaluate
log_path = './logs-evals'
nb_process = 1 # number of cores to use
max_iter = 150 # maximum number of steps per scenario
verbose = True
save_gif = False

In [None]:
env_name = 'l2rpn_wcci_2020'
env = make(env_name, reward_class=L2RPNSandBoxScore,
           other_rewards={
               "reward": L2RPNReward
           })

agent_name = "kaist"
data_dir = os.path.join('kaist_agent/data')
with open(os.path.join(data_dir, 'param.json'), 'r', encoding='utf-8') as f:
    param = json.load(f)

state_mean = torch.load(os.path.join(data_dir, 'mean.pt'), map_location=param['device']).cpu()
state_std = torch.load(os.path.join(data_dir, 'std.pt'), map_location=param['device']).cpu()
state_std = state_std.masked_fill(state_std<1e-5, 1.)
state_mean[0, sum(env.observation_space.shape[:20]):] = 0
state_std[0, sum(env.observation_space.shape[:20]):] = 1
agent = Kaist(env, state_mean, state_std, name=agent_name, **param)
agent.sim_trial = 0
agent.load_model(data_dir)
    
runner_params = env.get_params_for_runner()
runner_params["verbose"] = False
runner = Runner(**runner_params, agentClass=None, agentInstance=agent)
    
res = runner.run(path_save=log_path, nb_episode=nb_episode, nb_process=nb_process, max_iter=150)
if verbose:
    print("Evaluation summary:")
    for _, chron_name, cum_reward, nb_time_step, max_ts in res:
        msg_tmp = "chronics at: {}".format(chron_name)
        msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward)
        msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step,
                                                        max_ts)
        print(msg_tmp)

if save_gif:
    save_log_gif(log_path, res)