In [1]:
import grid2op
import lightsim2grid
import warnings
import os
import tensorflow as tf

from lightsim2grid.LightSimBackend import LightSimBackend
import numpy as np
from agent import Track2PowerNetAgent

from d3qn.adversary import D3QN_Opponent
from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as cfg

# Number of timesteps kept free at the end of a chronic when train_adversary
# picks a random starting point (it subtracts this from the chronic length).
# NOTE(review): presumably 7 days of 288 steps each (train_adversary also
# divides by 288 to get a "max_day") — confirm against the environment.
MAX_TIMESTEP = 7 * 288

# Names of the power lines handed to the opponent via `lines_to_attack`.
# NOTE(review): entries look like "<bus>_<bus>_<line index>" — confirm against
# the environment's own line naming.
LINES = ['0_1_0', '0_2_1', '10_11_2', '69_70_3', '23_71_4', '70_71_5',
       '70_72_6', '69_73_7', '69_74_8', '68_74_9', '73_74_10', '75_76_11',
       '68_76_12', '1_11_13', '74_76_14', '76_77_15', '77_78_16',
       '76_79_17', '76_79_18', '78_79_19', '76_81_20', '81_82_21',
       '82_83_22', '82_84_23', '2_11_24', '83_84_25', '84_85_26',
       '84_87_27', '84_88_28', '87_88_29', '88_89_30', '88_89_31',
       '89_90_32', '88_91_33', '88_91_34', '6_11_35', '90_91_36',
       '91_92_37', '91_93_38', '92_93_39', '93_94_40', '79_95_41',
       '81_95_42', '93_95_43', '79_96_44', '79_97_45', '10_12_46',
       '79_98_47', '91_99_48', '93_99_49', '94_95_50', '95_96_51',
       '97_99_52', '98_99_53', '99_100_54', '91_101_55', '100_101_56',
       '11_13_57', '99_102_58', '99_103_59', '102_103_60', '102_104_61',
       '99_105_62', '103_104_63', '104_105_64', '104_106_65',
       '104_107_66', '105_106_67', '12_14_68', '107_108_69', '102_109_70',
       '108_109_71', '109_110_72', '109_111_73', '16_112_74', '31_112_75',
       '31_113_76', '26_114_77', '113_114_78', '13_14_79', '11_116_80',
       '74_117_81', '75_117_82', '11_15_83', '14_16_84', '3_4_85',
       '15_16_86', '16_17_87', '17_18_88', '18_19_89', '14_18_90',
       '19_20_91', '20_21_92', '21_22_93', '22_23_94', '22_24_95',
       '2_4_96', '24_26_97', '26_27_98', '27_28_99', '7_29_100',
       '25_29_101', '16_30_102', '28_30_103', '22_31_104', '30_31_105',
       '26_31_106', '4_5_107', '14_32_108', '18_33_109', '34_35_110',
       '34_36_111', '32_36_112', '33_35_113', '33_36_114', '36_38_115',
       '36_39_116', '29_37_117', '5_6_118', '38_39_119', '39_40_120',
       '39_41_121', '40_41_122', '42_43_123', '33_42_124', '43_44_125',
       '44_45_126', '45_46_127', '45_47_128', '7_8_129', '46_48_130',
       '41_48_131', '41_48_132', '44_48_133', '47_48_134', '48_49_135',
       '48_50_136', '50_51_137', '51_52_138', '52_53_139', '8_9_140',
       '48_53_141', '48_53_142', '53_54_143', '53_55_144', '54_55_145',
       '55_56_146', '49_56_147', '55_57_148', '50_57_149', '53_58_150',
       '3_10_151', '55_58_152', '55_58_153', '54_58_154', '58_59_155',
       '58_60_156', '59_60_157', '59_61_158', '60_61_159', '62_63_160',
       '37_64_161', '4_10_162', '63_64_163', '48_65_164', '48_65_165',
       '61_65_166', '61_66_167', '65_66_168', '46_68_169', '48_68_170',
       '68_69_171', '23_69_172', '7_4_173', '25_24_174', '80_79_175',
       '86_85_176', '115_67_177', '29_16_178', '37_36_179', '62_58_180',
       '63_60_181', '64_65_182', '64_67_183', '67_68_184', '80_67_185']

In [5]:
def train_adversary(env, agent, opponent, num_pre_training_steps, n_iter, save_path, log_path):
    """Train ``opponent`` against a fixed ``agent`` inside ``env``.

    Each outer iteration ("step") is one opponent decision: the opponent either
    does nothing (pre-training phase) or attacks, then the agent plays until
    ``opponent.remaining_time`` drops below zero or the episode ends. The
    resulting transition is pushed to ``opponent.per_buffer`` and, once the
    pre-training phase is over, the opponent's networks are trained.

    Args:
        env: Environment providing ``reset()``, ``step(action)``,
            ``fast_forward_chronics(n)`` and ``chronics_handler.max_timestep()``.
        agent: Defender; must provide ``act(observation, reward, done)``.
        opponent: Adversary being trained. This function reads and writes
            several of its attributes (``state``, ``obs``, ``epsilon``,
            ``remaining_time``, ``tf_writer``, buffers and networks).
        num_pre_training_steps: Number of initial steps during which the
            opponent only does nothing. Raised to
            ``opponent.batch_size * opponent.num_frames`` if smaller.
        n_iter: Number of steps performed after the pre-training phase.
        save_path: Directory (created if missing) receiving the hyperparameters
            and the model file ``<opponent.name>.h5``.
        log_path: Parent directory of the TensorFlow summary logs; logs are
            written under ``<log_path>/<opponent.name>``.

    Returns:
        None. The trained model is saved to ``save_path``.
    """
    # Make sure we can fill the experience buffer
    min_pre_training_steps = opponent.batch_size * opponent.num_frames
    num_pre_training_steps = max(num_pre_training_steps, min_pre_training_steps)

    # Loop vars
    num_steps = num_pre_training_steps + n_iter
    step = 0
    alive_steps = 0
    total_reward = 0
    # Latest environment reward; bound up front so the experience-buffer write
    # below can never hit an unbound name.
    reward = 0
    done = True
    print(f"Total number of steps: {num_steps}")

    # Create file system related vars
    logpath = os.path.join(log_path, opponent.name)
    os.makedirs(save_path, exist_ok=True)
    modelpath = os.path.join(save_path, opponent.name + ".h5")
    opponent.tf_writer = tf.summary.create_file_writer(logpath, name=opponent.name)
    opponent._save_hyperparameters(save_path, env, num_steps)

    while step < num_steps:
        # Reset environment and agent
        if done:
            new_obs = env.reset()  # This shouldn't raise
            opponent.reset(new_obs)
            done = False
            # use random chronics so that the opponent can learn more scenarios
            max_day = (env.chronics_handler.max_timestep() - MAX_TIMESTEP) // 288
            # np.random.randint rejects a non-positive upper bound; when the
            # chronic is too short to shift, simply start from its beginning.
            if max_day > 0:
                start_timestep = np.random.randint(max_day) * 288 - 1
                if start_timestep > 0:
                    # NOTE(review): new_obs still holds the observation returned
                    # by reset(), i.e. from before the fast-forward — confirm
                    # this is intended.
                    env.fast_forward_chronics(start_timestep)

        # Save current observation to stacking buffer
        opponent._save_current_frame(opponent.state)

        if step <= num_pre_training_steps:
            # Pre-training: no attack, the agent plays a single step below.
            opponent.remaining_time = 0
            attack, a = opponent._do_nothing, 0
        else:
            # Execute the attack chosen by the opponent
            attack, a = opponent.attack(new_obs)
            attack_obs, opp_reward, done, info = env.step(attack)
            # Use the attack step's reward for this transition unless agent
            # steps follow; otherwise a reward left over from an earlier
            # iteration would be stored when the attack itself ends the episode.
            reward = opp_reward
            if info["is_illegal"] or info["is_ambiguous"]:
                if cfg.VERBOSE:
                    print(attack, info)
            new_obs = attack_obs
            opponent.tell_attack_continues(None, None, None, None)

        # Let the agent respond for as long as the attack lasts
        while opponent.remaining_time >= 0 and not done:
            # Write the attack's remaining time into the observation's cooldown
            # entry for the attacked line before the agent sees it.
            new_obs.time_before_cooldown_line[opponent.attack_line] = opponent.remaining_time
            response = agent.act(new_obs, None, None)
            new_obs, reward, done, info = env.step(response)
            if info["is_illegal"] or info["is_ambiguous"]:
                if cfg.VERBOSE:
                    # Report the action that was actually flagged.
                    print(response, info)
            opponent.remaining_time -= 1
            total_reward += reward
            alive_steps += 1

        # Save new observation to stacking buffer
        new_state = opponent.convert_obs(new_obs)
        opponent._save_next_frame(new_state)

        # Save to experience buffer. The opponent's reward is the negated
        # environment reward shifted by a constant offset of 1500.
        # NOTE(review): the terminal flag stored is opponent.done, not the
        # local `done` — confirm the opponent keeps it up to date.
        if len(opponent.frames2) == opponent.num_frames:
            opponent.per_buffer.add(np.array(opponent.frames),
                                    a, -1 * reward + 1500,
                                    np.array(opponent.frames2),
                                    opponent.done)

        # Perform training when we have enough experience in buffer
        if step >= num_pre_training_steps:
            training_step = step - num_pre_training_steps
            # Decay chance of random action
            opponent.epsilon = opponent._adaptive_epsilon_decay(training_step)

            # Perform training at given frequency
            if step % cfg.UPDATE_FREQ == 0 and \
               len(opponent.per_buffer) >= opponent.batch_size:
                # Perform training
                opponent._batch_train(training_step, step)

                if cfg.UPDATE_TARGET_SOFT_TAU > 0.0:
                    tau = cfg.UPDATE_TARGET_SOFT_TAU
                    # Update target network towards primary network
                    opponent.policy_net.update_target_soft(opponent.target_net.model, tau)

            # Every UPDATE_TARGET_HARD_FREQ trainings, update target completely
            if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \
               step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0:
                opponent.policy_net.update_target_hard(opponent.target_net.model)

        if done:
            # Episode finished: record its statistics and start new counters
            opponent.epoch_rewards.append(-1 * total_reward)
            opponent.epoch_alive.append(alive_steps)
            if cfg.VERBOSE and step > num_pre_training_steps:
                print("step {}: Agent survived [{}] steps with reward {}".format(step, alive_steps, total_reward))
            alive_steps = 0
            total_reward = 0
        else:
            # NOTE(review): alive_steps is also incremented once per agent step
            # in the inner loop above — confirm this extra count is intended.
            alive_steps += 1

        ######## After Each Step #######
        if step > 0 and step % 2000 == 0:  # save network every 2000 iters
            opponent.save(modelpath)
        step += 1
        # Make new obs the current obs
        opponent.obs = new_obs
        opponent.state = new_state

    # Save model after all steps
    opponent.save(modelpath)

In [6]:
# Silence warnings only while the environment is being built. catch_warnings
# restores the previous filter configuration on exit, whereas a follow-up
# filterwarnings("default") would install a new catch-all filter instead.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    backend = LightSimBackend()
    env = grid2op.make("l2rpn_neurips_2020_track2_small", backend=backend)

# Defender that the opponent is trained against
agent = Track2PowerNetAgent(env.action_space)

# Opponent hyperparameters
opponent_name = "D3QN_PARL"
num_pre_training_steps = 256
learning_rate = 5e-5
initial_epsilon = 0.99
final_epsilon = 0.01
decay_epsilon = 2000
attack_period = 50
attack_duration = 20

opponent = D3QN_Opponent(
    env.action_space,
    env.observation_space,
    lines_to_attack=LINES,
    attack_period=attack_period,
    attack_duration=attack_duration,
    name=opponent_name,
    is_training=True,
    learning_rate=learning_rate,
    initial_epsilon=initial_epsilon,
    final_epsilon=final_epsilon,
    decay_epsilon=decay_epsilon,
)

[32m[05-16 19:18:03 MainThread @machine_info.py:91][0m Cannot find available GPU devices, using CPU or other devices now.
[32m[05-16 19:18:03 MainThread @machine_info.py:91][0m Cannot find available GPU devices, using CPU or other devices now.


In [7]:
# Training
n_iter = 2000  # number of steps run after the pre-training phase
# The output directory name encodes the run configuration.
save_path = f'PARL_opp_D3QN_shifted_reward_{n_iter}_atk_period_{attack_period}_atk_duration_{attack_duration}'
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
log_path = 'PARL_opp_D3QN_train_logs'

train_adversary(env, agent, opponent, num_pre_training_steps, n_iter, save_path, log_path)

Total number of steps: 2256
step 273: Agent survived [866] steps with reward 496602.95654296875
step 279: Agent survived [68] steps with reward 66895.93841552734
loss = 256362.62
step 288: Agent survived [137] steps with reward 125195.50421142578
step 299: Agent survived [178] steps with reward 189948.88745117188
step 307: Agent survived [112] steps with reward 96100.87237548828
loss = 1604.537
step 314: Agent survived [90] steps with reward 81372.06658935547
step 324: Agent survived [153] steps with reward 148983.3692626953
loss = 646.7311
step 342: Agent survived [335] steps with reward 365619.17755126953
step 347: Agent survived [46] steps with reward 50465.35925292969
step 355: Agent survived [112] steps with reward 85633.00207519531
loss = 879.93115
step 372: Agent survived [310] steps with reward 263440.87322998047
step 378: Agent survived [68] steps with reward 63721.013244628906
step 384: Agent survived [68] steps with reward 69057.42358398438
step 388: Agent survived [24] step

step 711: Agent survived [24] steps with reward 20012.62823486328
step 715: Agent survived [24] steps with reward 21527.01348876953
step 720: Agent survived [46] steps with reward 50893.91125488281
step 724: Agent survived [21] steps with reward 14774.512573242188
loss = 1253.0491
step 729: Agent survived [49] steps with reward 43392.38171386719
step 737: Agent survived [112] steps with reward 118638.74725341797
step 741: Agent survived [24] steps with reward 22588.123779296875
step 746: Agent survived [46] steps with reward 51702.46875
step 750: Agent survived [24] steps with reward 19174.23065185547
step 755: Agent survived [46] steps with reward 39230.32800292969
loss = 1261.2957
step 761: Agent survived [68] steps with reward 66745.62072753906
step 765: Agent survived [24] steps with reward 22589.52294921875
step 770: Agent survived [46] steps with reward 54404.354736328125
step 774: Agent survived [24] steps with reward 17852.079345703125
step 778: Agent survived [24] steps with r

step 1251: Agent survived [24] steps with reward 12686.731018066406
step 1255: Agent survived [24] steps with reward 13277.059692382812
step 1259: Agent survived [24] steps with reward 14984.247436523438
loss = 100.82916
step 1263: Agent survived [24] steps with reward 15719.5498046875
step 1267: Agent survived [24] steps with reward 16857.93133544922
step 1271: Agent survived [24] steps with reward 13158.110900878906
step 1275: Agent survived [24] steps with reward 13338.95361328125
step 1279: Agent survived [24] steps with reward 15623.898498535156
step 1283: Agent survived [24] steps with reward 16671.721740722656
step 1287: Agent survived [26] steps with reward 19715.890502929688
loss = 81.36693
step 1291: Agent survived [22] steps with reward 11296.850341796875
step 1295: Agent survived [24] steps with reward 12744.759033203125
step 1299: Agent survived [24] steps with reward 13724.693542480469
step 1303: Agent survived [24] steps with reward 15645.008056640625
step 1307: Agent su

loss = 170.04945
step 1736: Agent survived [24] steps with reward 25703.790161132812
step 1740: Agent survived [24] steps with reward 26788.47216796875
step 1744: Agent survived [24] steps with reward 20656.955993652344
step 1748: Agent survived [24] steps with reward 22505.219604492188
step 1752: Agent survived [24] steps with reward 25125.5
step 1756: Agent survived [24] steps with reward 25326.613403320312
step 1760: Agent survived [24] steps with reward 28522.077514648438
loss = 202.56404
step 1764: Agent survived [24] steps with reward 19333.202026367188
step 1768: Agent survived [24] steps with reward 23228.143920898438
step 1772: Agent survived [24] steps with reward 24917.978271484375
step 1776: Agent survived [24] steps with reward 26569.139892578125
step 1780: Agent survived [24] steps with reward 27542.306884765625
step 1784: Agent survived [24] steps with reward 19505.615783691406
step 1788: Agent survived [24] steps with reward 22176.5830078125
loss = 156.6152
step 1792: A

This action will:
	 - NOT change anything to the injections
	 - NOT perform any redispatching action
	 - force disconnection of 1 powerlines ([111])
	 - NOT switch any line status
	 - NOT switch anything in the topology
	 - NOT force any particular bus configuration {'disc_lines': array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, Fals