<a href="https://colab.research.google.com/github/aypan17/robust_pn/blob/main/robust_pn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import grid2op


SyntaxError: ignored

In [None]:
import random

import numpy as np
import torch
import torch.nn as nn
from grid2op.Opponent import BaseOpponent

class DDQN(torch.nn.Module):
    """Dueling Q-network that scores each power line as an attack target.

    The line loadings (``rho``) are embedded into a ``d_model``-wide vector,
    which feeds two heads: a scalar state-value head and a per-line advantage
    head. The heads are combined as ``Q = V + (A - mean(A))``.

    Params
    ------
    d_model: width of the embedding and of the hidden layers
    num_lines: number of power lines, i.e. input size and number of actions
    """

    def __init__(self, d_model, num_lines):
        # nn.Module must be initialised before any sub-module is assigned,
        # otherwise the attribute assignments below raise.
        super().__init__()

        # Embed the state (there are num_lines lines to cut)
        self.emb = nn.Linear(num_lines, d_model)

        # Layer to measure the value of a state
        self.value_stream = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, 1),
        )
        # Layer to measure the advantages of an action given a state. There
        # is one action per line, so the head emits num_lines values.
        self.advantage_stream = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, num_lines),
        )

    # Take environment observation and return Q-values
    def forward(self, obs):
        """Return one Q-value per power line.

        Params
        ------
        obs: either an object exposing a ``rho`` attribute (one value per
            line), or the ``rho`` values themselves as a tensor/array whose
            last dimension is ``num_lines``. Leading batch dimensions are
            preserved.

        Returns
        ------
        qvals: tensor shaped like the input, with last dimension ``num_lines``
        """
        # Right now I'm only looking at the capacity of each power line when
        # deciding what to attack but this should be changed.
        rho = getattr(obs, "rho", obs)
        # Align dtype/device with the layers so float64 arrays or CPU inputs
        # do not fail inside nn.Linear.
        weight = self.emb.weight
        rho = torch.as_tensor(rho, dtype=weight.dtype, device=weight.device)

        state = self.emb(rho)
        values = self.value_stream(state)
        advantages = self.advantage_stream(state)
        # Centre the advantages per sample (last dim), not across the batch.
        return values + (advantages - advantages.mean(dim=-1, keepdim=True))

class RLOpp(BaseOpponent):
    """Opponent that picks which power line to disconnect with a DDQN.

    Holds a policy network used to choose attacks and a target network that
    is a periodically refreshed copy of it (see ``update_target_net``).
    """

    def __init__(self, env, agent, d_model, num_lines):
        '''
        Params
        ------
        env: environment; only its ``action_space`` is used here
        agent: the agent being attacked (stored, not used by this class)
        d_model: hidden width passed to the DDQN networks
        num_lines: number of power lines passed to the DDQN networks
        '''
        # NOTE(review): assumes BaseOpponent.__init__ takes the action space
        # as its only argument -- confirm against the installed grid2op.
        super().__init__(env.action_space)
        self.env = env
        self.actspace = env.action_space
        self.agent = agent
        self.d_model = d_model
        self.num_lines = num_lines
        self.budget = 1000  # Placeholder value

        self.policy_net = DDQN(d_model, num_lines)
        self.target_net = DDQN(d_model, num_lines)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

    def attack(self, obs, agent_act, env_act, budget, fail):
        '''
        Takes an observation, agent action, env action, total budget, and if
        the prev attack failed or not and outputs a new attack.

        With probability 0.95 the connected line with the highest Q-value is
        disconnected; otherwise a connected line is chosen uniformly at
        random.

        Params
        ------
        obs: grid2op.Observation.Observation from time t
        agent_act: grid2op.Action.Action from the agent (unused)
        env_act: grid2op.Action.Action from the environment (unused)
        budget: budget remaining of the opponent (not consulted here)
        fail: whether or not the previous attack failed (unused)

        Returns
        ------
        attack: grid2op.Action.Action

        Raises
        ------
        ValueError: if no line is currently connected
        '''
        line_status = np.asarray(obs.line_status, dtype=bool)  # connected?
        connected = np.flatnonzero(line_status)
        if connected.size == 0:
            # This should never happen
            raise ValueError('Invalid state: no possible action')

        # Fixed at the final exploration rate; no decay schedule is applied.
        eps_threshold = 0.05
        if random.random() > eps_threshold:
            # Exploit: rank lines by Q-value and take the best connected one.
            was_training = self.policy_net.training
            self.policy_net.eval()
            with torch.no_grad():
                qvals = self.policy_net(obs)
            # Restore whichever mode the network was in before the query.
            self.policy_net.train(was_training)

            ranked = torch.argsort(qvals.flatten(), descending=True).tolist()
            line = next(idx for idx in ranked if line_status[idx])
        else:
            # Explore
            line = int(random.choice(connected))

        return self.actspace({"set_line_status": [(line, -1)]})

    def update_target_net(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

        

In [None]:
def optimize_model():
    """Run one optimisation step of ``policy_net`` on a replay batch.

    Reads the module-level ``memory``, ``policy_net``, ``target_net``,
    ``optimizer``, ``device``, ``BATCH_SIZE`` and ``GAMMA``.

    Returns
    -------
    The Huber loss of the batch as a Python float, or ``None`` when the
    replay memory holds fewer than ``BATCH_SIZE`` transitions (no step is
    taken in that case).
    """
    if len(memory) < BATCH_SIZE:
        return None
    transitions = memory.sample(BATCH_SIZE)

    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and collect the matching next states
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state],
        device=device,
        dtype=torch.bool,
    )
    non_final_next_states = [s for s in batch.next_state if s is not None]

    state_batch = torch.stack(batch.state)
    action_batch = torch.stack(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # torch.stack rejects an empty list, so skip the target network entirely
    # when every sampled transition is terminal.
    if non_final_next_states:
        with torch.no_grad():
            next_q = target_net(torch.stack(non_final_next_states))
        next_state_values[non_final_mask] = next_q.max(1)[0]

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(
        state_action_values.double(),
        expected_state_action_values.unsqueeze(1).double(),
    )
    value_loss = loss.item()

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Clamp every gradient element to [-1, 1]; parameters that received no
    # gradient in this step are skipped instead of raising.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 1.0)
    optimizer.step()

    return value_loss

class ReplayMemory:
    """Fixed-capacity ring buffer of transitions for experience replay.

    The buffer grows until it holds ``capacity`` entries; after that each
    new transition overwrites the slot at the current write position.
    """

    def __init__(self, capacity):
        # Index of the next slot to write; wraps back to 0 at ``capacity``.
        self.position = 0
        self.memory = []
        self.capacity = capacity

    def __len__(self):
        """Number of slots currently in the buffer."""
        return len(self.memory)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored entries at random, without replacement."""
        return random.sample(self.memory, batch_size)

    def push(self, *args):
        """Store ``Transition(*args)`` at the current write position."""
        buffer = self.memory
        # Grow by one placeholder slot while there is still room.
        if len(buffer) < self.capacity:
            buffer.append(None)
        slot = self.position
        buffer[slot] = Transition(*args)
        self.position = (slot + 1) % self.capacity


In [None]:
# Hyper-parameters of the DQN training loop.
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 50  # sync the target network every TARGET_UPDATE episodes
MAX_ITERATIONS = 500  # cap on the number of steps per episode

# n - number of targets
n = 4
# m - number of weapons
m = 5
assert n > 1

# Random problem instance: per-target values and per-(weapon, target)
# probabilities drawn uniformly from the ranges below.
lower_val = 25
upper_val = 50
lower_prob = 0.6
upper_prob = 0.9
values = np.random.uniform(lower_val, upper_val, n)
prob = np.random.uniform(lower_prob, upper_prob, (m, n))
assignment = generate_initial_assignment(n, m)
env = WTAEnv(assignment, values, prob, device)

policy_net = DuelingDQN(n, m).to(device)
target_net = DuelingDQN(n, m).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# PyTorch optimizers name the learning-rate keyword ``lr``.
optimizer = optim.Adam(policy_net.parameters(), lr=0.0001)
memory = ReplayMemory(10000)

num_episodes = 15
env.reset()
init_state = env.get_state()
for i_episode in range(num_episodes):
    # Initialize the environment and state
    env.reset()
    state = init_state
    for t in range(1, MAX_ITERATIONS+1):
        print(f'episode {i_episode}/{num_episodes}, iteration {t}/{MAX_ITERATIONS}', end=' ')
        # Select and perform an action
        action = select_action(state)
        observation, reward, done, _ = env.step(action)
        reward = torch.tensor([reward], device=device)

        if not done:
            next_state = observation
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        loss = optimize_model()
        if done:
            # t is 1-based, so it already equals the number of steps taken.
            episode_durations.append(t)
            plot_durations()
            break

        print(f'loss: {loss}', end='\r')
    print()
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print()
print('Complete')
env.render()
env.close()
plt.ioff()
plt.show()