In [1]:
import gym
from gym import spaces
import matplotlib.pyplot as plt
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import cv2
from collections import deque
from torch.distributions import Categorical
import math as m
from torch.nn.utils.convert_parameters import vector_to_parameters
from IPython.display import clear_output
from gym.core import ObservationWrapper
from gym.spaces.box import Box
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import math

**Neural Nets**

A3C, TRPO, DQN

The network architecture follows the paper *Playing Atari with Deep Reinforcement Learning*, where:
The input to the neural network consists of an 84 × 84 × 4 image.
The first hidden layer convolves 16 8×8 filters with stride 4.
The second hidden layer convolves 32 4×4 filters with stride 2.
The final hidden layer is fully connected and consists of 256 units.
Note: the networks defined below take a single 84 × 84 frame (one input channel) rather than a stack of 4 frames.


In [2]:
class A3CLSTMNet(torch.nn.Module):
    """Recurrent policy network: two conv layers -> LSTM cell -> action head.

    The module also owns a ``critic_linear`` head. ``forward`` does not use it,
    but it is kept as a registered sub-module (presumably so that existing
    checkpoints still match the parameter names -- verify before removing).
    """

    def __init__(self, num_actions):
        """Build the layers.

        Args:
            num_actions: number of discrete actions (size of the policy output).
        """
        super(A3CLSTMNet, self).__init__()

        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 16, 8, 4),
            nn.ReLU(),
            nn.Conv2d(16, 32, 4, 2),
            nn.ReLU()
        )

        # An 84x84 input becomes 32 feature maps of 9x9 after the two convolutions.
        self.lstm = nn.LSTMCell(32 * 9 * 9, 256)

        self.critic_linear = nn.Linear(256, 1)
        self.actor_linear = nn.Linear(256, num_actions)

        # Tag used by the evaluation loops to pick the recurrent code path.
        self.name = 'a3c_lstm'

    def forward(self, inputs):
        """Run one recurrent step.

        Args:
            inputs: tuple ``(x, (hx, cx))`` where ``x`` is array-like data that
                can be reshaped to ``(-1, 1, 84, 84)`` and ``(hx, cx)`` is the
                LSTM state.

        Returns:
            ``(probs, (hx, cx))``: softmax action probabilities of shape
            ``(batch, num_actions)`` and the updated LSTM state.
        """
        x, (hx, cx) = inputs
        # as_tensor accepts numpy arrays, lists and tensors of any float dtype
        # or device, and keeps autograd history when ``x`` is already a tensor
        # (the legacy torch.FloatTensor constructor does not accept CUDA or
        # float64 tensors).
        x = torch.as_tensor(x, dtype=torch.float32, device=device)
        x = x.reshape(-1, 1, 84, 84)
        x = self.conv_layers(x)

        x = x.reshape(-1, 32 * 9 * 9)
        hx, cx = self.lstm(x, (hx, cx))
        return F.softmax(self.actor_linear(hx), dim=1), (hx, cx)

    def act(self, inputs):
        """Sample an action from the policy.

        Args:
            inputs: same tuple as accepted by ``forward``.

        Returns:
            ``(action, probs, (hx, cx))`` where ``action`` is sampled from a
            categorical distribution over ``probs``.
        """
        prob, (hx, cx) = self.forward(inputs)
        action = Categorical(prob).sample()
        return action, prob, (hx, cx)

class TrpoNet(nn.Module):
    """Feed-forward policy network: two conv layers followed by two linear layers."""

    def __init__(self, num_actions):
        """Build the layers.

        Args:
            num_actions: number of discrete actions (size of the policy output).
        """
        super(TrpoNet, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 16, 8, 4),
            nn.ReLU(),
            nn.Conv2d(16, 32, 4, 2),
            nn.ReLU()
        )

        # 2592 = 32 feature maps * 9 * 9 for an 84x84 input.
        self.fc_layers = nn.Sequential(
            nn.Linear(2592, 256),
            nn.ReLU(),
            nn.Linear(256, num_actions)
        )

        # Tag used by the evaluation loops to pick the code path.
        self.name = 'trpo'

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: array-like data that can be reshaped to ``(-1, 1, 84, 84)``.

        Returns:
            Tensor of shape ``(batch, num_actions)`` whose rows sum to 1 and
            contain no exact zeros.
        """
        # as_tensor accepts numpy arrays, lists and tensors of any float dtype
        # or device, and keeps autograd history when ``x`` is already a tensor
        # (the legacy torch.FloatTensor constructor does not accept CUDA or
        # float64 tensors).
        x = torch.as_tensor(x, dtype=torch.float32, device=device)
        x = self.conv_layers(x.reshape(-1, 1, 84, 84))
        x = self.fc_layers(x.reshape(-1, 2592))
        output = F.softmax(x, dim=1)
        # Avoid any element being exactly 0, then renormalise (L1) so that
        # each row sums to 1 again.
        output = output + 1e-6
        output = F.normalize(output, dim=1, p=1)
        return output

    def act(self, input):
        """Sample an action from the policy.

        Args:
            input: same data as accepted by ``forward``.

        Returns:
            ``(action, probs)`` where ``action`` is sampled from a categorical
            distribution over ``probs``.
        """
        prob = self.forward(input)
        action = Categorical(prob).sample()
        return action, prob

class DqnNet(nn.Module):
    """Q-network: two conv layers followed by two linear layers."""

    def __init__(self, num_actions):
        """Build the layers.

        Args:
            num_actions: number of discrete actions (number of Q-value outputs).
        """
        super(DqnNet, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 16, 8, 4),
            nn.ReLU(),
            nn.Conv2d(16, 32, 4, 2),
            nn.ReLU()
        )

        # 2592 = 32 feature maps * 9 * 9 for an 84x84 input.
        self.fc_layers = nn.Sequential(
            nn.Linear(2592, 256),
            nn.ReLU(),
            nn.Linear(256, num_actions)
        )

        # Tag used by the evaluation loops to pick the code path.
        self.name = 'dqn'

    def forward(self, x):
        """Return the Q-values for ``x``.

        Args:
            x: array-like data that can be reshaped to ``(-1, 1, 84, 84)``.

        Returns:
            Tensor of shape ``(batch, num_actions)`` with one Q-value per action.
        """
        # Convert the input to a float tensor of size (n, 1, 84, 84).
        # as_tensor accepts numpy arrays, lists and tensors of any float dtype
        # or device, and keeps autograd history when ``x`` is already a tensor
        # (the legacy torch.FloatTensor constructor does not accept CUDA or
        # float64 tensors).
        x = torch.as_tensor(x, dtype=torch.float32, device=device)
        x = self.conv_layers(x.reshape(-1, 1, 84, 84))
        return self.fc_layers(x.reshape(-1, 2592))

    def act(self, input):
        """Pick the greedy action.

        Args:
            input: same data as accepted by ``forward``.

        Returns:
            ``(action, probs)`` where ``action`` is the index of the largest
            Q-value per row and ``probs`` is the softmax of the Q-values.
        """
        q_values = self.forward(input)
        action = q_values.max(1)[1]
        # Softmax of the Q-values is used as the action distribution.
        prob = F.softmax(q_values, dim=1)
        return action, prob

**Environment**

84x84 gray image

Taken from https://github.com/nailo2c/a3c/blob/master/envs.py and deepmind wrappers

In [3]:

def create_atari_env(env_id):
    """Create the gym environment ``env_id`` and wrap it for the agents.

    Wrappers are applied innermost-first in this order: AtariRescale84x84,
    NormalizedEnv, EpisodicLifeEnv, MaxAndSkipEnv.

    Args:
        env_id: environment id passed to ``gym.make``.

    Returns:
        The fully wrapped environment.
    """
    wrapped = gym.make(env_id)
    for wrapper in (AtariRescale84x84, NormalizedEnv, EpisodicLifeEnv, MaxAndSkipEnv):
        wrapped = wrapper(wrapped)
    return wrapped




def process_frame84(frame):
    """Crop, resize and grey-scale a raw frame.

    Steps: keep rows 34..193 and the first 160 columns, resize to 84x84 with
    ``cv2.resize``, average over axis 2, cast to float32 and scale by 1/255.

    Args:
        frame: 3-D image array (axis 2 is averaged away).

    Returns:
        float32 array holding the processed frame.
    """
    cropped = frame[34:194, :160]
    resized = cv2.resize(cropped, (84, 84))
    gray = resized.mean(2).astype(np.float32)
    gray *= (1.0 / 255.0)
    return gray



class EpisodicLifeEnv(gym.Wrapper):
    """Treat the loss of a life as the end of an episode.

    The wrapped environment is only truly reset once it reports ``done``
    itself; after a mere life loss ``reset`` just takes a no-op step.
    """

    def __init__(self, env=None):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        super().__init__(env)
        self.lives = 0
        self.was_real_done = True
        self.was_real_reset = False

    def step(self, action):
        """Step the inner env and flag ``done`` when a life was lost.

        Returns:
            ``(obs, reward, done, info)`` as produced by the inner env, with
            ``done`` forced to True on a life loss.
        """
        obs, reward, done, info = self.env.step(action)
        # Remember whether the inner env itself finished, before overriding.
        self.was_real_done = done
        remaining = self.env.unwrapped.ale.lives()
        # For Qbert we sometimes stay at lives == 0 for a few frames, so the
        # lower bound matters: only reset once the environment advertises done.
        if 0 < remaining < self.lives:
            done = True
        # Store the new count afterwards so that bonus lives are handled.
        self.lives = remaining
        return obs, reward, done, info

    def reset(self):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        """
        if not self.was_real_done:
            # No-op step to advance from the terminal/lost-life state.
            obs, _, _, _ = self.env.step(0)
            self.was_real_reset = False
        else:
            obs = self.env.reset()
            self.was_real_reset = True
        self.lives = self.env.unwrapped.ale.lives()
        return obs


class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action for several frames and max-pool the last two frames."""

    def __init__(self, env=None, skip=4):
        """Return only every `skip`-th frame"""
        super().__init__(env)
        # Holds the two most recent raw observations for max pooling over time.
        self._obs_buffer = deque(maxlen=2)
        self._skip = skip

    def step(self, action):
        """Apply ``action`` up to ``skip`` times, stopping early on ``done``.

        Returns:
            ``(max_frame, total_reward, done, info)`` where ``max_frame`` is the
            element-wise maximum over the buffered observations and
            ``total_reward`` is the sum of the rewards of the repeated steps.
        """
        reward_sum = 0.0
        done = None
        for _ in range(self._skip):
            frame, step_reward, done, info = self.env.step(action)
            self._obs_buffer.append(frame)
            reward_sum += step_reward
            if done:
                break

        pooled = np.stack(self._obs_buffer).max(axis=0)
        return pooled, reward_sum, done, info

    def reset(self):
        """Clear past frame buffer and init. to first obs. from inner env."""
        self._obs_buffer.clear()
        first_obs = self.env.reset()
        self._obs_buffer.append(first_obs)
        return first_obs
  

class AtariRescale84x84(gym.ObservationWrapper):
    """Observation wrapper that passes every frame through ``process_frame84``."""

    def __init__(self, env=None):
        """Wrap ``env`` and declare a Box(0.0, 1.0) observation space of shape [1, 84, 84]."""
        super().__init__(env)
        # NOTE(review): process_frame84 appears to return a 2-D (84, 84) array,
        # so the declared shape only matches once a later wrapper adds the
        # leading axis -- confirm.
        self.observation_space = Box(low=0.0, high=1.0, shape=[1, 84, 84])

    def observation(self, observation):
        """Return the processed version of ``observation``."""
        processed = process_frame84(observation)
        return processed




class NormalizedEnv(gym.ObservationWrapper):
    """Observation wrapper that standardises frames with running statistics.

    Exponential moving averages of the per-frame mean and standard deviation
    are maintained and bias-corrected by ``1 - alpha ** num_steps``.
    """

    def __init__(self, env=None):
        """Wrap ``env`` and start with empty running statistics."""
        super().__init__(env)
        self.state_mean = 0
        self.state_std = 0
        self.alpha = 0.9999
        self.num_steps = 0

    def observation(self, observation):
        """Update the running statistics and return the normalised frame.

        Returns:
            ``(observation - mean) / (std + 1e-8)`` with a new leading axis.
        """
        self.num_steps += 1
        decay = self.alpha
        self.state_mean = self.state_mean * decay + observation.mean() * (1 - decay)
        self.state_std = self.state_std * decay + observation.std() * (1 - decay)

        # Bias correction for the zero-initialised moving averages.
        correction = 1 - pow(decay, self.num_steps)
        centred = observation - self.state_mean / correction
        scaled = centred / (self.state_std / correction + 1e-8)
        return np.expand_dims(scaled, axis=0)


class NormalizedState:
    """Stand-alone running normaliser for observations.

    Keeps exponential moving averages of the per-observation mean and standard
    deviation, bias-corrected by ``1 - alpha ** num_steps``.
    """

    def __init__(self):
        """Start with empty running statistics."""
        self.state_mean = 0
        self.state_std = 0
        self.alpha = 0.9999
        self.num_steps = 0

    def observation(self, observation):
        """Update the running statistics and return the normalised observation.

        Args:
            observation: array providing ``mean()`` and ``std()``.

        Returns:
            ``(observation - mean) / (std + 1e-8)`` with a new leading axis.
        """
        self.num_steps += 1
        decay = self.alpha
        self.state_mean = self.state_mean * decay + observation.mean() * (1 - decay)
        self.state_std = self.state_std * decay + observation.std() * (1 - decay)

        # Bias correction for the zero-initialised moving averages.
        correction = 1 - pow(decay, self.num_steps)
        centred = observation - self.state_mean / correction
        scaled = centred / (self.state_std / correction + 1e-8)
        return np.expand_dims(scaled, axis=0)

In [4]:
# Build the wrapped Pong environment and read the size of its discrete
# action space, which determines the output width of every network.
env_id = 'PongNoFrameskip-v4'
env = create_atari_env(env_id)
num_actions = env.action_space.n

In [5]:
# Instantiate the networks with freshly initialised weights; two DQN
# instances are created so that two different DQN checkpoints can be loaded.
dqn_model = DqnNet(num_actions)
dqn_model2 = DqnNet(num_actions)
trpo_model = TrpoNet(num_actions)

In [6]:
# Colab-only: mount Google Drive so the checkpoint files under
# /content/gdrive can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [7]:
# Checkpoint locations on the mounted Google Drive.
ac3_path = '/content/gdrive/My Drive/Summer Research/AA on NN/models1/A3C_PongNoFrameskip_v4'
trpo_path = '/content/gdrive/My Drive/Summer Research/AA on NN/models2/TRPO_PongNoFrameSkip_V4_skip4'
dqn_path = '/content/gdrive/My Drive/Summer Research/AA on NN/models2/DQN_PongNoFrameSkip_V4_skip4'
dqn_path2 = '/content/gdrive/My Drive/Summer Research/AA on NN/models1/DQN_PongNoFrameskip_v4'

# map_location remaps tensors that were saved from a GPU, so the checkpoints
# also load on a CPU-only runtime.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
trpo_model.load_state_dict(torch.load(trpo_path, map_location=device))
dqn_model.load_state_dict(torch.load(dqn_path, map_location=device))
dqn_model2.load_state_dict(torch.load(dqn_path2, map_location=device))

# Move the models to the device selected in the imports cell instead of
# unconditionally calling .cuda(), which fails when no GPU is available.
dqn_model2.to(device)
trpo_model.to(device)
dqn_model.to(device)

DqnNet(
  (conv_layers): Sequential(
    (0): Conv2d(1, 16, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(16, 32, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=2592, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=6, bias=True)
  )
)

In [13]:
def calculate_perturbation(state, selected_action, y_dist_weight, eps=0.1, type='l1'):
    """Compute a gradient-based adversarial perturbation for ``state``.

    The loss is the cross entropy between the action distribution
    ``y_dist_weight`` and ``selected_action``; its gradient with respect to
    ``state`` gives the attack direction. With ``d = state.numel()``:

    * ``'l_inf'``: ``eps * sign(grad)``.
    * ``'l2'``: ``eps * sqrt(d) * grad / ||grad||_2``.
    * ``'l1'``: a total budget of ``eps * d`` spread over the coordinates in
      proportion to ``|grad| ** 10`` (so it concentrates on the largest
      gradient entries), each coordinate keeping the sign of its gradient.

    Args:
        state: tensor with ``requires_grad=True`` from which ``y_dist_weight``
            was computed (the autograd graph must still be alive).
        selected_action: long tensor of shape ``(1,)`` with the action index.
        y_dist_weight: action *probabilities* (e.g. a softmax output); it is
            viewed as a single row.
        eps: perturbation strength; ``0`` yields an all-zero perturbation.
        type: one of ``'l_inf'``, ``'l2'`` or ``'l1'``.

    Returns:
        Tensor with the same shape as ``state`` that is meant to be added to it.

    Raises:
        ValueError: if ``type`` is not a supported norm and ``eps`` is non-zero.
    """
    if eps == 0:
        # Nothing to add; skip the backward pass entirely.
        return torch.zeros_like(state)
    if type not in ('l_inf', 'l2', 'l1'):
        raise ValueError("unknown perturbation type: %r" % (type,))

    probs = y_dist_weight.view(1, -1)
    # ``probs`` is already normalised, so take its log and use the negative
    # log-likelihood. nn.CrossEntropyLoss expects raw logits and would apply
    # a second softmax on top of the probabilities. The clamp keeps log() and
    # its gradient finite when a probability underflows to 0.
    loss = F.nll_loss(torch.log(probs.clamp_min(1e-12)), selected_action)
    grad = torch.autograd.grad(loss, state)[0]
    dim = grad.numel()

    if type == 'l_inf':
        # new_state = state + eps * sign(grad)
        return eps * torch.sign(grad)

    if type == 'l2':
        norm = torch.sqrt(grad.pow(2).sum())
        if norm == 0:
            # A zero gradient gives no direction; avoid 0 / 0 = NaN.
            return torch.zeros_like(grad)
        return eps * math.sqrt(dim) * (grad / norm)

    # type == 'l1'
    peak = grad.abs().max()
    if peak == 0:
        return torch.zeros_like(grad)
    # Rescale by the largest magnitude before raising to the 10th power so
    # that small gradients do not underflow to zero (which made the
    # normalisation below divide 0 by 0).
    weights = (grad.abs() / peak).pow(10)
    budget = eps * dim
    # An even power discards the sign, so restore the gradient direction.
    return torch.sign(grad) * (weights / weights.sum()) * budget

In [14]:
def test_black_box(model, env, type, episodes=10, num_levels=9, eps_step=0.3):
    """Evaluate ``model`` on states perturbed with its own gradients.

    For every perturbation level ``eps = level * eps_step`` the model is run
    for ``episodes`` episodes. At each step the model first acts on the clean
    state, the resulting action distribution is used to craft a perturbation
    with ``calculate_perturbation``, and the action actually executed is the
    one chosen on the perturbed state.

    NOTE(review): crafting the perturbation from the evaluated model's own
    gradients is usually called a white-box attack; the function name is kept
    for compatibility.

    Args:
        model: policy with a ``name`` attribute and an ``act`` method.
        env: environment with ``reset()`` and ``step(action)``.
        type: norm passed to ``calculate_perturbation``
            (``'l_inf'``, ``'l2'`` or ``'l1'``).
        episodes: number of episodes per perturbation level.
        num_levels: number of perturbation levels to evaluate.
        eps_step: increment of ``eps`` between consecutive levels.

    Returns:
        list: average episode reward for each perturbation level.
    """
    is_lstm = model.name == 'a3c_lstm'
    rewards = []
    for level in range(num_levels):
        eps = level * eps_step
        print('eps:', eps)
        total_reward = 0
        for _ in range(episodes):
            state = env.reset()
            episode_reward = 0
            if is_lstm:
                cx = torch.zeros(1, 256).to(device)
                hx = torch.zeros(1, 256).to(device)
            while True:
                # The state must be a leaf that requires grad so that the
                # perturbation can be derived from d(loss)/d(state).
                state = torch.tensor(state, requires_grad=True)
                if is_lstm:
                    # The first pass only provides the distribution for the
                    # attack; the recurrent state is advanced by the second.
                    action, prob, _ = model.act((state, (hx, cx)))
                    perturbated_state = calculate_perturbation(state, action, prob, eps, type) + state
                    action, _, (hx, cx) = model.act((perturbated_state, (hx, cx)))
                    # Cut the graph so it does not grow over the whole episode.
                    hx, cx = hx.detach(), cx.detach()
                else:
                    action, prob = model.act(state)
                    perturbated_state = calculate_perturbation(state, action, prob, eps, type) + state
                    action, _ = model.act(perturbated_state)
                # Hand the environment a plain Python int, not a (GPU) tensor.
                next_state, reward, done, _ = env.step(action.item())
                episode_reward += reward
                state = next_state
                if done:
                    state = env.reset()
                    total_reward += episode_reward
                    print(episode_reward)
                    break
        rewards.append(total_reward / episodes)

    return rewards

In [15]:
def test_white_box(model, adversarial_model, env, type, episodes=10, num_levels=9, eps_step=0.3):
    """Evaluate ``model`` on states perturbed with another model's gradients.

    For every perturbation level ``eps = level * eps_step`` the evaluated
    ``model`` is run for ``episodes`` episodes. At each step
    ``adversarial_model`` acts on the clean state, its action distribution is
    used to craft a perturbation with ``calculate_perturbation``, and
    ``model`` chooses the executed action from the perturbed state.

    NOTE(review): crafting the perturbation with a different model is usually
    called a black-box (transfer) attack; the function name is kept for
    compatibility.

    Args:
        model: evaluated policy with a ``name`` attribute and an ``act`` method.
        adversarial_model: policy whose gradients are used for the attack.
        env: environment with ``reset()`` and ``step(action)``.
        type: norm passed to ``calculate_perturbation``
            (``'l_inf'``, ``'l2'`` or ``'l1'``).
        episodes: number of episodes per perturbation level.
        num_levels: number of perturbation levels to evaluate.
        eps_step: increment of ``eps`` between consecutive levels.

    Returns:
        list: average episode reward for each perturbation level.
    """
    victim_is_lstm = model.name == 'a3c_lstm'
    attacker_is_lstm = adversarial_model.name == 'a3c_lstm'
    rewards = []
    for level in range(num_levels):
        eps = level * eps_step
        print('eps:', eps)
        total_reward = 0
        for _ in range(episodes):
            state = env.reset()
            episode_reward = 0
            if attacker_is_lstm:
                ahx = torch.zeros(1, 256).to(device)
                acx = torch.zeros(1, 256).to(device)
            if victim_is_lstm:
                hx = torch.zeros(1, 256).to(device)
                cx = torch.zeros(1, 256).to(device)
            while True:
                # The state must be a leaf that requires grad so that the
                # perturbation can be derived from d(loss)/d(state).
                state = torch.tensor(state, requires_grad=True)
                if attacker_is_lstm:
                    action, prob, (ahx, acx) = adversarial_model.act((state, (ahx, acx)))
                else:
                    action, prob = adversarial_model.act(state)
                perturbated_state = calculate_perturbation(state, action, prob, eps, type) + state
                if attacker_is_lstm:
                    # Detach only after the gradient has been taken, so the
                    # graph does not grow over the whole episode.
                    ahx, acx = ahx.detach(), acx.detach()

                if victim_is_lstm:
                    action, _, (hx, cx) = model.act((perturbated_state, (hx, cx)))
                    hx, cx = hx.detach(), cx.detach()
                else:
                    action, _ = model.act(perturbated_state)

                # Hand the environment a plain Python int, not a (GPU) tensor.
                next_state, reward, done, _ = env.step(action.item())
                episode_reward += reward
                state = next_state
                if done:
                    state = env.reset()
                    total_reward += episode_reward
                    print(episode_reward)
                    break
        rewards.append(total_reward / episodes)

    return rewards

In [None]:
# test_white_box returns a single list of average rewards (one entry per
# perturbation level), so unpacking it into two names raises a ValueError.
# The matching epsilon values are rebuilt here from the 0.3 step the
# function uses by default.
rewards = test_white_box(dqn_model, trpo_model, env, 'l1')
epsilons = [level * 0.3 for level in range(len(rewards))]