# Phase5 Data Activation

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from torchvision import models
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torchvision.models.feature_extraction import create_feature_extractor

import torchattacks

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Prefix concatenated in front of every file name read or written below;
# the empty string makes all paths relative to the working directory.
folder_path = ''

In [None]:
# ResNet-18 with torchvision's pretrained weights; the final fully connected
# layer is replaced by a new 10-output linear head.
# NOTE(review): the new head is randomly initialised here; fine-tuned weights
# are presumably loaded elsewhere before evaluation -- confirm.
model = models.resnet18(pretrained=True)
num_frts = model.fc.in_features  # input width of the original classifier head
# The head stays wrapped in nn.Sequential so parameter names remain `fc.0.*`.
model.fc = nn.Sequential(nn.Linear(num_frts, 10))

model = model.to(device)

In [5]:
def load_attack(attack_name):
  """Load saved adversarial examples and report standard / robust accuracy.

  Reads ``folder_path + 'cifar10_<attack_name>.pt'`` through ``atk.load`` and
  evaluates the notebook-level ``model`` on ``test_loader`` (clean inputs) and
  on the loaded adversarial inputs, printing both accuracies.

  Relies on the globals ``atk``, ``model``, ``device``, ``test_loader`` and
  ``folder_path``.

  Inference runs under ``torch.no_grad()`` with the model temporarily switched
  to eval mode; the model's previous training flag is restored on exit.
  Without this, autograd would keep every activation of the 10000-image
  adversarial batch alive, and BatchNorm layers in training mode would both use
  and update batch statistics.

  Args:
    attack_name: Name used in the saved file name (e.g. ``'pgd'``).

  Returns:
    Tuple ``(orig_labels, clean_images, clean_preds, adv_images, adv_preds)``.
    All but ``clean_preds`` come from the first batch of the loaded file;
    ``clean_preds`` holds the model's predictions over ``test_loader`` in
    iteration order.
    NOTE(review): downstream code compares ``clean_preds`` and ``adv_preds``
    index by index, which assumes ``test_loader`` is not shuffled -- confirm.
  """
  adv_loader = atk.load(load_path=folder_path+'cifar10_'+str(attack_name)+'.pt', load_predictions=True, load_clean_inputs=True, batch_size=10000)
  # Batch layout with predictions and clean inputs enabled:
  # (adversarial inputs, labels, saved predictions, clean inputs).
  adv_images, orig_labels, adv_preds, clean_images = next(iter(adv_loader))

  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      # Seeding with an empty float tensor keeps the result dtype the same as
      # before and keeps torch.cat valid when test_loader yields nothing.
      clean_chunks = [torch.tensor([])]
      correct = 0
      total = 0

      for images, labels in test_loader:
        output = model(images.to(device))
        _, pred = torch.max(output, 1)

        total += labels.size(0)
        correct += (pred == labels.to(device)).sum().item()
        clean_chunks.append(pred.cpu())

      # One concatenation at the end instead of re-copying on every batch.
      clean_preds = torch.cat(clean_chunks, 0)

      print('Standard accuracy: %.2f %%' % (100 * float(correct) / total))

      adv_correct = 0
      adv_total = 0

      for images, labels, _, _ in adv_loader:
        outputs = model(images.to(device))
        _, pred = torch.max(outputs, 1)

        adv_total += labels.size(0)
        adv_correct += (pred == labels.to(device)).sum().item()

      print('Robust accuracy: %.2f %%' % (100 * float(adv_correct) / adv_total))
  finally:
    # Leave the shared model in whatever mode the caller had it in.
    model.train(was_training)

  return orig_labels, clean_images, clean_preds, adv_images, adv_preds

In [6]:
def generate_success_attack(orig_labels, clean_images, clean_preds, adv_images, adv_preds, attack_name):
  """Save clean/adversarial image pairs for every successful attack.

  An attack on sample ``k`` counts as successful when the model classified the
  clean image correctly (``orig_labels[k] == clean_preds[k]``) and the
  adversarial image incorrectly (``orig_labels[k] != adv_preds[k]``). Those two
  conditions already imply ``clean_preds[k] != adv_preds[k]``.

  For each successful sample the clean image (label 0.0) and then its
  adversarial counterpart (label 1.0) are appended, so the saved arrays
  alternate clean, adversarial, clean, adversarial, ...

  Writes ``images_<attack_name>.npy`` and ``labels_<attack_name>.npy`` under the
  global ``folder_path`` and prints the number of successes and both shapes.

  Args:
    orig_labels: Ground-truth labels, one per sample.
    clean_images: Clean inputs, first dimension indexes samples.
    clean_preds: Model predictions on the clean inputs.
    adv_images: Adversarial inputs aligned with ``clean_images``.
    adv_preds: Model predictions on the adversarial inputs.
    attack_name: Name embedded in the output file names.
  """
  orig_labels = torch.as_tensor(orig_labels).cpu()
  n = len(orig_labels)
  # Only the first len(orig_labels) entries of each input are considered,
  # matching the index range the per-sample loop used to cover.
  clean_preds = torch.as_tensor(clean_preds).cpu()[:n]
  adv_preds = torch.as_tensor(adv_preds).cpu()[:n]
  clean_images = torch.as_tensor(clean_images).cpu()[:n]
  adv_images = torch.as_tensor(adv_images).cpu()[:n]

  # A boolean mask replaces the per-sample loop; growing a tensor with
  # torch.cat on every hit re-copies everything collected so far.
  success = (orig_labels == clean_preds) & (orig_labels != adv_preds)
  i = int(success.sum())

  clean_sel = clean_images[success]
  adv_sel = adv_images[success]
  # Stacking on a new axis 1 and folding it into axis 0 interleaves the pairs:
  # clean_0, adv_0, clean_1, adv_1, ...
  images = torch.stack((clean_sel, adv_sel), dim=1).reshape((2 * i,) + tuple(clean_sel.shape[1:]))
  labels = torch.tensor([0.0, 1.0]).repeat(i)

  print("Successful attack: ", i)
  print(images.shape)
  print(labels.shape)

  np.save(folder_path + 'images_'+str(attack_name)+'.npy', images.numpy())
  np.save(folder_path + 'labels_'+str(attack_name)+'.npy', labels.numpy())

In [None]:
# Common critical neurons: per-layer union of the normal and adversarial masks
def load_common(alpha, beta):
  """Build the per-layer union of the normal and adversarial masks.

  Loads ``normal_<alpha>_<beta>.npy`` and ``adversarial_<alpha>_<beta>.npy``
  from the global ``folder_path``. For every layer index whose normal or
  adversarial entry has a positive sum, a 0/1 array marking the positions where
  the element-wise sum of the two entries is positive is stored in the module
  global ``common_<index>``, and a summary line is printed.

  Args:
    alpha: First value embedded in the file names.
    beta: Second value embedded in the file names.

  Returns:
    List of the layer indices for which a ``common_<index>`` global was set.
  """
  suffix = '_'+str(alpha)+'_'+str(beta)+'.npy'
  # NOTE: allow_pickle=True unpickles arbitrary objects; only load files
  # produced by a trusted run of this pipeline.
  normal = np.load(folder_path+'normal'+suffix, allow_pickle=True)
  adversarial = np.load(folder_path+'adversarial'+suffix, allow_pickle=True)

  common_layer = []
  for i, normal_mask in enumerate(normal):
    adv_mask = adversarial[i]
    normal_count = np.sum(normal_mask)
    adv_count = np.sum(adv_mask)
    if not (normal_count > 0 or adv_count > 0):
      continue

    union = np.where(np.add(normal_mask, adv_mask) > 0, 1, 0)
    # Published as a module global because later cells look it up by name.
    globals()['common_'+str(i)] = union
    common_layer.append(i)
    print("layer:", i, normal_count, adv_count, np.sum(union))

  return common_layer

In [8]:
def get_inter_model(common_layer, all_layer):
  """Build a feature extractor exposing the selected layers of ``model``.

  Each index ``i`` in ``common_layer`` is mapped to the graph node name
  ``all_layer[i]``, and the extractor returns that node's output under the key
  ``str(i)``.

  Indices whose node name is empty are skipped. They have no graph node to
  extract, and passing ``''`` as a node name makes
  ``create_feature_extractor`` fail because no such node exists in the model.

  Args:
    common_layer: Layer indices to expose.
    all_layer: Node name for each layer index; ``''`` marks a layer with no
      corresponding graph node.

  Returns:
    The module produced by ``create_feature_extractor`` for the global
    ``model``.
  """
  return_nodes = {}
  for i in common_layer:
    node_name = all_layer[i]
    if not node_name:
      # No graph node for this layer; its data does not come from the model.
      continue
    return_nodes[node_name] = str(i)

  inter_model = create_feature_extractor(model, return_nodes=return_nodes)

  return inter_model

In [9]:
def generate_activation(inter_model, common_layer, all_image, all_label, attack_name, alpha, beta):
  """Extract the activations at the common-mask positions and save them.

  For each layer index ``i`` in ``common_layer`` the per-sample features are
  multiplied by the global mask ``common_<i>``, flattened per sample, and only
  the columns where the flattened mask is non-zero are kept. Layers 0 and 1
  take their features from ``all_image`` itself; every other layer uses
  ``inter_model``'s output under the key ``str(i)``. The per-layer blocks are
  concatenated column-wise in ``common_layer`` order.

  Writes ``activation_x_<attack>_<alpha>_<beta>.npy`` (float32 features) and
  ``activation_y_<attack>_<alpha>_<beta>.npy`` (``all_label`` unchanged) under
  the global ``folder_path``.

  The forward pass runs under ``torch.no_grad()`` with ``inter_model``
  temporarily in eval mode, and the previous training flag is restored
  afterwards. This stops autograd retaining activations for the whole data set
  and stops BatchNorm layers using or updating batch statistics.

  Args:
    inter_model: Module returning a dict of layer outputs keyed by ``str(i)``.
    common_layer: Layer indices to include, in output order.
    all_image: Array of input images, first dimension indexes samples.
    all_label: Labels saved alongside the features.
    attack_name: Name embedded in the output file names.
    alpha: First value embedded in the output file names.
    beta: Second value embedded in the output file names.
  """
  all_image = np.asarray(all_image)
  num_samples = len(all_image)

  inter_output = {}
  # The model is only needed when some layer is not read from the raw images.
  if any(i not in (0, 1) for i in common_layer):
    was_training = inter_model.training
    inter_model.eval()
    try:
      with torch.no_grad():
        inter_output = inter_model(torch.tensor(all_image, dtype=torch.float32).to(device))
    finally:
      inter_model.train(was_training)

  blocks = []
  for i in common_layer:
    mask = globals()['common_'+str(i)]
    if i == 0 or i == 1:
      features = all_image
    else:
      # Direct tensor -> ndarray conversion; going through .tolist() builds a
      # Python float for every activation first.
      features = inter_output[str(i)].detach().cpu().numpy()

    # The multiply is kept so broadcasting between features and mask behaves
    # as before; the column selection below does the actual filtering.
    masked = np.multiply(features, mask)
    flat = masked.reshape(masked.shape[0], int(np.prod(masked.shape[1:])))
    keep = np.flatnonzero(mask)
    # One fancy-index selection for all samples instead of concatenating row
    # by row; the cast keeps the saved dtype float32 as before.
    blocks.append(flat[:, keep].astype(np.float32))

  if blocks:
    activation_x = np.concatenate(blocks, axis=1)
  else:
    # No layer selected: save an explicit zero-width feature matrix rather
    # than uninitialised memory.
    activation_x = np.empty((num_samples, 0), dtype=np.float32)

  activation_y = all_label
  print(activation_x.shape, activation_y.shape)

  np.save(folder_path+'activation_x_'+str(attack_name)+'_'+str(alpha)+'_'+str(beta)+'.npy', activation_x)
  np.save(folder_path+'activation_y_'+str(attack_name)+'_'+str(alpha)+'_'+str(beta)+'.npy', activation_y)

In [None]:
# Attacks to process, in order.
attack_name = ['pgd','deepfool','fgsm']
# all_layer[i] is the graph node name for layer index i. Indices 0 and 1 have
# no node name: generate_activation reads those layers from the input images.
all_layer = ['', '', 'bn1', 'relu', 'maxpool', 'layer1', 'layer2', 'layer3', 'layer4', 'avgpool']
# alpha / beta select which normal_/adversarial_ mask files load_common reads.
alpha = 0.95
test_beta = [0.5, 0.6, 0.7, 0.8, 0.9]

# NOTE(review): test_loader must already exist in the kernel; it is not created
# in the cells shown here.
for attack in attack_name:
  print(attack)
  # `atk` stays a notebook global because load_attack() reads it by name.
  if attack == 'pgd':
    # Step size corrected to 2/255 (was 2/225), in the same 1/255 pixel units
    # as eps. Adversarial files cached from earlier runs used the old value;
    # delete them to regenerate.
    atk = torchattacks.PGD(model, eps=8/255, alpha=2/255, steps=100, random_start=True)
  elif attack == 'deepfool':
    atk = torchattacks.DeepFool(model, steps=50, overshoot=0.02)
  elif attack == 'fgsm':
    atk = torchattacks.FGSM(model, eps=8/255)
  else:
    # Stop instead of continuing with the attack object of a previous iteration.
    raise ValueError("Error method: " + str(attack))

  adv_path = folder_path+'cifar10_'+str(attack)+'.pt'
  images_path = folder_path + 'images_'+str(attack)+'.npy'
  labels_path = folder_path + 'labels_'+str(attack)+'.npy'

  # Generate the adversarial examples only when no saved file exists.
  regenerate = not os.path.isfile(adv_path)
  if regenerate:
    atk.save(data_loader=test_loader, save_path=adv_path, save_predictions=True, save_clean_inputs=True)

  # Rebuild the success pairs after a fresh attack run, and also when the
  # saved .pt file exists but the derived .npy files are missing.
  if regenerate or not (os.path.isfile(images_path) and os.path.isfile(labels_path)):
    o_labels, c_images, c_preds, a_images, a_preds = load_attack(attack)
    generate_success_attack(o_labels, c_images, c_preds, a_images, a_preds, attack)

  images = np.load(images_path)
  labels = np.load(labels_path)

  # One activation data set per beta value, all sharing the same alpha.
  for beta in test_beta:
    print("alpha: ", alpha, "beta: ", beta)
    common = load_common(alpha, beta)
    intermediate_model = get_inter_model(common, all_layer)
    generate_activation(intermediate_model, common, images, labels, attack, alpha, beta)