In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
! pip install torchattacks
! pip install adversarial-robustness-toolbox==1.8.1

In [3]:
import gc
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import torchattacks

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader, Dataset
from torch.hub import load_state_dict_from_url
from torchvision import datasets, models, transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau, ExponentialLR
from torch.autograd import Variable

from PIL import Image
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tqdm.notebook import tqdm

In [4]:
PGD_ATTACK_EPS = 2/255

### Base Classifier - ResNet

In [194]:
def _weights_init(m):
    classname = m.__class__.__name__
    if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
        init.kaiming_normal_(m.weight)

class LambdaLayer(nn.Module):
    def __init__(self, lambd):
      super(LambdaLayer, self).__init__()
      self.lambd = lambd

    def forward(self, x):
      return self.lambd(x)

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, option='A'):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            if option == 'A':
                """
                For CIFAR10 ResNet paper uses option A.
                """
                self.shortcut = LambdaLayer(lambda x:
                                            F.pad(x[:, :, ::2, ::2], (0, 0, 0, 0, planes//4, planes//4), "constant", 0))
            elif option == 'B':
                self.shortcut = nn.Sequential(
                     nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                     nn.BatchNorm2d(self.expansion * planes)
                )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 16

        self.in_channels = 1 # 3 for CIFAR OR 1 for MNIST
        self.conv1 = nn.Conv2d(self.in_channels, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2)
        self.linear = nn.Linear(64, num_classes)

        self.apply(self._weights_init)

    def _weights_init(self, m):
        classname = m.__class__.__name__
        if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
          nn.init.kaiming_normal_(m.weight)
    
    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion

        return nn.Sequential(*layers)

    def forward(self, x, return_interm_layer=None):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        if return_interm_layer == 1:
            return out
        out = self.layer2(out)
        if return_interm_layer == 2:
            return out
        out = self.layer3(out)
        if return_interm_layer == 3:
            return out
        out = F.avg_pool2d(out, out.size()[3])
        out = out.view(out.size(0), -1)
        if return_interm_layer == -1:
            return out
        out = self.linear(out)
        
        return out

def resnet20():
    return ResNet(BasicBlock, [3, 3, 3])

def resnet32():
    return ResNet(BasicBlock, [5, 5, 5])

def resnet44():
    return ResNet(BasicBlock, [7, 7, 7])

def resnet56():
    return ResNet(BasicBlock, [9, 9, 9])

def resnet110():
    return ResNet(BasicBlock, [18, 18, 18])

def resnet1202():
    return ResNet(BasicBlock, [200, 200, 200])

### SubNetwork

In [195]:
class SubNet(nn.Module):
    def __init__(self, in_channels):
        super(SubNet, self).__init__()
        conv1 = nn.Conv2d(in_channels, 96, kernel_size=3, stride=1, padding=0, bias=False)
        bn1 = nn.BatchNorm2d(96)
        conv2 = nn.Conv2d(96, 192, kernel_size=3, stride=1, padding=0, bias=False)
        bn2 = nn.BatchNorm2d(192)
        conv3 = nn.Conv2d(192, 192, kernel_size=3, stride=1, padding=0, bias=False)
        bn3 = nn.BatchNorm2d(192)
        conv4 = nn.Conv2d(192, 2, kernel_size=1, stride=1, padding=0, bias=False)
        bn4 = nn.BatchNorm2d(2)
        relu = nn.ReLU(inplace=True)
        avgpool = nn.AdaptiveAvgPool2d((1, 1))
        flatten = nn.Flatten()
        linear = nn.Linear(2, 1)
        sigmoid = nn.Sigmoid()

        self.layers = nn.Sequential(
            conv1,
            bn1,
            relu,
            conv2,
            bn2,
            relu,
            conv3,
            bn3,
            conv4,
            bn4,
            relu,
            avgpool,
            flatten,
            linear,
            sigmoid,
        )

        self.layers.apply(self.init_param)

    def forward(self, x):
        for itr, layer in enumerate(self.layers):
          x = layer(x)

        return x

    def init_param(self, param):
        if type(param) in [nn.Linear, nn.Conv2d]:
            nn.init.kaiming_uniform_(param.weight)

### Train and Test Loops

In [196]:
!ls '/content/drive/MyDrive/11785 - Project/data'

benign_cifar.npy	      fgsm_mnist_eps0.5.npy
benign_cifar_train.npy	      pgd_cifar_default_art.npy
benign_mnist.npy	      pgd_cifar_default_torchattacks_new.npy
benign_mnist_train.npy	      pgd_cifar_eps0.1_torchattacks.npy
cwlinf_cifar_default_art.npy  pgd_cifar_eps0.3_alpha0.1_steps7.npy
cwlinf_default_art.npy	      pgd_mnist_default_art.npy
cwlinf_mnist_default_art.npy  pgd_mnist_eps0.3_alpha0.1_steps7.npy
fgsm_cifar_default_art.npy


In [197]:
batch_size = 64

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

unattacked_data_path = "/content/drive/MyDrive/11785 - Project/data/benign_mnist.npy"
unattacked_data = np.load(unattacked_data_path, allow_pickle=True).astype(float)

# unattacked_data = torch.from_numpy(unattacked_data.transpose(0, 3, 1, 2))
unattacked_data = torch.from_numpy(unattacked_data)
print(f"unattacked data shape: {unattacked_data.shape}")

# # pre-processing
# unattacked_data = Normalize0to1(unattacked_data)
# unattacked_data = normalize(unattacked_data)

# train-test split
unattacked_train_data = unattacked_data[:9000]
unattacked_test_data = unattacked_data[9000:]



attacked_data_path = "/content/drive/MyDrive/11785 - Project/data/cwlinf_mnist_default_art.npy"
# attacked_data_path = "/content/drive/MyDrive/11785 - Project/data/cwlinf_cifar_default_art.npy"
# attacked_data_path = "/content/drive/MyDrive/11785 - Project/data/fgsm_cifar_default_art.npy"
# attacked_data_path = "/content/drive/MyDrive/11785 - Project/data/pgd_cifar_default_torchattacks_new.npy"
attacked_data = np.load(attacked_data_path, allow_pickle=True).astype(float)
# attacked_data = torch.from_numpy(attacked_data.transpose(1, 0, 2, 3))
attacked_data = torch.from_numpy(attacked_data)
print(f"attacked data shape: {attacked_data.shape}")

# # pre-processing
# attacked_data = Normalize0to1(attacked_data)
# attacked_data = normalize(attacked_data)

# train-test split
attacked_train_data = attacked_data[:9000]
attacked_test_data = attacked_data[9000:]

unattacked data shape: torch.Size([10000, 1, 28, 28])
attacked data shape: torch.Size([10000, 1, 28, 28])


In [198]:
!ls '/content/drive/MyDrive/11785 - Project/'

AdversarialDetection.pdf  imgs
cifar10_model.pth	  mnist_model.pth
data			  mnist-resnet-dynamic-adv-trained-model.pt
Experiments.gsheet	  Presentation.gslides


In [199]:
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

resnet_model = resnet32()
resnet_model.to(device)
checkpoint = torch.load("/content/drive/MyDrive/11785 - Project/mnist-resnet-dynamic-adv-trained-model.pt")
# mod_checkpoint = {k.replace("module.", ""): v for k, v in checkpoint['state_dict'].items()}
resnet_model.load_state_dict(checkpoint["model_state_dict"])
resnet_optimizer = torch.optim.SGD(resnet_model.parameters(), lr=0.1, weight_decay=5e-5, momentum=0.9)
resnet_criterion = nn.CrossEntropyLoss()
resnet_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(resnet_optimizer, T_0=10, T_mult=2, eta_min=0.01, last_epoch=-1)

# output size at diff intermediate layers of resnet
interm_layer2dim = {1: 16, 2: 32, 3: 64}
interm_layer = 2

subnet_model = SubNet(interm_layer2dim[interm_layer])
subnet_model.to(device)
subnet_optimizer = torch.optim.Adam(subnet_model.parameters(), lr=0.01, betas=(0.99, 0.999))
subnet_criterion = nn.BCELoss()
subnet_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(subnet_optimizer, T_0=10, T_mult=2, eta_min=0.01, last_epoch=-1)

In [200]:
def statically_train_subnet(
    resnet_model,
    interm_layer,
    subnet_model,
    subnet_optimizer,
    subnet_criterion,
    subnet_scheduler,
    unattacked_train_data,
    unattacked_test_data,
    attacked_train_data,
    attacked_test_data,
    device,
    epochs=100,
    batch_size=64,
):
    subnet_model.train()
    resnet_model.eval()
    batches = []

    for epoch in range(epochs):

      avg_loss = 0.0
      for batch_itr in tqdm(range(0, len(unattacked_train_data), batch_size)):
          attacked_input = attacked_train_data[batch_itr:batch_itr+batch_size]
          attacked_labels = torch.ones((attacked_input.shape[0], 1), dtype=torch.float32)
          unattacked_input = unattacked_train_data[batch_itr:batch_itr+batch_size]
          unattacked_labels = torch.zeros((unattacked_input.shape[0], 1), dtype=torch.float32)

          input = torch.cat((unattacked_input, attacked_input), axis=0)
          labels = torch.cat((unattacked_labels, attacked_labels), axis=0)

          assert input.shape[0] == labels.shape[0]
          shuffle_indices = np.arange(input.shape[0])
          np.random.shuffle(shuffle_indices)
          input, labels = input[shuffle_indices].squeeze(0), labels[shuffle_indices].squeeze(0)

          input, labels = input.to(device), labels.to(device)

          input = resnet_model(input.float(), return_interm_layer=interm_layer)
          output = subnet_model(input)

          loss = subnet_criterion(output, labels)
          loss.backward()
          avg_loss += loss.item()

          subnet_optimizer.step()

          del input
          del labels
          del loss
          torch.cuda.empty_cache()

      val_loss, val_acc, val_roc = statically_test_subnet(
          resnet_model,
          interm_layer,
          subnet_model,
          subnet_criterion,
          unattacked_test_data,
          attacked_test_data,
          device,
      )

      print('Val Loss: {:.4f} | Val Accuracy: {:.4f} | Val ROC: {:.4f}'.format(val_loss, val_acc, val_roc))
      torch.save({
          'epoch': epoch,
          'model_state_dict': resnet_model.state_dict(),
          'optimizer_state_dict': subnet_optimizer.state_dict(),
      }, './' + str(epoch) + 'model.pt')

In [201]:
def statically_test_subnet(
    resnet_model,
    interm_layer,
    subnet_model,
    criterion,
    unattacked_test_data,
    attacked_test_data,
    device,
    batch_size=64,
):
    resnet_model.eval()
    subnet_model.eval()
    test_loss = []
    accuracies = []
    roc_scores = []

    for batch_itr in tqdm(range(0, len(unattacked_test_data), batch_size)):
        attacked_input = attacked_test_data[batch_itr:batch_itr+batch_size]
        attacked_labels = torch.ones((attacked_input.shape[0], 1), dtype=torch.float32)
        unattacked_input = unattacked_test_data[batch_itr:batch_itr+batch_size]
        unattacked_labels = torch.zeros((unattacked_input.shape[0], 1), dtype=torch.float32)

        input = torch.cat((unattacked_input, attacked_input), axis=0)
        labels = torch.cat((unattacked_labels, attacked_labels), axis=0)

        assert input.shape[0] == labels.shape[0]
        shuffle_indices = np.arange(input.shape[0])
        np.random.shuffle(shuffle_indices)
        input, labels = input[shuffle_indices].squeeze(0), labels[shuffle_indices].squeeze(0)

        input, labels = input.to(device), labels.to(device)

        with torch.no_grad():
            input = resnet_model(input.float(), return_interm_layer=interm_layer)
            output = subnet_model(input)

        pred_labels = (output > 0.5).float()
        loss = criterion(output, labels)

        # print()
        # print(pred_labels.flatten())
        # print(labels.flatten())
        # print()

        accuracy = accuracy_score(labels.flatten().cpu(), pred_labels.flatten().cpu())
        roc_score = roc_auc_score(labels.flatten().cpu(), pred_labels.flatten().cpu())
        accuracies.append(accuracy)
        roc_scores.append(roc_score)
        test_loss.extend([loss.item()]*input.size()[0])
        
        del input
        del labels
        del loss
        torch.cuda.empty_cache()

    subnet_model.train()

    return np.mean(test_loss), np.mean(accuracies), np.mean(roc_scores)

In [202]:
def dynamically_train_subnet(
    resnet_model,
    interm_layer,
    subnet_model,
    subnet_optimizer,
    subnet_criterion,
    subnet_scheduler,
    benign_train_imgs,
    benign_test_imgs,
    device,
    epochs=100,
    batch_size=64,
):
    subnet_model.train()
    resnet_model.eval()

    best_loss = float('inf')
    for epoch in range(epochs):
      
      avg_loss = 0.0
      for batch_itr in tqdm(range(0, len(benign_train_imgs), batch_size)):

        # get resnet outputs of benign imgs and labels as '0'
        benign_inputs = benign_train_imgs[batch_itr:batch_itr+batch_size].float().to(device)
        benign_inputs = resnet_model(benign_inputs, return_interm_layer=interm_layer)
        benign_train_labels = torch.zeros((benign_inputs.shape[0], 1), dtype=torch.float32)

        # create a attack instance using current state of the subnet
        train_attack = torchattacks.PGD(subnet_model, eps=4/255, alpha=1/255, steps=40)
        adv_inputs = train_attack(benign_inputs, benign_train_labels)
        adv_train_labels = torch.ones((adv_inputs.shape[0], 1), dtype=torch.float32)

        # create a 2x batch with adv and benign images
        input = torch.cat((benign_inputs, adv_inputs), axis=0)
        labels = torch.cat((benign_train_labels, adv_train_labels), axis=0)

        # shuffle the combined inputs and labels
        assert input.shape[0] == labels.shape[0]
        shuffle_indices = np.arange(input.shape[0])
        np.random.shuffle(shuffle_indices)
        input, labels = input[shuffle_indices].squeeze(0), labels[shuffle_indices].squeeze(0)

        # feed to subnet
        input, labels = input.to(device), labels.to(device)
        output = subnet_model(input)

        # calculate loss
        loss = subnet_criterion(output, labels)
        loss.backward()
        avg_loss += loss.item()

        # param update
        subnet_optimizer.step()

        # cleanup
        del input
        del labels
        del loss
        torch.cuda.empty_cache()

      avg_loss /= len(benign_train_imgs)

      val_loss, val_acc, val_roc = dynamically_test_subnet(
          resnet_model,
          interm_layer,
          subnet_model,
          subnet_criterion,
          benign_test_imgs,
          device,
      )

      print('Train Loss: {:.4f} | Val Loss: {:.4f} | Val Accuracy: {:.4f} | Val ROC: {:.4f}'.format(avg_loss, val_loss, val_acc, val_roc))
      if val_loss > best_loss:
        best_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': subnet_model.state_dict(),
            'optimizer_state_dict': subnet_optimizer.state_dict(),
        }, './' + str(epoch) + 'model.pt')

      subnet_scheduler.step(val_loss)

In [203]:
def dynamically_test_subnet(
    resnet_model,
    interm_layer,
    subnet_model,
    subnet_criterion,
    benign_test_imgs,
    device,
    batch_size=64,
    threshold=0.5,
):
    resnet_model.eval()
    subnet_model.eval()
    test_loss = []
    accuracies = []
    roc_scores = []

    for batch_itr in tqdm(range(0, len(benign_test_imgs), batch_size)):

        # get resnet outputs of benign imgs and labels as '0'
        benign_inputs = benign_test_imgs[batch_itr:batch_itr+batch_size].float().to(device)
        benign_inputs = resnet_model(benign_inputs, return_interm_layer=interm_layer)
        benign_test_labels = torch.zeros((benign_inputs.shape[0], 1), dtype=torch.float32)
        
        # create a attack instance using current state of the subnet
        val_attack = torchattacks.PGD(subnet_model, eps=2/255, alpha=1/255, steps=40)
        adv_inputs = val_attack(benign_inputs, benign_test_labels)
        adv_test_labels = torch.ones((adv_inputs.shape[0], 1), dtype=torch.float32)

        # create a 2x batch with adv and benign images
        input = torch.cat((benign_inputs, adv_inputs), axis=0)
        labels = torch.cat((benign_test_labels, adv_test_labels), axis=0)

        # shuffle the combined inputs and labels
        assert input.shape[0] == labels.shape[0]
        shuffle_indices = np.arange(input.shape[0])
        np.random.shuffle(shuffle_indices)
        input, labels = input[shuffle_indices].squeeze(0), labels[shuffle_indices].squeeze(0)

        # feed to subnet
        input, labels = input.to(device), labels.to(device)
        with torch.no_grad():
            output = subnet_model(input)

        # get labels based on threshold and calculate loss
        pred_labels = (output > threshold).float()
        loss = subnet_criterion(output, labels)

        # calculate metrics
        accuracy = accuracy_score(labels.flatten().cpu(), pred_labels.flatten().cpu())
        accuracies.append(accuracy)
        roc_score = roc_auc_score(labels.flatten().cpu(), pred_labels.flatten().cpu())
        roc_scores.append(roc_score)
        test_loss.extend([loss.item()]*input.size()[0])
        
        # cleanup
        del input
        del labels
        del loss
        torch.cuda.empty_cache()

    subnet_model.train()

    return np.mean(test_loss), np.mean(accuracies), np.mean(roc_scores)

### Read Benign Data

In [204]:
!ls '/content/drive/MyDrive/11785 - Project/data'

benign_cifar.npy	      fgsm_mnist_eps0.5.npy
benign_cifar_train.npy	      pgd_cifar_default_art.npy
benign_mnist.npy	      pgd_cifar_default_torchattacks_new.npy
benign_mnist_train.npy	      pgd_cifar_eps0.1_torchattacks.npy
cwlinf_cifar_default_art.npy  pgd_cifar_eps0.3_alpha0.1_steps7.npy
cwlinf_default_art.npy	      pgd_mnist_default_art.npy
cwlinf_mnist_default_art.npy  pgd_mnist_eps0.3_alpha0.1_steps7.npy
fgsm_cifar_default_art.npy


In [212]:
batch_size = 64

# benign data
benign_imgs_path = "/content/drive/MyDrive/11785 - Project/data/pgd_mnist_default_art.npy"
benign_imgs = np.load(benign_imgs_path, allow_pickle=True).astype(float)
benign_imgs = torch.from_numpy(benign_imgs) # .transpose(0, 3, 1, 2))
shuffle_indices = np.arange(benign_imgs.shape[0])
np.random.shuffle(shuffle_indices)
benign_imgs = benign_imgs[shuffle_indices]
print(f"benign data shape: {benign_imgs.shape}")

# train-test split
split_idx = 9000
benign_train_imgs = benign_imgs[:split_idx]
benign_test_imgs = benign_imgs[split_idx:]

benign data shape: torch.Size([10000, 1, 28, 28])


### Load Base Model - ResNet

In [213]:
! ls "/content/drive/MyDrive/11785 - Project/"

AdversarialDetection.pdf  imgs
cifar10_model.pth	  mnist_model.pth
data			  mnist-resnet-dynamic-adv-trained-model.pt
Experiments.gsheet	  Presentation.gslides


In [214]:
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

resnet_model = resnet32()
resnet_model.to(device)
# checkpoint = torch.load("/content/drive/MyDrive/11785 - Project/resnet-adv-trained-model.pt", map_location=device)
checkpoint = torch.load("/content/drive/MyDrive/11785 - Project/mnist-resnet-dynamic-adv-trained-model.pt", map_location=device)
resnet_model.load_state_dict(checkpoint["model_state_dict"])
# resnet_model.load_state_dict(checkpoint)

<All keys matched successfully>

### Create SubNet

In [215]:
# output size at diff intermediate layers of resnet
interm_layer2dim = {1: 16, 2: 32, 3: 64}
interm_layer = 2

subnet_model = SubNet(interm_layer2dim[interm_layer])
subnet_model.to(device)
subnet_optimizer = torch.optim.Adam(subnet_model.parameters(), lr=0.0001, betas=(0.99, 0.999))
subnet_criterion = nn.BCEWithLogitsLoss()
subnet_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(subnet_optimizer, 'min', factor=0.75, patience=1)

### Run Training

In [216]:
dynamically_train_subnet(
  resnet_model,
  interm_layer,
  subnet_model,
  subnet_optimizer,
  subnet_criterion,
  subnet_scheduler,
  benign_train_imgs,
  benign_test_imgs,
  device,
  epochs=5,
  batch_size=64,
)

  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 0.0104 | Val Loss: 0.6585 | Val Accuracy: 1.0000 | Val ROC: 1.0000


  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 0.0103 | Val Loss: 0.6563 | Val Accuracy: 1.0000 | Val ROC: 1.0000


  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 0.0103 | Val Loss: 0.6531 | Val Accuracy: 1.0000 | Val ROC: 1.0000


  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 0.0102 | Val Loss: 0.6495 | Val Accuracy: 1.0000 | Val ROC: 1.0000


  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 0.0102 | Val Loss: 0.6459 | Val Accuracy: 1.0000 | Val ROC: 1.0000


In [None]:
statically_train_subnet(
    resnet_model,
    interm_layer,
    subnet_model,
    subnet_optimizer,
    subnet_criterion,
    subnet_scheduler,
    unattacked_train_data,
    unattacked_test_data,
    attacked_train_data,
    attacked_test_data,
    device,
    epochs=5,
    batch_size=64,
)

### Combined PGD Attack

In [217]:
from art.attacks.evasion import FastGradientMethod, ProjectedGradientDescentPyTorch, CarliniLInfMethod, SaliencyMapMethod, DeepFool
from art.estimators.classification import PyTorchClassifier
from art.utils import load_mnist, load_cifar10
from art.utils import load_dataset

In [225]:
# benign = load_cifar10()
benign = load_mnist()
# (x_train, y_train), (x_test, y_test), min_pixel_value, max_pixel_value = load_cifar10()
(x_train, y_train), (x_test, y_test), min_pixel_value, max_pixel_value = load_mnist()

In [226]:
class AttackCombined(object):
    r"""
    Base class for all attacks.
    .. note::
        It automatically set device to the device where given model is.
        It basically changes training mode to eval during attack process.
        To change this, please see `set_training_mode`.
    """
    def __init__(self, name, model, detector_model):
        r"""
        Initializes internal attack state.
        Arguments:
            name (str): name of attack.
            model (torch.nn.Module): model to attack.
        """

        self.attack = name
        self.model = model
        self.detector = detector_model
        self.model_name = str(model).split("(")[0]
        self.device = next(model.parameters()).device

        self._attack_mode = 'default'
        self._targeted = False
        self._return_type = 'float'
        self._supported_mode = ['default']

        self._model_training = False
        self._batchnorm_training = False
        self._dropout_training = False

    def forward(self, *input):
        r"""
        It defines the computation performed at every call.
        Should be overridden by all subclasses.
        """
        raise NotImplementedError

    def get_mode(self):
        r"""
        Get attack mode.
        """
        return self._attack_mode

    def set_mode_default(self):
        r"""
        Set attack mode as default mode.
        """
        self._attack_mode = 'default'
        self._targeted = False
        print("Attack mode is changed to 'default.'")

    def set_mode_targeted_by_function(self, target_map_function=None):
        r"""
        Set attack mode as targeted.
        Arguments:
            target_map_function (function): Label mapping function.
                e.g. lambda images, labels:(labels+1)%10.
                None for using input labels as targeted labels. (Default)
        """
        if "targeted" not in self._supported_mode:
            raise ValueError("Targeted mode is not supported.")

        self._attack_mode = 'targeted'
        self._targeted = True
        self._target_map_function = target_map_function
        print("Attack mode is changed to 'targeted.'")

    def set_mode_targeted_least_likely(self, kth_min=1):
        r"""
        Set attack mode as targeted with least likely labels.
        Arguments:
            kth_min (str): label with the k-th smallest probability used as target labels. (Default: 1)
        """
        if "targeted" not in self._supported_mode:
            raise ValueError("Targeted mode is not supported.")

        self._attack_mode = "targeted(least-likely)"
        self._targeted = True
        self._kth_min = kth_min
        self._target_map_function = self._get_least_likely_label
        print("Attack mode is changed to 'targeted(least-likely).'")

    def set_mode_targeted_random(self, n_classses=None):
        r"""
        Set attack mode as targeted with random labels.
        Arguments:
            num_classses (str): number of classes.
        """
        if "targeted" not in self._supported_mode:
            raise ValueError("Targeted mode is not supported.")

        self._attack_mode = "targeted(random)"
        self._targeted = True
        self._n_classses = n_classses
        self._target_map_function = self._get_random_target_label
        print("Attack mode is changed to 'targeted(random).'")

    def set_return_type(self, type):
        r"""
        Set the return type of adversarial images: `int` or `float`.
        Arguments:
            type (str): 'float' or 'int'. (Default: 'float')
        .. note::
            If 'int' is used for the return type, the file size of 
            adversarial images can be reduced (about 1/4 for CIFAR10).
            However, if the attack originally outputs float adversarial images
            (e.g. using small step-size than 1/255), it might reduce the attack
            success rate of the attack.
        """
        if type == 'float':
            self._return_type = 'float'
        elif type == 'int':
            self._return_type = 'int'
        else:
            raise ValueError(type + " is not a valid type. [Options: float, int]")

    def set_training_mode(self, model_training=False, batchnorm_training=False, dropout_training=False):
        r"""
        Set training mode during attack process.
        Arguments:
            model_training (bool): True for using training mode for the entire model during attack process.
            batchnorm_training (bool): True for using training mode for batchnorms during attack process.
            dropout_training (bool): True for using training mode for dropouts during attack process.
        .. note::
            For RNN-based models, we cannot calculate gradients with eval mode.
            Thus, it should be changed to the training mode during the attack.
        """
        self._model_training = model_training
        self._batchnorm_training = batchnorm_training
        self._dropout_training = dropout_training

    def save(self, data_loader, save_path=None, verbose=True, return_verbose=False):
        r"""
        Save adversarial images as torch.tensor from given torch.utils.data.DataLoader.
        Arguments:
            save_path (str): save_path.
            data_loader (torch.utils.data.DataLoader): data loader.
            verbose (bool): True for displaying detailed information. (Default: True)
            return_verbose (bool): True for returning detailed information. (Default: False)
        """
        if (verbose==False) and (return_verbose==True):
            raise ValueError("Verobse should be True if return_verbose==True.")
            
        if save_path is not None:
            image_list = []
            label_list = []

        correct = 0
        total = 0
        l2_distance = []

        total_batch = len(data_loader)

        given_training = self.model.training

        for step, (images, labels) in enumerate(data_loader):
            start = time.time()
            adv_images = self.__call__(images, labels)

            batch_size = len(images)

            if save_path is not None:
                image_list.append(adv_images.cpu())
                label_list.append(labels.cpu())

            if self._return_type == 'int':
                adv_images = adv_images.float()/255

            if verbose:
                with torch.no_grad():
                    if given_training:
                        self.model.eval()
                    outputs = self.model(adv_images)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    right_idx = (predicted == labels.to(self.device))
                    correct += right_idx.sum()
                    end = time.time()
                    delta = (adv_images - images.to(self.device)).view(batch_size, -1)
                    l2_distance.append(torch.norm(delta[~right_idx], p=2, dim=1))

                    rob_acc = 100 * float(correct) / total
                    l2 = torch.cat(l2_distance).mean().item()
                    progress = (step+1)/total_batch*100
                    elapsed_time = end-start
                    self._save_print(progress, rob_acc, l2, elapsed_time, end='\r')

        # To avoid erasing the printed information.
        if verbose:
            self._save_print(progress, rob_acc, l2, elapsed_time, end='\n')

        if save_path is not None:
            x = torch.cat(image_list, 0)
            y = torch.cat(label_list, 0)
            torch.save((x, y), save_path)
            print('- Save complete!')

        if given_training:
            self.model.train()

        if return_verbose:
            return rob_acc, l2, elapsed_time

    def _save_print(self, progress, rob_acc, l2, elapsed_time, end):
        print('- Save progress: %2.2f %% / Robust accuracy: %2.2f %% / L2: %1.5f (%2.3f it/s) \t' \
              % (progress, rob_acc, l2, elapsed_time), end=end)

    def _get_target_label(self, images, labels=None):
        r"""
        Function for changing the attack mode.
        Return input labels.
        """
        if self._target_map_function:
            return self._target_map_function(images, labels)
        raise ValueError('Please define target_map_function.')

    def _get_least_likely_label(self, images, labels=None):
        r"""
        Function for changing the attack mode.
        Return least likely labels.
        """
        outputs = self.model(images)
        if self._kth_min < 0:
            pos = outputs.shape[1] + self._kth_min + 1
        else:
            pos = self._kth_min
        _, target_labels = torch.kthvalue(outputs.data, pos)
        target_labels = target_labels.detach()
        return target_labels.long().to(self.device)

    def _get_random_target_label(self, images, labels=None):
        if self._n_classses is None:
            outputs = self.model(images)
            if labels is None:
                _, labels = torch.max(outputs, dim=1)
            n_classses = outputs.shape[-1]
        else:
            n_classses = self._n_classses

        target_labels = torch.zeros_like(labels)
        for counter in range(labels.shape[0]):
            l = list(range(n_classses))
            l.remove(labels[counter])
            t = self.random_int(0, len(l))
            target_labels[counter] = l[t]

        return target_labels.long().to(self.device)
    
    def random_int(self, low=0, high=1, shape=[1]):
        t = low + (high - low) * torch.rand(shape).to(self.device)
        return t.long()

    def _to_uint(self, images):
        r"""
        Function for changing the return type.
        Return images as int.
        """
        return (images*255).type(torch.uint8)

    def __str__(self):
        info = self.__dict__.copy()

        del_keys = ['model', 'attack']

        for key in info.keys():
            if key[0] == "_":
                del_keys.append(key)

        for key in del_keys:
            del info[key]

        info['attack_mode'] = self._attack_mode
        info['return_type'] = self._return_type

        return self.attack + "(" + ', '.join('{}={}'.format(key, val) for key, val in info.items()) + ")"

    def __call__(self, *input, **kwargs):
        given_training = self.model.training

        if self._model_training:
            self.model.train()
            for _, m in self.model.named_modules():
                if not self._batchnorm_training:
                    if 'BatchNorm' in m.__class__.__name__:
                        m = m.eval()
                if not self._dropout_training:
                    if 'Dropout' in m.__class__.__name__:
                        m = m.eval()

        else:
            self.model.eval()

        images = self.forward(*input, **kwargs)

        if given_training:
            self.model.train()

        if self._return_type == 'int':
            images = self._to_uint(images)

        return images

In [227]:
import torch
import torch.nn as nn

class PGDCombined(AttackCombined):
    r"""
    PGD in the paper 'Towards Deep Learning Models Resistant to Adversarial Attacks'
    [https://arxiv.org/abs/1706.06083]

    Distance Measure : Linf

    Arguments:
        model (nn.Module): model to attack.
        eps (float): maximum perturbation. (Default: 0.3)
        alpha (float): step size. (Default: 2/255)
        steps (int): number of steps. (Default: 40)
        random_start (bool): using random initialization of delta. (Default: True)

    Shape:
        - images: :math:`(N, C, H, W)` where `N = number of batches`, `C = number of channels`,        `H = height` and `W = width`. It must have a range [0, 1].
        - labels: :math:`(N)` where each value :math:`y_i` is :math:`0 \leq y_i \leq` `number of labels`.
        - output: :math:`(N, C, H, W)`.

    Examples::
        >>> attack = torchattacks.PGD(model, eps=8/255, alpha=1/255, steps=40, random_start=True)
        >>> adv_images = attack(images, labels)

    """
    def __init__(
        self,
        base_model,
        detector_model,
        eps=0.3,
        alpha=2/255,
        steps=40,
        random_start=True,
    ):
        super().__init__("PGD", base_model, detector_model)
        self.eps = eps
        self.alpha = alpha
        self.steps = steps
        self.random_start = random_start
        self._supported_mode = ['default', 'targeted']

    def forward(self, images, labels):
        r"""
        Overridden.
        """
        images = images.clone().detach().to(self.device)
        labels = labels.clone().detach().to(self.device)

        if self._targeted:
            target_labels = self._get_target_label(images, labels)

        loss1 = nn.CrossEntropyLoss()
        loss2 = nn.BCEWithLogitsLoss()

        adv_images = images.clone().detach()

        if self.random_start:
            # Starting at a uniformly random point
            adv_images = adv_images + torch.empty_like(adv_images).uniform_(-self.eps, self.eps)
            adv_images = torch.clamp(adv_images, min=0, max=1).detach()

        for _ in range(self.steps):
            adv_images.requires_grad = True
            outputs1 = self.model(adv_images.float())
            interm_outputs = self.model(adv_images.float(), return_interm_layer=interm_layer)
            outputs2 = self.detector(interm_outputs)

            discount_factor = 0.7

            base_classifier_loss = loss1(outputs1, labels)
            subnetwork_loss = loss2(outputs2, torch.ones(labels.shape[0], 1))

            # Calculate loss
            if self._targeted:
                # cost = -loss(outputs, target_labels)
                cost = -base_classifier_loss + -subnetwork_loss
            else:
                # cost = loss(outputs, labels)
                cost = base_classifier_loss + subnetwork_loss
                # cost = subnetwork_loss

            print(f"cost: {cost} | base classifier loss: {base_classifier_loss} | subnet loss: {subnetwork_loss}")

            # Update adversarial images
            grad = torch.autograd.grad(cost, adv_images,
                                       retain_graph=False, create_graph=False)[0]

            adv_images = adv_images.detach() + self.alpha*grad.sign()
            delta = torch.clamp(adv_images - images, min=-self.eps, max=self.eps)
            adv_images = torch.clamp(images + delta, min=0, max=1).detach()

        return adv_images

In [228]:
class SingleAttack(object):
    r"""
    Base class for all attacks.
    .. note::
        It automatically set device to the device where given model is.
        It basically changes training mode to eval during attack process.
        To change this, please see `set_training_mode`.
    """
    def __init__(self, name, model):
        r"""
        Initializes internal attack state.
        Arguments:
            name (str): name of attack.
            model (torch.nn.Module): model to attack.
        """

        self.attack = name
        self.model = model
        self.model_name = str(model).split("(")[0]
        self.device = next(model.parameters()).device

        self._attack_mode = 'default'
        self._targeted = False
        self._return_type = 'float'
        self._supported_mode = ['default']

        self._model_training = False
        self._batchnorm_training = False
        self._dropout_training = False

    def forward(self, *input):
        r"""
        It defines the computation performed at every call.
        Should be overridden by all subclasses.
        """
        raise NotImplementedError

    def get_mode(self):
        r"""
        Get attack mode.
        """
        return self._attack_mode

    def set_mode_default(self):
        r"""
        Set attack mode as default mode.
        """
        self._attack_mode = 'default'
        self._targeted = False
        print("Attack mode is changed to 'default.'")

    def set_mode_targeted_by_function(self, target_map_function=None):
        r"""
        Set attack mode as targeted.
        Arguments:
            target_map_function (function): Label mapping function.
                e.g. lambda images, labels:(labels+1)%10.
                None for using input labels as targeted labels. (Default)
        """
        if "targeted" not in self._supported_mode:
            raise ValueError("Targeted mode is not supported.")

        self._attack_mode = 'targeted'
        self._targeted = True
        self._target_map_function = target_map_function
        print("Attack mode is changed to 'targeted.'")

    def set_mode_targeted_least_likely(self, kth_min=1):
        r"""
        Set attack mode as targeted with least likely labels.
        Arguments:
            kth_min (str): label with the k-th smallest probability used as target labels. (Default: 1)
        """
        if "targeted" not in self._supported_mode:
            raise ValueError("Targeted mode is not supported.")

        self._attack_mode = "targeted(least-likely)"
        self._targeted = True
        self._kth_min = kth_min
        self._target_map_function = self._get_least_likely_label
        print("Attack mode is changed to 'targeted(least-likely).'")

    def set_mode_targeted_random(self, n_classses=None):
        r"""
        Set attack mode as targeted with random labels.
        Arguments:
            num_classses (str): number of classes.
        """
        if "targeted" not in self._supported_mode:
            raise ValueError("Targeted mode is not supported.")

        self._attack_mode = "targeted(random)"
        self._targeted = True
        self._n_classses = n_classses
        self._target_map_function = self._get_random_target_label
        print("Attack mode is changed to 'targeted(random).'")

    def set_return_type(self, type):
        r"""
        Set the return type of adversarial images: `int` or `float`.
        Arguments:
            type (str): 'float' or 'int'. (Default: 'float')
        .. note::
            If 'int' is used for the return type, the file size of 
            adversarial images can be reduced (about 1/4 for CIFAR10).
            However, if the attack originally outputs float adversarial images
            (e.g. using small step-size than 1/255), it might reduce the attack
            success rate of the attack.
        """
        if type == 'float':
            self._return_type = 'float'
        elif type == 'int':
            self._return_type = 'int'
        else:
            raise ValueError(type + " is not a valid type. [Options: float, int]")

    def set_training_mode(self, model_training=False, batchnorm_training=False, dropout_training=False):
        r"""
        Set training mode during attack process.
        Arguments:
            model_training (bool): True for using training mode for the entire model during attack process.
            batchnorm_training (bool): True for using training mode for batchnorms during attack process.
            dropout_training (bool): True for using training mode for dropouts during attack process.
        .. note::
            For RNN-based models, we cannot calculate gradients with eval mode.
            Thus, it should be changed to the training mode during the attack.
        """
        self._model_training = model_training
        self._batchnorm_training = batchnorm_training
        self._dropout_training = dropout_training

    def save(self, data_loader, save_path=None, verbose=True, return_verbose=False):
        r"""
        Save adversarial images as torch.tensor from given torch.utils.data.DataLoader.
        Arguments:
            save_path (str): save_path.
            data_loader (torch.utils.data.DataLoader): data loader.
            verbose (bool): True for displaying detailed information. (Default: True)
            return_verbose (bool): True for returning detailed information. (Default: False)
        """
        if (verbose==False) and (return_verbose==True):
            raise ValueError("Verobse should be True if return_verbose==True.")
            
        if save_path is not None:
            image_list = []
            label_list = []

        correct = 0
        total = 0
        l2_distance = []

        total_batch = len(data_loader)

        given_training = self.model.training

        for step, (images, labels) in enumerate(data_loader):
            start = time.time()
            adv_images = self.__call__(images, labels)

            batch_size = len(images)

            if save_path is not None:
                image_list.append(adv_images.cpu())
                label_list.append(labels.cpu())

            if self._return_type == 'int':
                adv_images = adv_images.float()/255

            if verbose:
                with torch.no_grad():
                    if given_training:
                        self.model.eval()
                    outputs = self.model(adv_images)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    right_idx = (predicted == labels.to(self.device))
                    correct += right_idx.sum()
                    end = time.time()
                    delta = (adv_images - images.to(self.device)).view(batch_size, -1)
                    l2_distance.append(torch.norm(delta[~right_idx], p=2, dim=1))

                    rob_acc = 100 * float(correct) / total
                    l2 = torch.cat(l2_distance).mean().item()
                    progress = (step+1)/total_batch*100
                    elapsed_time = end-start
                    self._save_print(progress, rob_acc, l2, elapsed_time, end='\r')

        # To avoid erasing the printed information.
        if verbose:
            self._save_print(progress, rob_acc, l2, elapsed_time, end='\n')

        if save_path is not None:
            x = torch.cat(image_list, 0)
            y = torch.cat(label_list, 0)
            torch.save((x, y), save_path)
            print('- Save complete!')

        if given_training:
            self.model.train()

        if return_verbose:
            return rob_acc, l2, elapsed_time

    def _save_print(self, progress, rob_acc, l2, elapsed_time, end):
        print('- Save progress: %2.2f %% / Robust accuracy: %2.2f %% / L2: %1.5f (%2.3f it/s) \t' \
              % (progress, rob_acc, l2, elapsed_time), end=end)

    def _get_target_label(self, images, labels=None):
        r"""
        Function for changing the attack mode.
        Return input labels.
        """
        if self._target_map_function:
            return self._target_map_function(images, labels)
        raise ValueError('Please define target_map_function.')

    def _get_least_likely_label(self, images, labels=None):
        r"""
        Function for changing the attack mode.
        Return least likely labels.
        """
        outputs = self.model(images)
        if self._kth_min < 0:
            pos = outputs.shape[1] + self._kth_min + 1
        else:
            pos = self._kth_min
        _, target_labels = torch.kthvalue(outputs.data, pos)
        target_labels = target_labels.detach()
        return target_labels.long().to(self.device)

    def _get_random_target_label(self, images, labels=None):
        if self._n_classses is None:
            outputs = self.model(images)
            if labels is None:
                _, labels = torch.max(outputs, dim=1)
            n_classses = outputs.shape[-1]
        else:
            n_classses = self._n_classses

        target_labels = torch.zeros_like(labels)
        for counter in range(labels.shape[0]):
            l = list(range(n_classses))
            l.remove(labels[counter])
            t = self.random_int(0, len(l))
            target_labels[counter] = l[t]

        return target_labels.long().to(self.device)
    
    def random_int(self, low=0, high=1, shape=[1]):
        t = low + (high - low) * torch.rand(shape).to(self.device)
        return t.long()

    def _to_uint(self, images):
        r"""
        Function for changing the return type.
        Return images as int.
        """
        return (images*255).type(torch.uint8)

    def __str__(self):
        info = self.__dict__.copy()

        del_keys = ['model', 'attack']

        for key in info.keys():
            if key[0] == "_":
                del_keys.append(key)

        for key in del_keys:
            del info[key]

        info['attack_mode'] = self._attack_mode
        info['return_type'] = self._return_type

        return self.attack + "(" + ', '.join('{}={}'.format(key, val) for key, val in info.items()) + ")"

    def __call__(self, *input, **kwargs):
        given_training = self.model.training

        if self._model_training:
            self.model.train()
            for _, m in self.model.named_modules():
                if not self._batchnorm_training:
                    if 'BatchNorm' in m.__class__.__name__:
                        m = m.eval()
                if not self._dropout_training:
                    if 'Dropout' in m.__class__.__name__:
                        m = m.eval()

        else:
            self.model.eval()

        images = self.forward(*input, **kwargs)

        if given_training:
            self.model.train()

        if self._return_type == 'int':
            images = self._to_uint(images)

        return images

class SinglePGD(SingleAttack):
    r"""
    PGD in the paper 'Towards Deep Learning Models Resistant to Adversarial Attacks'
    [https://arxiv.org/abs/1706.06083]

    Distance Measure : Linf

    Arguments:
        model (nn.Module): model to attack.
        eps (float): maximum perturbation. (Default: 0.3)
        alpha (float): step size. (Default: 2/255)
        steps (int): number of steps. (Default: 40)
        random_start (bool): using random initialization of delta. (Default: True)

    Shape:
        - images: :math:`(N, C, H, W)` where `N = number of batches`, `C = number of channels`,        `H = height` and `W = width`. It must have a range [0, 1].
        - labels: :math:`(N)` where each value :math:`y_i` is :math:`0 \leq y_i \leq` `number of labels`.
        - output: :math:`(N, C, H, W)`.

    Examples::
        >>> attack = torchattacks.PGD(model, eps=8/255, alpha=1/255, steps=40, random_start=True)
        >>> adv_images = attack(images, labels)

    """
    def __init__(
        self,
        model,
        eps=0.3,
        alpha=2/255,
        steps=40,
        random_start=True,
    ):
        super().__init__("PGD", model)
        self.eps = eps
        self.alpha = alpha
        self.steps = steps
        self.random_start = random_start
        self._supported_mode = ['default', 'targeted']

    def forward(self, images, labels):
        r"""
        Overridden.
        """
        images = images.clone().detach().to(self.device)
        labels = labels.clone().detach().to(self.device)

        if self._targeted:
            target_labels = self._get_target_label(images, labels)

        # loss1 = nn.CrossEntropyLoss()
        loss = nn.BCEWithLogitsLoss()

        adv_images = images.clone().detach()

        if self.random_start:
            # Starting at a uniformly random point
            adv_images = adv_images + torch.empty_like(adv_images).uniform_(-self.eps, self.eps)
            adv_images = torch.clamp(adv_images, min=0, max=1).detach()

        for _ in range(self.steps):
            adv_images.requires_grad = True
            outputs = self.model(adv_images)

            # Calculate loss
            if self._targeted:
                cost = -loss(outputs, target_labels)
            else:
                cost = loss(outputs, labels)

            print(f"Single Attack on SubNet Cost: {cost}")

            # Update adversarial images
            grad = torch.autograd.grad(cost, adv_images,
                                       retain_graph=False, create_graph=False)[0]

            adv_images = adv_images.detach() + self.alpha*grad.sign()
            delta = torch.clamp(adv_images - images, min=-self.eps, max=self.eps)
            adv_images = torch.clamp(images + delta, min=0, max=1).detach()

        return adv_images

In [232]:
combined_attack = PGDCombined(resnet_model.to('cpu'), subnet_model.to('cpu'), eps=8/255, alpha=8/(255*10), steps=100)
# combined_attack = SinglePGD(subnet_model.to('cpu'), eps=4/255, alpha=4/(255*40), steps=100)

In [233]:
adv_inputs = combined_attack(
    torch.from_numpy(x_test[:10]).permute(0, 3, 1, 2),
    torch.from_numpy(y_test[:10])
)

cost: 0.5153104066848755 | base classifier loss: 0.03767472505569458 | subnet loss: 0.4776357114315033
cost: 0.550883948802948 | base classifier loss: 0.0731065571308136 | subnet loss: 0.4777773916721344
cost: 0.6176415681838989 | base classifier loss: 0.13970905542373657 | subnet loss: 0.47793254256248474
cost: 0.7362610697746277 | base classifier loss: 0.2581440806388855 | subnet loss: 0.4781169891357422
cost: 0.9072962999343872 | base classifier loss: 0.4289325177669525 | subnet loss: 0.4783638119697571
cost: 1.1168028116226196 | base classifier loss: 0.6381542682647705 | subnet loss: 0.4786485731601715
cost: 1.337699055671692 | base classifier loss: 0.8587648272514343 | subnet loss: 0.47893422842025757
cost: 1.5390058755874634 | base classifier loss: 1.0598077774047852 | subnet loss: 0.47919806838035583
cost: 1.7103347778320312 | base classifier loss: 1.2308595180511475 | subnet loss: 0.47947531938552856
cost: 1.841158151626587 | base classifier loss: 1.361417531967163 | subnet los

In [None]:
adv_inputs.shape

In [None]:
# plt.imshow(adv_inputs[0].squeeze(0))
plt.imshow(adv_inputs[0].permute(1, 2, 0))

In [None]:
plt.imshow(x_test[0])

In [134]:
torch.argmax(resnet_model(adv_inputs[0].unsqueeze(0).float()))

tensor(5)

In [135]:
torch.argmax(resnet_model(torch.from_numpy(x_test[0]).permute(2, 0, 1).unsqueeze(0).float()))

tensor(3)

In [147]:
subnet_model(resnet_model(adv_inputs[1].unsqueeze(0).float(), return_interm_layer=2))

tensor([[0.4482]], grad_fn=<SigmoidBackward0>)

In [140]:
subnet_model(resnet_model(torch.from_numpy(x_test[1]).permute(2, 0, 1).unsqueeze(0).float(), return_interm_layer=2))

tensor([[0.4460]], grad_fn=<SigmoidBackward0>)