# MNIST: Training and Testing on a Clean Dataset & Adversarial Detection

## Imports and MNIST loading

In [1]:
# Imports all the module paths
import sys

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from tqdm.notebook import tnrange, tqdm

sys.path.append("../../")

# Loads the rest of the modules

# File containing all the required training methods
import defences.mnist as defences

# For testing
import utils.clean_test as clean_test

# Contains the data loaders
import utils.dataloaders as dataloaders

# For printing outcomes
# import utils.printing as printing

# Example printing, but I removed it to simplify results
# for epsilon in epsilons:
#     printing.print_attack(
#         model,
#         testSetLoader,
#         "FGSM",
#         attacks["FGSM"],
#         epsilon=epsilon,
#     )

Notebook will use PyTorch Device: CUDA
Notebook will use PyTorch Device: CUDA
Notebook will use PyTorch Device: CUDA
Notebook will use PyTorch Device: CUDA


In [2]:
# Choose the device PyTorch will run on: the GPU when CUDA is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Notebook will use PyTorch Device: " + device.upper())

Notebook will use PyTorch Device: CUDA


## Load the dataset

In [3]:
# Directory handed to the project's MNIST loader helper
DATA_ROOT = "../../datasets/"

# Build the MNIST loaders. The helper returns three values; the middle one is
# discarded here (presumably the validation loader, which is requested with
# size 0 - confirm in utils.dataloaders).
trainSetLoader, _, testSetLoader = dataloaders.get_MNIST_data_loaders(
    DATA_ROOT,
    trainSetSize=50000,
    validationSetSize=0,
    batchSize=128,
)

## Attacks and Their Results

In [4]:
# A dictionary of the available attacks, keyed by display name (for nice printing).
# Some attacks use a helper library.
import torchattacks

import attacks.fgsm as fgsm
import attacks.ifgsm as ifgsm
import attacks.pgd as pgd
import utils.attacking as attacking

# Loss handed to the attack functions further down the notebook
loss_function = nn.CrossEntropyLoss()

# Attack functions keyed by the name used when reporting results
attacks = {
    "FGSM": fgsm.fgsm_attack,
    "I-FGSM": ifgsm.ifgsm_attack,
    "PGD": pgd.pgd_attack,
}

## Load two models (standard and PGD trained)

In [26]:
# Load the two fully-pickled models and switch them to inference mode.
#
# `map_location=device` remaps the stored tensors onto the device selected
# earlier, so a checkpoint saved on a GPU still loads when the notebook falls
# back to the CPU, and the models end up on the same device as the batches
# that are later moved with `.to(device)`.
#
# NOTE(security): `torch.load` unpickles arbitrary Python objects - only load
# checkpoint files from a trusted source.
standard_model = torch.load(
    "../../models_data/MNIST/mnist_standard", map_location=device
)
standard_model.eval()

pgd_model = torch.load("../../models_data/MNIST/mnist_pgd", map_location=device)
# Last expression of the cell: `.eval()` returns the module, so its
# architecture is displayed as the cell output.
pgd_model.eval()

LeNet5(
  (conv1): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (max_pool_1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (max_pool_2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=10, bias=True)
)

In [27]:
# Evaluate the two loaded models on the clean test set; each call prints the
# accuracy it measured (see the cell output).
clean_test.test_trained_model(standard_model, testSetLoader)
clean_test.test_trained_model(pgd_model, testSetLoader)

Testing the model...


Testing Progress:   0%|          | 0/79 [00:00<?, ?it/s]

... done! Accuracy: 99.14%
Testing the model...


Testing Progress:   0%|          | 0/79 [00:00<?, ?it/s]

... done! Accuracy: 99.27%


In [28]:
# Directory holding the saved MNIST models
SAVE_LOAD_ROOT = "../../models_data/MNIST"

# Obtain the standard-trained model. With load_if_available=True an already
# saved model at `load_path` is reused (the cell output reports
# "Found already trained model... loaded!"); presumably the model is trained
# from `trainSetLoader` otherwise - confirm in defences.mnist.
model = defences.standard_training(
    trainSetLoader,
    load_if_available=True,
    load_path=SAVE_LOAD_ROOT + "/mnist_standard_with_feature_list",
)

Found already trained model...
... loaded!


In [29]:
# Test the model obtained from `defences.standard_training` on the clean test
# set; the helper prints the resulting accuracy.
clean_test.test_trained_model(model, testSetLoader)

Testing the model...


Testing Progress:   0%|          | 0/79 [00:00<?, ?it/s]

... done! Accuracy: 99.23%


In [30]:
# Save the model: the whole model object (not only its state_dict) is pickled
# to the same path that was passed as `load_path` when obtaining `model`.
torch.save(model, SAVE_LOAD_ROOT + "/mnist_standard_with_feature_list")

In [31]:
# A dictionary of the available attacks, keyed by display name (for nice printing).
# Some attacks use a helper library.
import torchattacks

import attacks.fgsm as fgsm
import attacks.ifgsm as ifgsm
import attacks.pgd as pgd
import utils.attacking as attacking

# Attack functions keyed by the name used when reporting results
attacks = {
    "FGSM": fgsm.fgsm_attack,
    "I-FGSM": ifgsm.ifgsm_attack,
    "PGD": pgd.pgd_attack,
}

## Classification score approach for detecting adversarial example in deep neural network
https://link.springer.com/article/10.1007/s11042-020-09167-z

## Standard Model

In [32]:
# Classification-score detector on the clean test set (standard model).
#
# An input is rejected as a suspected adversarial example when the gap between
# its two largest logits is below `threshold`; accuracy is then measured on
# the accepted inputs only.
threshold = 0.1

# Running totals over the whole test set (plain Python ints)
accepted = 0
rejected = 0
correct = 0

# Clean inputs need no gradients, so avoid building the autograd graph
with torch.no_grad():
    # Use a pretty progress bar to show updates
    for images, labels in tqdm(testSetLoader, desc="Testing Progress", leave=False):
        # Move the batch to the selected device
        images, labels = images.to(device), labels.to(device)

        # Predict
        logits = standard_model(images)

        # Two largest logits per image (sorted in descending order) together
        # with the classes they belong to; column 0 is the predicted class.
        top2_values, top2_classes = torch.topk(logits, 2, dim=1)

        # Score gap between the winning class and the runner-up, for the
        # whole batch at once instead of one image at a time.
        margin = top2_values[:, 0] - top2_values[:, 1]
        is_rejected = margin < threshold
        is_accepted = ~is_rejected

        rejected += int(is_rejected.sum())
        accepted += int(is_accepted.sum())
        correct += int(
            (top2_classes[:, 0][is_accepted] == labels[is_accepted]).sum()
        )

# Accuracy is undefined when every input was rejected; report NaN instead of
# failing with a ZeroDivisionError.
accuracy = float(correct) * 100 / accepted if accepted else float("nan")

print(
    "... done! Rejected {}, Accepted {}, Accuracy: {}%".format(
        rejected, accepted, accuracy
    )
)

Testing Progress:   0%|          | 0/79 [00:00<?, ?it/s]

... done! Rejected 1, Accepted 9999, Accuracy: 99.14991499149914%


In [39]:
# Classification-score detector on FGSM-perturbed test images (standard model).
#
# An input is rejected as a suspected adversarial example when the gap between
# its two largest logits is below `threshold`; accuracy is then measured on
# the accepted (i.e. undetected) inputs only.
threshold = 0.4

# Running totals over the whole test set (plain Python ints)
accepted = 0
rejected = 0
correct = 0

# Use a pretty progress bar to show updates
for images, labels in tqdm(testSetLoader, desc="Testing Progress", leave=False):
    # Move the batch to the selected device
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack. The attack is given the model and
    # the loss function, so it stays outside the no_grad block below.
    perturbed_images = fgsm.fgsm_attack(
        images,
        labels,
        standard_model,
        loss_function,
        epsilon=0.35,
        alpha=None,
        scale=True,
        iterations=None,
    )

    # Scoring the perturbed batch needs no gradients
    with torch.no_grad():
        # Predict
        logits = standard_model(perturbed_images)

        # Two largest logits per image (sorted in descending order) together
        # with the classes they belong to; column 0 is the predicted class.
        top2_values, top2_classes = torch.topk(logits, 2, dim=1)

        # Score gap between the winning class and the runner-up, for the
        # whole batch at once instead of one image at a time.
        margin = top2_values[:, 0] - top2_values[:, 1]
        is_rejected = margin < threshold
        is_accepted = ~is_rejected

        rejected += int(is_rejected.sum())
        accepted += int(is_accepted.sum())
        correct += int(
            (top2_classes[:, 0][is_accepted] == labels[is_accepted]).sum()
        )

# Accuracy is undefined when every input was rejected; report NaN instead of
# failing with a ZeroDivisionError.
accuracy = float(correct) * 100 / accepted if accepted else float("nan")

print(
    "... done! Rejected {}, Accepted {}, Accuracy: {}%".format(
        rejected, accepted, accuracy
    )
)

Testing Progress:   0%|          | 0/79 [00:00<?, ?it/s]

... done! Rejected 791, Accepted 9209, Accuracy: 4.104680204148116%


## PGD Model

In [36]:
# Classification-score detector on the clean test set (PGD-trained model).
#
# An input is rejected as a suspected adversarial example when the gap between
# its two largest logits is below `threshold`; accuracy is then measured on
# the accepted inputs only.
threshold = 0.1

# Running totals over the whole test set (plain Python ints)
accepted = 0
rejected = 0
correct = 0

# Clean inputs need no gradients, so avoid building the autograd graph
with torch.no_grad():
    # Use a pretty progress bar to show updates
    for images, labels in tqdm(testSetLoader, desc="Testing Progress", leave=False):
        # Move the batch to the selected device
        images, labels = images.to(device), labels.to(device)

        # Predict
        logits = pgd_model(images)

        # Two largest logits per image (sorted in descending order) together
        # with the classes they belong to; column 0 is the predicted class.
        top2_values, top2_classes = torch.topk(logits, 2, dim=1)

        # Score gap between the winning class and the runner-up, for the
        # whole batch at once instead of one image at a time.
        margin = top2_values[:, 0] - top2_values[:, 1]
        is_rejected = margin < threshold
        is_accepted = ~is_rejected

        rejected += int(is_rejected.sum())
        accepted += int(is_accepted.sum())
        correct += int(
            (top2_classes[:, 0][is_accepted] == labels[is_accepted]).sum()
        )

# Accuracy is undefined when every input was rejected; report NaN instead of
# failing with a ZeroDivisionError.
accuracy = float(correct) * 100 / accepted if accepted else float("nan")

print(
    "... done! Rejected {}, Accepted {}, Accuracy: {}%".format(
        rejected, accepted, accuracy
    )
)

Testing Progress:   0%|          | 0/79 [00:00<?, ?it/s]

... done! Rejected 6, Accepted 9994, Accuracy: 99.28957374424655%


In [40]:
# Classification-score detector on FGSM-perturbed test images (PGD-trained model).
#
# An input is rejected as a suspected adversarial example when the gap between
# its two largest logits is below `threshold`; accuracy is then measured on
# the accepted (i.e. undetected) inputs only.
threshold = 0.25

# Running totals over the whole test set (plain Python ints)
accepted = 0
rejected = 0
correct = 0

# Use a pretty progress bar to show updates
for images, labels in tqdm(testSetLoader, desc="Testing Progress", leave=False):
    # Move the batch to the selected device
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack. The attack is given the model and
    # the loss function, so it stays outside the no_grad block below.
    # NOTE(review): the perturbation is crafted against `standard_model` while
    # the predictions below come from `pgd_model`. Confirm whether this
    # cross-model setup is intended or `pgd_model` should be attacked here.
    perturbed_images = fgsm.fgsm_attack(
        images,
        labels,
        standard_model,
        loss_function,
        epsilon=0.35,
        alpha=None,
        scale=True,
        iterations=None,
    )

    # Scoring the perturbed batch needs no gradients
    with torch.no_grad():
        # Predict
        logits = pgd_model(perturbed_images)

        # Two largest logits per image (sorted in descending order) together
        # with the classes they belong to; column 0 is the predicted class.
        top2_values, top2_classes = torch.topk(logits, 2, dim=1)

        # Score gap between the winning class and the runner-up, for the
        # whole batch at once instead of one image at a time.
        margin = top2_values[:, 0] - top2_values[:, 1]
        is_rejected = margin < threshold
        is_accepted = ~is_rejected

        rejected += int(is_rejected.sum())
        accepted += int(is_accepted.sum())
        correct += int(
            (top2_classes[:, 0][is_accepted] == labels[is_accepted]).sum()
        )

# Accuracy is undefined when every input was rejected; report NaN instead of
# failing with a ZeroDivisionError.
accuracy = float(correct) * 100 / accepted if accepted else float("nan")

print(
    "... done! Rejected {}, Accepted {}, Accuracy: {}%".format(
        rejected, accepted, accuracy
    )
)

Testing Progress:   0%|          | 0/79 [00:00<?, ?it/s]

... done! Rejected 822, Accepted 9178, Accuracy: 29.58160819350621%


## Maximum Mean Discrepancy Test is Aware of Adversarial Attacks
https://arxiv.org/abs/2010.11415

In [42]:
# Not implemented due to breaking changes

In [41]:
# First, extract semantic features from the trained model
# Note: only the penultimate layer's output is needed (so it is not too painful)
# Train a model that also returns the penultimate layer

## A Simple Unified Framework for Detecting Out-of-Distribution Samples and Adversarial Attacks
https://arxiv.org/abs/1807.03888

## Characterizing Adversarial Subspaces Using Local Intrinsic Dimensionality
https://arxiv.org/abs/1801.02613

In [None]:
# SAMMD two-sample test between natural and adversarial images.
#
# NOTE(review): this cell has never been executed (no execution count) and had
# lost its indentation; the loop bodies below were reconstructed from how the
# loop variables are used - confirm against the code it was pasted from.
# NOTE(review): the cell sits under the LID heading, but the code trains and
# evaluates SAMMD, which belongs to the Maximum Mean Discrepancy section.
# NOTE(review): the following names are not defined anywhere in this notebook
# and must be provided before the cell can run: data_all, label_all, Ind_all,
# data_trans, Ind_v4_all, data_org, data_trans_org, N1, N, N_per, kk, opt,
# alpha, dtype, cuda, Pdist2, MatConvert, MMDu, SAMMD_WB.

# Collect natural images
Ind_tr = np.random.choice(len(data_all), N1, replace=False)
Ind_te = np.delete(Ind_all, Ind_tr)
train_data = []
for i in Ind_tr:
    train_data.append([data_all[i], label_all[i]])

dataloader = torch.utils.data.DataLoader(
    train_data,
    batch_size=opt.batch_size,
    shuffle=True,
)

# Collect adv images
np.random.seed(seed=819 * (kk + 9) + N1)
Ind_tr_v4 = np.random.choice(len(data_trans), N1, replace=False)
Ind_te_v4 = np.delete(Ind_v4_all, Ind_tr_v4)
New_CIFAR_tr = data_trans[Ind_tr_v4]
New_CIFAR_te = data_trans[Ind_te_v4]

# Initialize optimizers
# Fetch training data: first N1 rows from data_all, last N1 rows from data_trans
s1 = data_all[Ind_tr]
s2 = data_trans[Ind_tr_v4]
S = torch.cat([s1.cpu(), s2.cpu()], 0).cuda()
Sv = S.view(2 * N1, -1)

# Same selection taken from data_org / data_trans_org, flattened the same way
s1 = data_org[Ind_tr]
s2 = data_trans_org[Ind_tr_v4]
S = torch.cat([s1.cpu(), s2.cpu()], 0).cuda()
S_FEA = S.view(2 * N1, -1)

# Train SAMMD

np.random.seed(seed=1102)
torch.manual_seed(1102)
torch.cuda.manual_seed(1102)
Dxy = Pdist2(Sv[:N1, :], Sv[N1:, :])
Dxy_org = Pdist2(S_FEA[:N1, :], S_FEA[N1:, :])
epsilonOPT = torch.log(MatConvert(np.random.rand(1) * 10 ** (-10), device, dtype))
epsilonOPT.requires_grad = True
sigma0 = Dxy.median()
sigma0.requires_grad = True
sigmaOPT = MatConvert(np.ones(1) * np.sqrt(2 * 32 * 32), device, dtype)
sigmaOPT.requires_grad = True


# The three kernel parameters are optimised jointly
optimizer_sigma0 = torch.optim.Adam([sigma0]+[sigmaOPT]+[epsilonOPT], lr=0.0002)
for t in range(opt.n_epochs):
    # Sigmoid of epsilonOPT
    ep = torch.exp(epsilonOPT) / (1 + torch.exp(epsilonOPT))
    sigma = sigmaOPT ** 2
    TEMPa = MMDu(Sv, N1, S_FEA, sigma, sigma0, ep, is_smooth=True)
    # The negated ratio is minimised, i.e. TEMPa[0] / sqrt(TEMPa[1]) is maximised
    mmd_value_tempa = -1 * (TEMPa[0] + 10 ** (-8))
    mmd_std_tempa = torch.sqrt(TEMPa[1] + 10 ** (-8))
    STAT_adaptive = torch.div(mmd_value_tempa, mmd_std_tempa)
    optimizer_sigma0.zero_grad()
    STAT_adaptive.backward(retain_graph=True)
    optimizer_sigma0.step()
    if t % 100 == 0:
        print("mmd: ", -1 * mmd_value_tempa.item(), "mmd_std: ", mmd_std_tempa.item(), "Statistic: ",
              -1 * STAT_adaptive.item())
# NOTE(review): placed after the training loop because it does not depend on `t` - confirm
Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor

# Compute test power of MMD-D and baselines

H_adaptive = np.zeros(N)
T_adaptive = np.zeros(N)
M_adaptive = np.zeros(N)

np.random.seed(1102)
count_adp = 0

for k in range(N):
    # Fetch test data
    np.random.seed(seed=1102 * (k + 1) + N1)
    data_all_te = data_all[Ind_te]
    N_te = len(data_trans)-N1
    Ind_N_te = np.random.choice(len(Ind_te), N_te, replace=False)#9900
    s1 = data_all_te[Ind_N_te]
    s2 = data_trans[Ind_te_v4]
    S = torch.cat([s1.cpu(), s2.cpu()], 0).cuda()
    Sv = S.view(2 * N_te, -1)

    data_all_te = data_org[Ind_te]
    s1 = data_all_te[Ind_N_te]
    s2 = data_trans_org[Ind_te_v4]
    S = torch.cat([s1.cpu(), s2.cpu()], 0).cuda()
    S_FEA = S.view(2 * N_te, -1)

    h_adaptive, threshold_adaptive, mmd_value_adaptive = SAMMD_WB(Sv, N_per, N_te, S_FEA, sigma, sigma0, ep, alpha, device, dtype)

    # Gather results

    count_adp = count_adp + h_adaptive

    print("SAMMD:", count_adp)

    H_adaptive[k] = h_adaptive
    T_adaptive[k] = threshold_adaptive
    M_adaptive[k] = mmd_value_adaptive