# CIFAR10: Training and Testing on a Clean Dataset & Adversarial Detection

## Imports and CIFAR10 loading

In [1]:
# Imports all the module paths
import sys

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from tqdm.notebook import tnrange, tqdm

sys.path.append("../../")

# Loads the rest of the modules

# For testing
import utils.clean_test as clean_test

# Contains the data loadders
import utils.dataloaders as dataloaders

# For printing outcomes
# import utils.printing as printing

# Example printing, but I removed it to simplify results
# for epsilon in epsilons:
#     printing.print_attack(
#         model,
#         testSetLoader,
#         "FGSM",
#         attacks["FGSM"],
#         epsilon=epsilon,
#     )

In [2]:
# Define the `device` PyTorch will be running on, please hope it is CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Notebook will use PyTorch Device: " + device.upper())

Notebook will use PyTorch Device: CUDA


## Load the dataset

In [3]:
DATA_ROOT = "../../datasets/CIFAR10/"

trainSetLoader, _, testSetLoader = dataloaders.get_CIFAR10_data_loaders(
    DATA_ROOT,
    trainSetSize=50000,
    validationSetSize=0,
    batchSize=32,
)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


## Attacks and Their Results

In [4]:
# A possible attacks array (for nice printing):
# Some attacks use a helper library
import torchattacks

import attacks.fgsm as fgsm
import attacks.ifgsm as ifgsm
import attacks.pgd as pgd
import utils.attacking as attacking

loss_function = nn.CrossEntropyLoss()
attacks = {}

attacks["FGSM"] = fgsm.fgsm_attack
attacks["I-FGSM"] = ifgsm.ifgsm_attack
attacks["PGD"] = pgd.pgd_attack

## Load two models (standard and FGSM trained)

In [5]:
standard_model = torch.load("../../models_data/CIFAR10/cifar10_standard")
standard_model.eval()

pgd_model = torch.load("../../models_data/CIFAR10/cifar10_pgd")
pgd_model.eval()

DataParallel(
  (module): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (layer1): Sequential(
      (0): ResNetBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (shortcut): Sequential()
      )
      (1): ResNetBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
     

In [6]:
# Evaluate the two models
# clean_test.test_trained_model(standard_model, testSetLoader)
# clean_test.test_trained_model(pgd_model, testSetLoader)

In [7]:
# A possible attacks array (for nice printing):
# Some attacks use a helper library
import torchattacks

import attacks.fgsm as fgsm
import attacks.ifgsm as ifgsm
import attacks.pgd as pgd
import utils.attacking as attacking

attacks = {}

attacks["FGSM"] = fgsm.fgsm_attack
attacks["I-FGSM"] = ifgsm.ifgsm_attack
attacks["PGD"] = pgd.pgd_attack

## Classification score approach for detecting adversarial example in deep neural network
https://link.springer.com/article/10.1007/s11042-020-09167-z

## Standard Model

In [8]:
# Here you decide on the threshold for the clean dataset
threshold = 0.1

# Rejected
accepted = 0
rejected = 0
correct = 0

# Use a pretty progress bar to show updates
for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Predict
    with torch.no_grad():
        logits = standard_model(images)

    # The highest class represents the chosen class (input, k, dimension)
    _, preds = torch.topk(logits, 2, 1)

    # Check each image and see if it is adversarial
    for index in range(len(images)):
        max_index = preds[index][0]
        sec_index = preds[index][1]

        diff = logits[index][max_index] - logits[index][sec_index]

        if diff < threshold:
            rejected += 1
        else:
            accepted += 1
            correct += max_index == labels[index]


print(
    "... done! Rejected {}, Accepted {}, Accuracy: {}%".format(
        rejected, accepted, float(correct) * 100 / accepted
    )
)

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

... done! Rejected 21, Accepted 9979, Accuracy: 95.37027758292415%


In [9]:
# Here you decide on the threshold for the clean dataset
threshold = 0.4

# Rejected
accepted = 0
rejected = 0
correct = 0

# Use a pretty progress bar to show updates
for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = fgsm.fgsm_attack(
        images,
        labels,
        standard_model,
        loss_function,
        epsilon=0.35,
        alpha=None,
        scale=True,
        iterations=None,
    )

    # Predict
    with torch.no_grad():
        logits = standard_model(perturbed_images)

    # The highest class represents the chosen class (input, k, dimension)
    _, preds = torch.topk(logits, 2, 1)

    # Check each image and see if it is adversarial
    for index in range(len(images)):
        max_index = preds[index][0]
        sec_index = preds[index][1]

        diff = logits[index][max_index] - logits[index][sec_index]

        if diff < threshold:
            rejected += 1
        else:
            accepted += 1
            correct += max_index == labels[index]

print(
    "... done! Rejected {}, Accepted {}, Accuracy: {}%".format(
        rejected, accepted, float(correct) * 100 / accepted
    )
)

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

... done! Rejected 437, Accepted 9563, Accuracy: 9.09756352609014%


## PGD Model

In [10]:
# Here you decide on the threshold for the clean dataset
threshold = 0.1

# Rejected
accepted = 0
rejected = 0
correct = 0

# Use a pretty progress bar to show updates
for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Predict
    with torch.no_grad():
        logits = pgd_model(images)

    # The highest class represents the chosen class (input, k, dimension)
    _, preds = torch.topk(logits, 2, 1)

    # Check each image and see if it is adversarial
    for index in range(len(images)):
        max_index = preds[index][0]
        sec_index = preds[index][1]

        diff = logits[index][max_index] - logits[index][sec_index]

        if diff < threshold:
            rejected += 1
        else:
            accepted += 1
            correct += max_index == labels[index]

print(
    "... done! Rejected {}, Accepted {}, Accuracy: {}%".format(
        rejected, accepted, float(correct) * 100 / accepted
    )
)

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

... done! Rejected 95, Accepted 9905, Accuracy: 85.13881877839475%


In [11]:
# Here you decide on the threshold for the clean dataset
threshold = 0.25

# Rejected
accepted = 0
rejected = 0
correct = 0

# Use a pretty progress bar to show updates
for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = fgsm.fgsm_attack(
        images,
        labels,
        pgd_model,
        loss_function,
        epsilon=0.35,
        alpha=None,
        scale=True,
        iterations=None,
    )

    # Predict
    with torch.no_grad():
        logits = pgd_model(perturbed_images)

    # The highest class represents the chosen class (input, k, dimension)
    _, preds = torch.topk(logits, 2, 1)

    # Check each image and see if it is adversarial
    for index in range(len(images)):
        max_index = preds[index][0]
        sec_index = preds[index][1]

        diff = logits[index][max_index] - logits[index][sec_index]

        if diff < threshold:
            rejected += 1
        else:
            accepted += 1
            correct += max_index == labels[index]

print(
    "... done! Rejected {}, Accepted {}, Accuracy: {}%".format(
        rejected, accepted, float(correct) * 100 / accepted
    )
)

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

... done! Rejected 818, Accepted 9182, Accuracy: 22.936179481594422%


## Maximum Mean Discrepancy Test is Aware of Adversarial Attacks
https://arxiv.org/abs/2010.11415

In [12]:
# Not implemented due to breaking changes

In [13]:
# Firstly extract semantic features from trained model
# Note: we just need the penultimate layer (so not complete pain)
# Train a model that also returns penultimate layer

## A Simple Unified Framework for Detecting Out-of-Distribution Samples and Adversarial Attacks
https://arxiv.org/abs/1807.03888

## Characterizing Adversarial Subspaces Using Local Intrinsic Dimensionality
https://arxiv.org/abs/1801.02613

## PCA Detection

In [24]:
from sklearn.decomposition import PCA

# Copy the MNIST data and then fit using PCA
# First convert to numpy arrays (and make it float)
numpyTrainingData = trainSetLoader.dataset.data.astype("float32")
# Note you also need to reshape the input data for your sanity
reshapedNumpyTrainingData = numpyTrainingData.reshape(
    (len(numpyTrainingData), 32 * 32 * 3)
)

# Then perform PCA on training data to get principal components
# Note it should reflect dimension of image, i.e. 28 * 28
pca = PCA(n_components=32 * 32 * 3).fit(reshapedNumpyTrainingData)

In [25]:
# Now on clean data check if there are any adversarial samples
numpyTestData = testSetLoader.dataset.data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = standard_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = standard_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
0


In [26]:
# Now do the same on adversarial data check if there are any adversarial samples
# Use a pretty progress bar to show updates
data = []

for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = fgsm.fgsm_attack(
        images,
        labels,
        pgd_model,
        loss_function,
        epsilon=0.75,
        alpha=None,
        scale=True,
        iterations=None,
    )

    for perturbed_image in perturbed_images:
        data.append(perturbed_image.detach().cpu().numpy())

data = np.asarray(data)
numpyTestData = data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = standard_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = standard_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
9398


In [27]:
# Now do the same on adversarial data check if there are any adversarial samples
# Use a pretty progress bar to show updates
data = []

for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = fgsm.fgsm_attack(
        images,
        labels,
        pgd_model,
        loss_function,
        epsilon=0.01,
        alpha=None,
        scale=True,
        iterations=None,
    )

    for perturbed_image in perturbed_images:
        data.append(perturbed_image.detach().cpu().numpy())

data = np.asarray(data)
numpyTestData = data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = standard_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = standard_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
9002


In [28]:
# Now do the same on adversarial data check if there are any adversarial samples
# Use a pretty progress bar to show updates
data = []

for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = fgsm.fgsm_attack(
        images,
        labels,
        pgd_model,
        loss_function,
        epsilon=0.25,
        alpha=None,
        scale=True,
        iterations=None,
    )

    for perturbed_image in perturbed_images:
        data.append(perturbed_image.detach().cpu().numpy())

data = np.asarray(data)
numpyTestData = data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = standard_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = standard_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
8962


In [29]:
cw_attack = torchattacks.CW(standard_model, c=1, steps=100)

In [30]:
# Now do the same on adversarial data check if there are any adversarial samples
# Use a pretty progress bar to show updates
data = []

for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = cw_attack(
        images,
        labels,
    )

    for perturbed_image in perturbed_images:
        data.append(perturbed_image.detach().cpu().numpy())

data = np.asarray(data)
numpyTestData = data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = standard_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = standard_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
9267


In [31]:
cw_attack = torchattacks.CW(standard_model, c=1, steps=500)

In [32]:
# Now do the same on adversarial data check if there are any adversarial samples
# Use a pretty progress bar to show updates
data = []

for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = cw_attack(
        images,
        labels,
    )

    for perturbed_image in perturbed_images:
        data.append(perturbed_image.detach().cpu().numpy())

data = np.asarray(data)
numpyTestData = data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = standard_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = standard_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
9276


In [33]:
cw_attack = torchattacks.CW(standard_model, c=5, steps=500)

In [34]:
# Now do the same on adversarial data check if there are any adversarial samples
# Use a pretty progress bar to show updates
data = []

for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = cw_attack(
        images,
        labels,
    )

    for perturbed_image in perturbed_images:
        data.append(perturbed_image.detach().cpu().numpy())

data = np.asarray(data)
numpyTestData = data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = standard_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = standard_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
9264


## Applying PCA on a trained PGD model

In [17]:
from sklearn.decomposition import PCA

# Copy the MNIST data and then fit using PCA
# First convert to numpy arrays (and make it float)
numpyTrainingData = trainSetLoader.dataset.data.astype("float32")
# Note you also need to reshape the input data for your sanity
reshapedNumpyTrainingData = numpyTrainingData.reshape(
    (len(numpyTrainingData), 32 * 32 * 3)
)

# Then perform PCA on training data to get principal components
# Note it should reflect dimension of image, i.e. 28 * 28
pca = PCA(n_components=32 * 32 * 3).fit(reshapedNumpyTrainingData)

In [18]:
# Now on clean data check if there are any adversarial samples
numpyTestData = testSetLoader.dataset.data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = pgd_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = pgd_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
0


In [19]:
# Now do the same on adversarial data check if there are any adversarial samples
# Use a pretty progress bar to show updates
data = []

for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = fgsm.fgsm_attack(
        images,
        labels,
        pgd_model,
        loss_function,
        epsilon=0.75,
        alpha=None,
        scale=True,
        iterations=None,
    )

    for perturbed_image in perturbed_images:
        data.append(perturbed_image.detach().cpu().numpy())

data = np.asarray(data)
numpyTestData = data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = pgd_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = pgd_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
10000


In [20]:
# Now do the same on adversarial data check if there are any adversarial samples
# Use a pretty progress bar to show updates
data = []

for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = pgd.pgd_attack(
        images,
        labels,
        pgd_model,
        loss_function,
        epsilon=0.75,
        alpha=(2 / 255),
        scale=True,
        iterations=20,
    )

    for perturbed_image in perturbed_images:
        data.append(perturbed_image.detach().cpu().numpy())

data = np.asarray(data)
numpyTestData = data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = pgd_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = pgd_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
10000


In [21]:
# Now do the same on adversarial data check if there are any adversarial samples
# Use a pretty progress bar to show updates
data = []

for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = pgd.pgd_attack(
        images,
        labels,
        pgd_model,
        loss_function,
        epsilon=0.15,
        alpha=(2 / 255),
        scale=True,
        iterations=20,
    )

    for perturbed_image in perturbed_images:
        data.append(perturbed_image.detach().cpu().numpy())

data = np.asarray(data)
numpyTestData = data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = pgd_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = pgd_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 15
Trial 16
Trial 17
Trial 18
Trial 19
Trial 20
Trial 21
Trial 22
Trial 23
Trial 24
10000


In [23]:
# Now do the same on adversarial data check if there are any adversarial samples
# Use a pretty progress bar to show updates
data = []

for j, (images, labels) in enumerate(
    tqdm(testSetLoader, desc="Testing Progress", leave=False)
):
    # Cast to proper tensor
    images, labels = images.to(device), labels.to(device)

    # Perturb the images using the attack
    perturbed_images = pgd.pgd_attack(
        images,
        labels,
        pgd_model,
        loss_function,
        epsilon=0.05,
        alpha=(2 / 255),
        scale=True,
        iterations=20,
    )

    for perturbed_image in perturbed_images:
        data.append(perturbed_image.detach().cpu().numpy())

data = np.asarray(data)
numpyTestData = data.astype("float32")
reshapedNumpyTestData = numpyTestData.reshape((len(numpyTestData), 32 * 32 * 3))

# Original predictions on data
predictions_base = np.zeros((len(numpyTestData), ))

print("Original prediction...")
for index in range(len(numpyTestData)):
    testTensor = torch.from_numpy(np.reshape(numpyTestData[index], (1, 3, 32, 32))).to(
        device
    )

    with torch.no_grad():
        logits = pgd_model(testTensor).detach().cpu().numpy()

    predictions_base[index] = np.argmax(logits)
print("Done")

# Transform clean data along principal components
transformedTestData = pca.transform(reshapedNumpyTestData)

# Decides how many of the least significant coefficients (of components) to perturb
num_components = 1000

# How many trials to run
num_trials = 25

# Track results
result = np.zeros(len(numpyTestData), dtype=int)

# Actual attempts
for trial in range(num_trials):
    print("Trial {}".format(trial))
    random_noise = np.random.standard_normal(size=num_components)

    # Copy the data
    transformedTestDataNoisy = np.copy(transformedTestData)

    # Update the components with the right data
    for index in range(len(numpyTestData)):
        transformedTestDataNoisy[index][(32 * 32 * 3 - num_components) :] += (
            10 * random_noise
        )

    # Now calculate the inverse using PCA and the noise
    inverseTestDataNoisy = pca.inverse_transform(transformedTestDataNoisy)

    # Reshape into image
    testDataNoisy = np.reshape(inverseTestDataNoisy, (len(numpyTestData), 3, 32, 32))

    # Modified predictions on data
    predictions_modified = np.zeros((len(numpyTestData), ))

    for index in range(len(testDataNoisy)):
        testTensor = torch.from_numpy(
            np.reshape(testDataNoisy[index], (1, 3, 32, 32))
        ).to(device)

        with torch.no_grad():
            logits = pgd_model(testTensor).detach().cpu().numpy()

        predictions_modified[index] = np.argmax(logits)

    check = np.not_equal(predictions_modified, predictions_base)
    result = np.logical_or(check, result)

# Printing
print(np.sum(result))

Testing Progress:   0%|          | 0/313 [00:00<?, ?it/s]

Original prediction...
Done
Trial 0
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7


KeyboardInterrupt: 