diff --git a/README.rst b/README.rst
index badb755a..aef389b5 100644
--- a/README.rst
+++ b/README.rst
@@ -26,19 +26,16 @@ Installation
 
 .. code-block:: bash
 
-   # Foolbox 1.8
+   # Foolbox 2.0
    pip install foolbox
-
-   # Foolbox 2.0 release candidate
-   pip install foolbox --pre
 
-Foolbox requires Python 3.5 or newer (since Foolbox 2.0).
+Foolbox 2.0 requires Python 3.5 or newer.
 
 Documentation
 -------------
 
 Documentation for the `latest stable version `_ as well as
-`pre-release versions `_ is available on ReadTheDocs.
+`pre-release versions `_ is available on ReadTheDocs.
 
 Our paper describing Foolbox is on arXiv: https://arxiv.org/abs/1707.04131
 
@@ -48,24 +45,55 @@ Example
 
 .. code-block:: python
 
    import foolbox
-   import keras
    import numpy as np
-   from keras.applications.resnet50 import ResNet50
+   import torchvision.models as models
 
-   # instantiate model
-   keras.backend.set_learning_phase(0)
-   kmodel = ResNet50(weights='imagenet')
-   preprocessing = (np.array([104, 116, 123]), 1)
-   fmodel = foolbox.models.KerasModel(kmodel, bounds=(0, 255), preprocessing=preprocessing)
+   # instantiate model (supports PyTorch, Keras, TensorFlow (Graph and Eager), MXNet and many more)
+   model = models.resnet18(pretrained=True).eval()
+   preprocessing = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], axis=-3)
+   fmodel = foolbox.models.PyTorchModel(model, bounds=(0, 1), num_classes=1000, preprocessing=preprocessing)
 
-   # get source image and label
-   image, label = foolbox.utils.imagenet_example()
+   # get a batch of images and labels and print the accuracy
+   images, labels = foolbox.utils.samples(dataset='imagenet', batchsize=16, data_format='channels_first', bounds=(0, 1))
+   print(np.mean(fmodel.forward(images).argmax(axis=-1) == labels))
+   # -> 0.9375
 
-   # apply attack on source image
-   # ::-1 reverses the color channels, because Keras ResNet50 expects BGR instead of RGB
+   # apply the attack
    attack = foolbox.attacks.FGSM(fmodel)
-   adversarial = attack(image[:, :, ::-1], label)
-   # if the attack fails, adversarial will be None and a warning will be printed
+   adversarials = attack(images, labels)
+   # if the i'th image is misclassified without a perturbation, then adversarials[i] will be the same as images[i]
+   # if the attack fails to find an adversarial for the i'th image, then all values in adversarials[i] will be np.nan
+
+   # Foolbox guarantees that all returned adversarials are in fact adversarial
+   print(np.mean(fmodel.forward(adversarials).argmax(axis=-1) == labels))
+   # -> 0.0
+
+
+.. code-block:: python
+
+   # In rare cases it can happen that attacks return adversarials that are so close to the decision boundary
+   # that they might actually end up on the other (correct) side if you pass them through the model again, as
+   # above, to get the adversarial class. This is because models are not numerically deterministic (on GPU, some
+   # operations such as `sum` are non-deterministic by default) and not independent between samples (an input might
+   # be classified differently depending on the other inputs in the same batch).
+
+   # You can always get the actual adversarial class that Foolbox observed for each sample by
+   # passing `unpack=False` to get the actual `Adversarial` objects:
+   attack = foolbox.attacks.FGSM(fmodel, distance=foolbox.distances.Linf)
+   adversarials = attack(images, labels, unpack=False)
+
+   adversarial_classes = np.asarray([a.adversarial_class for a in adversarials])
+   print(labels)
+   print(adversarial_classes)
+   print(np.mean(adversarial_classes == labels))  # will always be 0.0
+
+   # The `Adversarial` objects also provide a `distance` attribute. Note that the distances
+   # can be 0 (misclassified without perturbation) or inf (attack failed).
+   distances = np.asarray([a.distance.value for a in adversarials])
+   print("{:.1e}, {:.1e}, {:.1e}".format(distances.min(), np.median(distances), distances.max()))
+   print("{} of {} attacks failed".format(sum(adv.distance.value == np.inf for adv in adversarials), len(adversarials)))
+   print("{} of {} inputs misclassified without perturbation".format(sum(adv.distance.value == 0 for adv in adversarials), len(adversarials)))
+
 
 For more examples, have a look at the `documentation `__.
 
@@ -78,26 +106,34 @@ Finally, the result can be plotted like this:
 
    import matplotlib.pyplot as plt
 
+   image = images[0]
+   adversarial = attack(images[:1], labels[:1])[0]
+
+   # CHW to HWC
+   image = image.transpose(1, 2, 0)
+   adversarial = adversarial.transpose(1, 2, 0)
+
    plt.figure()
 
    plt.subplot(1, 3, 1)
    plt.title('Original')
-   plt.imshow(image / 255)  # division by 255 to convert [0, 255] to [0, 1]
+   plt.imshow(image)
    plt.axis('off')
 
    plt.subplot(1, 3, 2)
    plt.title('Adversarial')
-   plt.imshow(adversarial[:, :, ::-1] / 255)  # ::-1 to convert BGR to RGB
+   plt.imshow(adversarial)
    plt.axis('off')
 
    plt.subplot(1, 3, 3)
    plt.title('Difference')
-   difference = adversarial[:, :, ::-1] - image
+   difference = adversarial - image
    plt.imshow(difference / abs(difference).max() * 0.2 + 0.5)
    plt.axis('off')
 
    plt.show()
 
+
 .. image:: https://github.com/bethgelab/foolbox/raw/master/example.png
 
@@ -111,7 +147,7 @@ PyTorch, Theano, Lasagne and MXNet are available, e.g.
    model = foolbox.models.PyTorchModel(torchmodel, bounds=(0, 255), num_classes=1000)
    # etc.
 
-Different adversarial criteria such as Top-k, specific target classes or target probability 
+Different adversarial criteria such as Top-k, specific target classes or target probability
 values for the original class or the target class can be passed to the attack, e.g.
 
 .. code-block:: python
diff --git a/docs/modules/adversarial.rst b/docs/modules/adversarial.rst
index e81ce7d6..90356213 100644
--- a/docs/modules/adversarial.rst
+++ b/docs/modules/adversarial.rst
@@ -5,3 +5,4 @@
 
 .. autoclass:: Adversarial
    :members:
+   :inherited-members:
diff --git a/docs/user/adversarial.rst b/docs/user/adversarial.rst
index 8a36e5eb..f31c56ec 100644
--- a/docs/user/adversarial.rst
+++ b/docs/user/adversarial.rst
@@ -4,6 +4,13 @@ Advanced
 
 The :class:`Adversarial` class provides an advanced way to specify the adversarial example that should be found by an attack and provides detailed information about the created adversarial. In addition, it provides a way to improve a previously found adversarial example by re-running an attack.
 
+.. code-block:: python3
+
+   from foolbox.v1 import Adversarial
+   from foolbox.v1.attacks import LBFGSAttack
+   from foolbox.models import TensorFlowModel
+   from foolbox.criteria import TargetClassProbability
+
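+A rough sketch of the explicit workflow these imports enable might look as follows; here `images` and `logits` are assumed to be the input and logits tensors of an existing TensorFlow network, `image` and `label` a single input and its correct class, and the target class 22 is arbitrary:
+
+.. code-block:: python3
+
+   model = TensorFlowModel(images, logits, bounds=(0, 255))
+   criterion = TargetClassProbability(22, p=0.99)
+
+   # instead of passing model and criterion to the attack, wrap them
+   # together with the input in an Adversarial instance
+   adversarial = Adversarial(model, criterion, image, label)
+
+   attack = LBFGSAttack()
+   attack(adversarial)
+   # the result can afterwards be read off the `adversarial` object
+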
 Implicit
 ========
 
diff --git a/docs/user/development.rst b/docs/user/development.rst
index 5fa3d5ff..d527b715 100644
--- a/docs/user/development.rst
+++ b/docs/user/development.rst
@@ -27,3 +27,5 @@ New Adversarial Attacks
 Foolbox makes it easy to develop new adversarial attacks that can be applied to arbitrary models.
 
 To implement an attack, simply subclass the :class:`Attack` class, implement the :meth:`__call__` method and decorate it with the :decorator:`call_decorator`. The :decorator:`call_decorator` will make sure that your :meth:`__call__` implementation will be called with an instance of the :class:`Adversarial` class. You can use this instance to ask for model predictions and gradients, get the original image and its label and more. In addition, the :class:`Adversarial` instance automatically keeps track of the best adversarial amongst all the inputs tested by the attack. That way, the implementation of the attack can focus on the attack logic.
+
+To implement an attack that can make use of the batch support introduced in Foolbox 2.0, implement the :meth:`as_generator` method and decorate it with the :decorator:`generator_decorator`. All model calls using the :class:`Adversarial` object should use ``yield``.
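+
+A bare-bones sketch of such an attack could look roughly like this; the noise-based attack logic is purely illustrative, and the helper names (``unperturbed``, ``bounds``, ``forward_one`` and the import from ``foolbox.attacks.base``) are assumed to follow the Foolbox 2.0 API:
+
+.. code-block:: python3
+
+   import numpy as np
+   from foolbox.attacks.base import Attack, generator_decorator
+
+   class AdditiveUniformNoiseSketch(Attack):
+       """Illustrative attack: try increasingly large uniform noise."""
+
+       @generator_decorator
+       def as_generator(self, a, epsilons=100):
+           x = a.unperturbed
+           min_, max_ = a.bounds()
+           for epsilon in np.linspace(0, 1, num=epsilons + 1)[1:]:
+               noise = np.random.uniform(-epsilon, epsilon, size=x.shape).astype(x.dtype)
+               perturbed = np.clip(x + noise * (max_ - min_), min_, max_)
+               # every model call goes through the Adversarial object and is yielded
+               _, is_adversarial = yield from a.forward_one(perturbed)
+               if is_adversarial:
+                   return
+
+Because every model call is yielded, Foolbox can interleave the generators of many inputs and batch the underlying model evaluations.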
diff --git a/docs/user/examples.rst b/docs/user/examples.rst
index c6e704ea..61c60039 100644
--- a/docs/user/examples.rst
+++ b/docs/user/examples.rst
@@ -4,6 +4,90 @@ Examples
 
 Here you can find a collection of examples how Foolbox models can be created using different deep learning frameworks and some full-blown attack examples at the end.
 
+Running an attack
+=================
+
+Running a batch attack against a PyTorch model
+----------------------------------------------
+
+.. code-block:: python3
+
+   import foolbox
+   import numpy as np
+   import torchvision.models as models
+
+   # instantiate model (supports PyTorch, Keras, TensorFlow (Graph and Eager), MXNet and many more)
+   model = models.resnet18(pretrained=True).eval()
+   preprocessing = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], axis=-3)
+   fmodel = foolbox.models.PyTorchModel(model, bounds=(0, 1), num_classes=1000, preprocessing=preprocessing)
+
+   # get a batch of images and labels and print the accuracy
+   images, labels = foolbox.utils.samples(dataset='imagenet', batchsize=16, data_format='channels_first', bounds=(0, 1))
+   print(np.mean(fmodel.forward(images).argmax(axis=-1) == labels))
+   # -> 0.9375
+
+   # apply the attack
+   attack = foolbox.attacks.FGSM(fmodel)
+   adversarials = attack(images, labels)
+   # if the i'th image is misclassified without a perturbation, then adversarials[i] will be the same as images[i]
+   # if the attack fails to find an adversarial for the i'th image, then all values in adversarials[i] will be np.nan
+
+   # Foolbox guarantees that all returned adversarials are in fact adversarial
+   print(np.mean(fmodel.forward(adversarials).argmax(axis=-1) == labels))
+   # -> 0.0
+
+   # ---
+
+   # In rare cases it can happen that attacks return adversarials that are so close to the decision boundary
+   # that they might actually end up on the other (correct) side if you pass them through the model again, as
+   # above, to get the adversarial class. This is because models are not numerically deterministic (on GPU, some
+   # operations such as `sum` are non-deterministic by default) and not independent between samples (an input might
+   # be classified differently depending on the other inputs in the same batch).
+
+   # You can always get the actual adversarial class that Foolbox observed for each sample by
+   # passing `unpack=False` to get the actual `Adversarial` objects:
+   attack = foolbox.attacks.FGSM(fmodel, distance=foolbox.distances.Linf)
+   adversarials = attack(images, labels, unpack=False)
+
+   adversarial_classes = np.asarray([a.adversarial_class for a in adversarials])
+   print(labels)
+   print(adversarial_classes)
+   print(np.mean(adversarial_classes == labels))  # will always be 0.0
+
+   # The `Adversarial` objects also provide a `distance` attribute. Note that the distances
+   # can be 0 (misclassified without perturbation) or inf (attack failed).
+   distances = np.asarray([a.distance.value for a in adversarials])
+   print("{:.1e}, {:.1e}, {:.1e}".format(distances.min(), np.median(distances), distances.max()))
+   print("{} of {} attacks failed".format(sum(adv.distance.value == np.inf for adv in adversarials), len(adversarials)))
+   print("{} of {} inputs misclassified without perturbation".format(sum(adv.distance.value == 0 for adv in adversarials), len(adversarials)))
+
+
+Running an attack on a single sample against a Keras model
+-----------------------------------------------------------
+
+.. code-block:: python3
+
+   import foolbox
+   import keras
+   import numpy as np
+   from keras.applications.resnet50 import ResNet50
+
+   # instantiate model
+   keras.backend.set_learning_phase(0)
+   kmodel = ResNet50(weights='imagenet')
+   preprocessing = (np.array([104, 116, 123]), 1)
+   fmodel = foolbox.models.KerasModel(kmodel, bounds=(0, 255), preprocessing=preprocessing)
+
+   # get source image and label
+   image, label = foolbox.utils.imagenet_example()
+
+   # apply attack on source image
+   # ::-1 reverses the color channels, because Keras ResNet50 expects BGR instead of RGB
+   attack = foolbox.v1.attacks.FGSM(fmodel)
+   adversarial = attack(image[:, :, ::-1], label)
+   # if the attack fails, adversarial will be None and a warning will be printed
+
+
 Creating a model
 ================
 
@@ -136,7 +220,7 @@ FGSM (GradientSignAttack)
 
    image, label = foolbox.utils.imagenet_example()
 
    # apply attack on source image
-   attack = foolbox.attacks.FGSM(fmodel)
+   attack = foolbox.v1.attacks.FGSM(fmodel)
    adversarial = attack(image[:,:,::-1], label)
 
@@ -167,7 +251,7 @@ Creating an untargeted adversarial for a PyTorch model
    print('predicted class', np.argmax(fmodel.forward_one(image)))
 
    # apply attack on source image
-   attack = foolbox.attacks.FGSM(fmodel)
+   attack = foolbox.v1.attacks.FGSM(fmodel)
    adversarial = attack(image, label)
 
    print('adversarial class', np.argmax(fmodel.forward_one(adversarial)))
diff --git a/docs/user/tutorial.rst b/docs/user/tutorial.rst
index b7fec492..78efb3fc 100644
--- a/docs/user/tutorial.rst
+++ b/docs/user/tutorial.rst
@@ -49,10 +49,7 @@ Finally, we can create and apply the attack:
 
    from foolbox.attacks import LBFGSAttack
 
    attack = LBFGSAttack(model, criterion)
-
-   image = np.asarray(Image.open('example.jpg'))
-   label = np.argmax(model.forward_one(image))
-
+   images, labels = foolbox.utils.samples(dataset='imagenet', batchsize=16, data_format='channels_last', bounds=(0, 255))
 
    adversarial = attack(image, label=label)
diff --git a/foolbox/VERSION b/foolbox/VERSION
index 737f8e1e..227cea21 100644
--- a/foolbox/VERSION
+++ b/foolbox/VERSION
@@ -1 +1 @@
-2.0.0rc0
+2.0.0