# Demo : Crafting ensemble adversarial attacks & testing their transferability
### Machine Learning Security Project 2 - Jules COOPER, Leonie MAIER

**Project instructions:**
Consider 3 models from RobustBench (CIFAR10, L-inf) and craft universal (and untargeted) adversarial examples aimed at fooling all 3 models at the same time. Then evaluate the transferability of these adversarial examples to 7 other models.

In [None]:
try:
    import secml
    import foolbox
except ImportError:
    %pip install git+https://github.com/pralab/secml
    %pip install foolbox

The RobustBench models (Linf, CIFAR-10) used to craft the attack are:

| Model name | Model ID | Clean Accuracy  | Robust Accuracy  |  Architecture  |
|----|----|----|----|---|
| secml_model1 | Ding2020MMA | 84.36% | 41.44% | WideResNet-28-4 |
| secml_model2 | Wong2020Fast | 83.34% | 43.21% | PreActResNet-18 |
| secml_model3 | Andriushchenko2020Understanding | 79.84% | 43.93% | PreActResNet-18 |

In [None]:
from secml.data.loader.c_dataloader_cifar import CDataLoaderCIFAR10
from secml.ml.classifiers import CClassifierPyTorch
from secml.ml.features.normalization import CNormalizerMinMax


# Load CIFAR-10 and fit a min-max normalizer on the training samples
train_ds, test_ds = CDataLoaderCIFAR10().load()
dataset_labels = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]
normalizer = CNormalizerMinMax().fit(train_ds.X)

from robustbench.utils import load_model

# The three RobustBench (CIFAR-10, Linf) networks the attack is crafted against
model1 = load_model(model_name='Ding2020MMA', dataset='cifar10', threat_model='Linf')
model2 = load_model(model_name='Wong2020Fast', dataset='cifar10', threat_model='Linf')
model3 = load_model(model_name='Andriushchenko2020Understanding', dataset='cifar10', threat_model='Linf')

# Wrap each network in a secml classifier with identical settings
secml_model1, secml_model2, secml_model3 = (
    CClassifierPyTorch(net, input_shape=(3,32,32), pretrained=True)
    for net in (model1, model2, model3)
)

In [None]:
from secml.ml.classifiers.loss import CLossCrossEntropy
from secml.array import CArray


def pgd_linf_mult_untargeted(x, y, models, eps, alpha, steps):
    """Run an untargeted L-inf PGD attack against one or more models at once.

    At every step the input-gradient of the cross-entropy loss is computed for
    each model, the per-model gradients are averaged, and a signed step of
    size ``alpha`` is taken. The perturbation is then clipped to
    ``[-eps, eps]`` and the resulting point to ``[0, 1]``.

    Args:
        x: starting sample; it is deep-copied, not modified. Presumably a
            secml CArray (it must support ``deepcopy`` and ``clip``).
        y: true label of ``x``.
        models: non-empty sequence of classifiers exposing
            ``softmax_outputs``, ``decision_function``, ``grad_f_x`` and
            ``predict``.
        eps: maximum absolute perturbation per feature.
        alpha: step size of each iteration.
        steps: number of iterations.

    Returns:
        A tuple ``(x_adv, predict)``: the perturbed sample and a list with the
        prediction of each model on it, in the same order as ``models``.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if not models:
        raise ValueError("models must contain at least one classifier")

    # Remember every model's output mode so it can be put back afterwards
    # instead of being forced to False.
    previous_softmax = [clf.softmax_outputs for clf in models]

    # Using cross entropy as loss function
    loss_func = CLossCrossEntropy()
    x_adv = x.deepcopy()

    try:
        # we need the gradient of the softmax
        for clf in models:
            clf.softmax_outputs = True

        for i in range(steps):
            # Follow progression of attack
            if i % 10 == 0:
                print((i/steps)*100,"%")

            # Calculate scores and gradients of each model
            gradients = []
            for clf in models:
                scores = clf.decision_function(x_adv)

                # derivative of the loss with respect to the model scores
                loss_gradient = loss_func.dloss(y_true=y, score=scores)
                # derivative of the model output with respect to the input
                clf_gradient = clf.grad_f_x(x_adv, y)

                # chain rule: derivative of the loss with respect to the input
                gradients.append(clf_gradient * loss_gradient)

            gradient = gradients[0]

            # If multiple models, calculate the mean gradient
            if len(gradients) > 1:
                gradient = CArray([arr.tondarray() for arr in gradients])
                gradient = gradient.mean(axis=0)

            # Only the direction of the (mean) gradient is used
            gradient = gradient.sign()

            # make step
            x_adv = x_adv + alpha * gradient

            # project inside epsilon-ball: for Linf, clipping the
            # perturbation to the epsilon boundaries is enough
            delta = (x_adv - x).clip(-eps, eps)
            x_adv = x + delta

            # force input bounds
            x_adv = x_adv.clip(0, 1)
    finally:
        # Restore the output mode even if the attack raised
        for clf, previous in zip(models, previous_softmax):
            clf.softmax_outputs = previous

    # Prediction of each classifier on the final adversarial point
    predict = [clf.predict(x_adv) for clf in models]

    return x_adv, predict

In [None]:
import random

def multiple_runs(max_iterations, models, eps, alpha, steps):
    """Attack random test images until one fools every model or attempts run out.

    Each attempt picks a random image from ``test_ds``, normalizes it with
    ``normalizer`` and runs ``pgd_linf_mult_untargeted`` on it. An attempt is
    successful when no model in ``models`` still predicts the true label.

    Args:
        max_iterations: maximum number of images to try (must be >= 1).
        models: classifiers attacked simultaneously.
        eps: maximum absolute perturbation per feature.
        alpha: step size of each PGD iteration.
        steps: number of PGD iterations per attempt.

    Returns:
        A tuple ``(x_adv, i)``: the adversarial point of the last attempt and
        the test-set index of the image it was crafted from. This is the first
        successful attack if any attempt succeeded; otherwise it is simply the
        final (failed) attempt.

    Raises:
        ValueError: if ``max_iterations`` is smaller than 1.
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    # Number of test images; assumes test_ds.X has one row per sample
    n_samples = test_ds.X.shape[0]

    adversarial = False
    for n in range(max_iterations):
        # Choosing a random image from the test set. randrange excludes the
        # upper bound, so the index is always valid.
        i = random.randrange(n_samples)
        pt = test_ds[i, :]
        x0, y0 = pt.X, pt.Y

        # Normalizing the input
        x0 = normalizer.transform(x0)

        print(f"attack_number {n} on image {i}")
        print(f"Starting point has label: {dataset_labels[y0.item()]}")

        # Run one (combined) attack
        x_adv, y_advs = pgd_linf_mult_untargeted(x0, y0, models, eps, alpha, steps)

        adversarial = True
        for y_adv in y_advs:
            print(f"Adversarial point has label: {dataset_labels[y_adv.item()]}")
            # If at least one model still predicts real label after attack, unsuccessful attack
            if y_adv.item() == y0.item():
                adversarial = False

        if adversarial:
            break

    if not adversarial:
        print(f"No attack fooled all models within {max_iterations} attempts; returning the last attempt")

    return x_adv, i

    

In [None]:
# Attack hyper-parameters
eps = 8/255      # Standard Linf budget
alpha = eps/2    # step size: half of the budget per iteration
steps = 100      # High number of steps to try for best accuracy

# Craft one adversarial example against the three source models together,
# trying up to 20 random test images
x_adv, image_index = multiple_runs(
    max_iterations=20,
    models=[secml_model1, secml_model2, secml_model3],
    eps=eps,
    alpha=alpha,
    steps=steps,
)

## Transferability

In [None]:
# RobustBench (CIFAR-10, Linf) models the adversarial example is transferred to
transfer_models_name = [
    "Amini2024MeanSparse_Ra_WRN_70_16",
    "Gowal2021Improving_70_16_ddpm_100m",
    "Cui2023Decoupled_WRN-28-10",
    "Wang2023Better_WRN-28-10",
    "Rebuffi2021Fixing_106_16_cutmix_ddpm",
    "Huang2022Revisiting_WRN-A4",
    "Kang2021Stable",
]

# Load each model and collect its secml wrapper in test_models
test_models = []
for model_name_t in transfer_models_name:
    model = load_model(threat_model='Linf', dataset='cifar10', model_name=model_name_t)
    secml_model = CClassifierPyTorch(model, pretrained=True, input_shape=(3,32,32))
    test_models.append(secml_model)

In [None]:
# Original (clean) sample the adversarial example was crafted from
pt = test_ds[image_index, :]
x0, y0 = pt.X, pt.Y

# Per-model counters (one slot per transfer model) and global counters
n_test_models = len(test_models)
success_models = [0] * n_test_models
total_local_models = [0] * n_test_models
total_run = 0
number_success = 0

# run the prediction with the adversarial example
for i, transfer_clf in enumerate(test_models):
    y_pred = transfer_clf.predict(x_adv)

    # The attack transfers when the model no longer outputs the true label
    if y_pred.item() != y0.item():
        success_models[i] += 1
        number_success += 1

    total_run += 1
    total_local_models[i] += 1

print(f"{total_local_models[0]} attacks were transfered other {len(test_models)} models")
print(f"Within the {total_run} attacks, {number_success} succeeded: accuracy {number_success/total_run*100}%")
print("Individual model statistics")


# One line per transfer model telling whether it was fooled
for i in range(n_test_models):
    print(
        f"\t transferability attack successful on model {i}"
        if success_models[i]
        else f"\t transferability attack unsuccessful on model {i}"
    )