# SPML HW3: Breaking Defenses & Black-Box Attacks

In [1]:
name = 'Amir Mohammad Ezzati'
std_id = '402212269'

In [2]:
import torch
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader

from torchvision import transforms
from torchvision.models import resnet18, mobilenet_v2
from torchvision.datasets.cifar import CIFAR10

from tqdm import trange, tqdm

torch.manual_seed(0)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
from google.colab import drive
drive.mount('/content/drive')
base_path = '/content/drive/MyDrive/SPML/'

Mounted at /content/drive


# CIFAR10 Dataset (5 points)

In [4]:
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)
batch_size = 128

mu = torch.tensor(norm_mean).view(3,1,1).to(device)
std = torch.tensor(norm_std).view(3,1,1).to(device)

# TODO: Set the upper limit and lower limit possible for images
upper_limit = ((1 - mu) / std).view(3, 1, 1)
lower_limit = ((0 - mu) / std).view(3, 1, 1)

transform_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std)
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std)
])

trainset = CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

testset = CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)


classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:05<00:00, 31.2MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [5]:
len(trainset), len(testset)

(50000, 10000)

# Defensive Distillation (25 points)

[Defensive distillation](https://arxiv.org/abs/1511.04508) proceeds in four steps:

1.   **Train the teacher network**, by setting the temperature of the softmax to T during the
training phase.
2.   **Compute soft labels** by apply the teacher network to each instance in the training set, again evaluating the softmax at temperature T.
3.  **Train the distilled network** (a network with the same shape as the teacher network) on the soft labels, using softmax at temperature T.
4.  Finally, when running the distilled network at test time to classify new inputs, use temperature 1.



## Train the teacher

In [None]:
def train_step(model, dataloader, loss_fn, optimizer, temperature):
    # TODO: Return loss and accuracy for each epoch
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for inputs, targets in tqdm(dataloader, desc="Training", leave=False):
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = model(inputs)
        outputs = outputs / temperature  # Apply temperature scaling to logits

        loss = loss_fn(outputs, targets)
        running_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = correct / total
    return epoch_loss, epoch_acc


def train_teacher(model, n_epochs, loader=trainloader, temp=100):
    # TODO: Log the accuracy and loss for each epoch
    model.to(device)
    optimizer = Adam(model.parameters(), lr=1e-3)
    loss_fn = CrossEntropyLoss()

    for epoch in range(n_epochs):
        loss, acc = train_step(model, loader, loss_fn, optimizer, temp)
        print(f"Epoch {epoch + 1}/{n_epochs} - Loss: {loss:.4f}, Accuracy: {acc:.4f}")


You can use a pre-trained resnet to speed up the training process.



In [None]:
teacher = resnet18(pretrained=True)
teacher.fc = nn.Linear(teacher.fc.in_features, 10)

train_teacher(teacher, 15)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 164MB/s]


Epoch 1/15 - Loss: 1.3704, Accuracy: 0.6797




Epoch 2/15 - Loss: 0.6588, Accuracy: 0.7917




Epoch 3/15 - Loss: 0.4649, Accuracy: 0.8507




Epoch 4/15 - Loss: 0.3493, Accuracy: 0.8882




Epoch 5/15 - Loss: 0.2555, Accuracy: 0.9179




Epoch 6/15 - Loss: 0.1972, Accuracy: 0.9368




Epoch 7/15 - Loss: 0.1439, Accuracy: 0.9554




Epoch 8/15 - Loss: 0.1172, Accuracy: 0.9625




Epoch 9/15 - Loss: 0.1002, Accuracy: 0.9684




Epoch 10/15 - Loss: 0.0790, Accuracy: 0.9743




Epoch 11/15 - Loss: 0.0724, Accuracy: 0.9768




Epoch 12/15 - Loss: 0.0650, Accuracy: 0.9789




Epoch 13/15 - Loss: 0.0576, Accuracy: 0.9815




Epoch 14/15 - Loss: 0.0496, Accuracy: 0.9842


                                                           

Epoch 15/15 - Loss: 0.0540, Accuracy: 0.9821




In [None]:
model_name = "teacher_pretrained_resnet18_15epochs.pth"
model_PATH = base_path + f"{model_name}"
torch.save(teacher.state_dict(), model_PATH)

## Test the teacher

In [None]:
teacher = resnet18(pretrained=False)
teacher.fc = nn.Linear(teacher.fc.in_features, 10)
teacher = teacher.to(device)
teacher.load_state_dict(torch.load(base_path + "teacher_pretrained_resnet18_15epochs.pth"))

  teacher.load_state_dict(torch.load(base_path + "teacher_pretrained_resnet18_15epochs.pth"))


<All keys matched successfully>

In [6]:
def test_clean(model, dataloader=testloader):
    # TODO: Return the clean accuracy of the model
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in tqdm(dataloader, desc="Testing", leave=False):
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)

            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    accuracy = 100.0 * correct / total
    return accuracy

Print the clean accuracy of the teacher.

In [None]:
print(f'Teacher Accuracy {test_clean(teacher):.2f}%')

                                                        

Teacher Accuracy 81.04%




## Train the student

In [None]:
def distill(model, teacher, dataloader, optimizer, T):
    # TODO: Get soft labels from teacher model
    # TODO: Get student model outputs
    # TODO: Compute the distillation loss
    # TODO: Return the accuracy (on real labels) and loss (on soft labels)

    model.train()
    teacher.eval()
    loss_fn = nn.KLDivLoss(reduction='batchmean')  # KL divergence for soft labels

    running_loss = 0
    correct = 0
    total = 0

    for inputs, targets in tqdm(dataloader, desc="Distillation Training", leave=False):
        inputs, targets = inputs.to(device), targets.to(device)

        with torch.no_grad():
            teacher_outputs = teacher(inputs) / T
        student_outputs = model(inputs) / T  # Apply temperature scaling to both logits

        teacher_probs = F.softmax(teacher_outputs, dim=1)
        student_log_probs = F.log_softmax(student_outputs, dim=1)

        distill_loss = loss_fn(student_log_probs, teacher_probs) * (T**2)

        optimizer.zero_grad()
        distill_loss.backward()
        optimizer.step()

        running_loss += distill_loss.item()

        _, predicted = (student_outputs * T).max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = correct / total
    return epoch_acc, epoch_loss


def train_student(model, teacher, n_epochs, loader=trainloader, temp=100):
    # TODO: Log the accuracy and loss for each epoch
    model.to(device)
    teacher.to(device)
    optimizer = Adam(model.parameters(), lr=1e-3)

    for epoch in range(n_epochs):
        acc, loss = distill(model, teacher, loader, optimizer, temp)
        print(f"Epoch {epoch + 1}/{n_epochs} - Loss: {loss:.4f}, Accuracy: {acc:.4f}")

This time use a `resnet18` without the pretrained weights.

In [None]:
student = resnet18(pretrained=False)
student.fc = nn.Linear(student.fc.in_features, 10)

train_student(student, teacher, 30)



Epoch 1/30 - Loss: 15363.2048, Accuracy: 0.4780




Epoch 2/30 - Loss: 9921.0126, Accuracy: 0.6413




Epoch 3/30 - Loss: 7770.4345, Accuracy: 0.7185




Epoch 4/30 - Loss: 6329.2704, Accuracy: 0.7675




Epoch 5/30 - Loss: 5238.1133, Accuracy: 0.8086




Epoch 6/30 - Loss: 4242.9113, Accuracy: 0.8449




Epoch 7/30 - Loss: 3384.1316, Accuracy: 0.8755




Epoch 8/30 - Loss: 2710.7089, Accuracy: 0.9000




Epoch 9/30 - Loss: 2119.9015, Accuracy: 0.9206




Epoch 10/30 - Loss: 1658.0158, Accuracy: 0.9375




Epoch 11/30 - Loss: 1360.9280, Accuracy: 0.9495




Epoch 12/30 - Loss: 1060.2816, Accuracy: 0.9601




Epoch 13/30 - Loss: 1006.6803, Accuracy: 0.9619




Epoch 14/30 - Loss: 915.4780, Accuracy: 0.9649




Epoch 15/30 - Loss: 869.4710, Accuracy: 0.9664




Epoch 16/30 - Loss: 796.1015, Accuracy: 0.9695




Epoch 17/30 - Loss: 695.7563, Accuracy: 0.9718




Epoch 18/30 - Loss: 697.4539, Accuracy: 0.9733




Epoch 19/30 - Loss: 644.4445, Accuracy: 0.9740




Epoch 20/30 - Loss: 605.4570, Accuracy: 0.9758




Epoch 21/30 - Loss: 636.2325, Accuracy: 0.9741




Epoch 22/30 - Loss: 587.2774, Accuracy: 0.9766




Epoch 23/30 - Loss: 587.2548, Accuracy: 0.9771




Epoch 24/30 - Loss: 540.9039, Accuracy: 0.9781




Epoch 25/30 - Loss: 554.4584, Accuracy: 0.9771




Epoch 26/30 - Loss: 547.2722, Accuracy: 0.9777




Epoch 27/30 - Loss: 531.4476, Accuracy: 0.9779




Epoch 28/30 - Loss: 505.4493, Accuracy: 0.9794




Epoch 29/30 - Loss: 433.5928, Accuracy: 0.9815


                                                                        

Epoch 30/30 - Loss: 485.8684, Accuracy: 0.9796




In [None]:
model_name = "student_fromscratch_resnet18_30epochs.pth"
model_PATH = base_path + f"{model_name}"
torch.save(student.state_dict(), model_PATH)

## Test the student

In [36]:
student = resnet18(pretrained=False)
student.fc = nn.Linear(student.fc.in_features, 10)
student = student.to(device)
student.load_state_dict(torch.load(base_path + "student_fromscratch_resnet18_30epochs.pth"))

  student.load_state_dict(torch.load(base_path + "student_fromscratch_resnet18_30epochs.pth"))


<All keys matched successfully>

In [37]:
print(f'Student Accuracy {test_clean(student):.2f}%')

                                                        

Student Accuracy 77.16%




# Attack (15 points)

Implement the FGSM attack and the `test_attack` funcion to report the robust accuracy for different values of epsilon.

In [24]:
def attack_fgsm(model, x, y, epsilon, temp):
    # TODO: Return perturbed input
    ce = CrossEntropyLoss()
    x_adv = x.clone().detach().requires_grad_(True)
    model.eval()

    with torch.enable_grad():
        outputs = model(x_adv) / temp
        loss_ = ce(outputs, y)
        loss_.backward()

        x_adv = x_adv + epsilon * x_adv.grad.sign()
        x_adv = torch.clamp(x_adv, lower_limit, upper_limit) # Clip to valid range

    return x_adv.detach()

def attack_pgd(model, x, y, epsilon, temp, alpha=0.2, num_iters=10):
    # TODO: Return perturbed input

    ce = CrossEntropyLoss()
    x_adv = x.clone().detach().to(device)
    model.eval()

    for _ in range(num_iters):
        x_adv.requires_grad_(True)
        # model.zero_grad()

        outputs = model(x_adv) / temp
        loss = ce(outputs, y)
        loss.backward()

        x_adv = x_adv + alpha * x_adv.grad.sign()

        # Project back to epsilon-ball around x
        perturbation = torch.clamp(x_adv - x, -epsilon, epsilon)
        x_adv = torch.clamp(x + perturbation, lower_limit, upper_limit).detach().requires_grad_(True)

    return x_adv.detach()


def test_attack(model, epsilon, temp=100, attack=attack_fgsm, loader=testloader):
    # TODO: Return the robust accuracy for FGSM or PGD
    model.eval()
    correct = 0
    total = 0

    for inputs, targets in tqdm(loader, desc=f"Testing attack (epsilon={epsilon})", leave=False):
        inputs, targets = inputs.to(device), targets.to(device)

        # Generate adversarial examples
        inputs_adv = attack(model, inputs, targets, epsilon, temp)

        # Evaluate the model on adversarial examples
        outputs = model(inputs_adv)
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    robust_accuracy = 100.0 * correct / total
    return robust_accuracy


Report the robust accuracy of the teacher for `ϵ = [1, 2, 4, 8, 16]`.

In [None]:
epsilons = [1, 2, 4, 8, 16]

for eps in epsilons:
    # TODO:
    acc = test_attack(teacher, eps/255, attack=attack_fgsm, temp=100)
    print(f'FGSM with ϵ={eps}/255 has Accuracy: {acc:.2f}%')

    acc = test_attack(teacher, eps/255, attack=attack_pgd, temp=100)
    print(f'PGD  with ϵ={eps}/255 has Accuracy: {acc:.2f}%')



FGSM with ϵ=1/255 has Accuracy: 74.53%




PGD  with ϵ=1/255 has Accuracy: 74.38%




FGSM with ϵ=2/255 has Accuracy: 68.32%




PGD  with ϵ=2/255 has Accuracy: 67.76%




FGSM with ϵ=4/255 has Accuracy: 55.76%




PGD  with ϵ=4/255 has Accuracy: 54.20%




FGSM with ϵ=8/255 has Accuracy: 38.68%




PGD  with ϵ=8/255 has Accuracy: 34.60%




FGSM with ϵ=16/255 has Accuracy: 20.72%


                                                                                             

PGD  with ϵ=16/255 has Accuracy: 15.06%




Do the same for the student:

In [None]:
for eps in epsilons:
    # TODO:
    acc = test_attack(student, eps/255, attack=attack_fgsm, temp=1)
    print(f'FGSM with ϵ={eps}/255 has Accuracy: {acc:.2f}%')

    acc = test_attack(student, eps/255, attack=attack_pgd, temp=1)
    print(f'PGD  with ϵ={eps}/255 has Accuracy: {acc:.2f}%')



FGSM with ϵ=1/255 has Accuracy: 73.98%




PGD  with ϵ=1/255 has Accuracy: 73.83%




FGSM with ϵ=2/255 has Accuracy: 73.62%




PGD  with ϵ=2/255 has Accuracy: 73.54%




FGSM with ϵ=4/255 has Accuracy: 73.56%




PGD  with ϵ=4/255 has Accuracy: 73.52%




FGSM with ϵ=8/255 has Accuracy: 73.55%




PGD  with ϵ=8/255 has Accuracy: 73.51%




FGSM with ϵ=16/255 has Accuracy: 73.53%


                                                                                             

PGD  with ϵ=16/255 has Accuracy: 73.51%




What do you see?

`your response:`

The teacher model's accuracy drops significantly as ϵ increases, showing vulnerability to adversarial attacks (e.g., 74% at ϵ=1/255 to ~15% at ϵ=16/255). PGD is slightly stronger than FGSM, highlighting its iterative effectiveness.

In contrast, the student model maintains stable accuracy (~73.5%) across all ϵ values, demonstrating strong robustness due to defensive distillation. This robustness arises because the student model, trained with a high temperature (T=100) and evaluated at T=1, exhibits higher confidence in its predictions. This high confidence causes gradients to approach zero, making adversarial perturbations (from FGSM and PGD) ineffective.

# Transferring Adversarial Examples (15 points)

Train yet another model to be used as the surrogate. (set temperature to 1)

In [None]:
model = resnet18(pretrained=False)
model.fc = nn.Linear(model.fc.in_features, 10)
model = model.to(device)

optimizer = Adam(model.parameters(), lr=1e-3)
criterion = CrossEntropyLoss()

def train_surrogate(model, dataloader, epochs=15):
    model.train()
    for epoch in range(epochs):
        running_loss, correct, total = 0, 0, 0
        for x, y in tqdm(dataloader):
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            pred = outputs.argmax(dim=1)
            correct += (pred == y).sum().item()
            total += y.size(0)

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss/len(dataloader):.4f}, Accuracy: {100 * correct / total:.2f}%")

train_surrogate(model, trainloader)

100%|██████████| 391/391 [00:22<00:00, 17.47it/s]


Epoch 1/15, Loss: 1.3603, Accuracy: 51.05%


100%|██████████| 391/391 [00:19<00:00, 19.91it/s]


Epoch 2/15, Loss: 0.9665, Accuracy: 65.78%


100%|██████████| 391/391 [00:17<00:00, 21.90it/s]


Epoch 3/15, Loss: 0.7888, Accuracy: 72.43%


100%|██████████| 391/391 [00:17<00:00, 21.85it/s]


Epoch 4/15, Loss: 0.6563, Accuracy: 77.11%


100%|██████████| 391/391 [00:19<00:00, 20.42it/s]


Epoch 5/15, Loss: 0.5556, Accuracy: 80.68%


100%|██████████| 391/391 [00:18<00:00, 21.40it/s]


Epoch 6/15, Loss: 0.4545, Accuracy: 84.03%


100%|██████████| 391/391 [00:19<00:00, 19.98it/s]


Epoch 7/15, Loss: 0.3795, Accuracy: 86.88%


100%|██████████| 391/391 [00:18<00:00, 21.68it/s]


Epoch 8/15, Loss: 0.3031, Accuracy: 89.34%


100%|██████████| 391/391 [00:19<00:00, 20.21it/s]


Epoch 9/15, Loss: 0.2442, Accuracy: 91.37%


100%|██████████| 391/391 [00:18<00:00, 21.66it/s]


Epoch 10/15, Loss: 0.1997, Accuracy: 92.96%


100%|██████████| 391/391 [00:18<00:00, 21.53it/s]


Epoch 11/15, Loss: 0.1669, Accuracy: 94.14%


100%|██████████| 391/391 [00:18<00:00, 20.81it/s]


Epoch 12/15, Loss: 0.1377, Accuracy: 95.12%


100%|██████████| 391/391 [00:19<00:00, 20.43it/s]


Epoch 13/15, Loss: 0.1157, Accuracy: 95.91%


100%|██████████| 391/391 [00:18<00:00, 21.30it/s]


Epoch 14/15, Loss: 0.1081, Accuracy: 96.24%


100%|██████████| 391/391 [00:18<00:00, 20.97it/s]

Epoch 15/15, Loss: 0.1000, Accuracy: 96.51%





In [None]:
model_name = "surrogate_model.pth"
model_PATH = base_path + f"{model_name}"
torch.save(model.state_dict(), model_PATH)

Print the surrogate accuracy.

In [None]:
model = resnet18(pretrained=False)
model.fc = nn.Linear(model.fc.in_features, 10)
model = model.to(device)
model.load_state_dict(torch.load(base_path + "surrogate_model.pth"))

  model.load_state_dict(torch.load(base_path + "surrogate_model.pth"))


<All keys matched successfully>

In [None]:
print(f'Model Accuracy {test_clean(model):.2f}%')

                                                        

Model Accuracy 76.13%




Report the accuracy of the surrogate for `ϵ = [1, 2, 4, 8, 16]`.

In [None]:
epsilons = [1, 2, 4, 8, 16]

for eps in epsilons:
    # TODO:
    acc = test_attack(model, eps/255, attack=attack_fgsm, temp=1)
    print(f'FGSM with ϵ={eps}/255 has Accuracy: {acc:.2f}%')

    acc = test_attack(model, eps/255, attack=attack_pgd, temp=1)
    print(f'PGD  with ϵ={eps}/255 has Accuracy: {acc:.2f}%')



FGSM with ϵ=1/255 has Accuracy: 71.90%




PGD  with ϵ=1/255 has Accuracy: 71.85%




FGSM with ϵ=2/255 has Accuracy: 67.54%




PGD  with ϵ=2/255 has Accuracy: 67.27%




FGSM with ϵ=4/255 has Accuracy: 58.95%




PGD  with ϵ=4/255 has Accuracy: 57.88%




FGSM with ϵ=8/255 has Accuracy: 43.40%




PGD  with ϵ=8/255 has Accuracy: 41.08%




FGSM with ϵ=16/255 has Accuracy: 22.81%


                                                                                             

PGD  with ϵ=16/255 has Accuracy: 19.55%




Implement the following functions to transfer attacks from a surrogate model to an oracle.

In [None]:
def transfer_attack(oracle, model, eps, loader=testloader):
    # TODO: Attack the model and report the accuracy of the oracle
    oracle.eval()
    model.eval()
    correct, total = 0, 0
    for x, y in tqdm(loader):
        x, y = x.to(device), y.to(device)

        # Generate adversarial examples using the surrogate
        x_adv = attack_fgsm(model, x, y, eps, temp=1)  # or use attack_pgd

        # Evaluate the oracle on these adversarial examples
        with torch.no_grad():
            outputs = oracle(x_adv)
            pred = outputs.argmax(dim=1)
            correct += (pred == y).sum().item()
            total += y.size(0)

    return 100 * correct / total

Transfer attacks for `ϵ = [1, 2, 4, 8, 16]` from your model to the student.

In [None]:
epsilons = [1, 2, 4, 8, 16]

for eps in epsilons:
    acc = transfer_attack(student, model, eps/255)
    print(f'FGSM with ϵ={eps}/255 has Accuracy: {acc:.2f}%')

100%|██████████| 79/79 [00:03<00:00, 21.76it/s]


FGSM with ϵ=1/255 has Accuracy: 76.38%


100%|██████████| 79/79 [00:04<00:00, 16.71it/s]


FGSM with ϵ=2/255 has Accuracy: 75.46%


100%|██████████| 79/79 [00:04<00:00, 18.53it/s]


FGSM with ϵ=4/255 has Accuracy: 73.91%


100%|██████████| 79/79 [00:03<00:00, 22.48it/s]


FGSM with ϵ=8/255 has Accuracy: 70.64%


100%|██████████| 79/79 [00:03<00:00, 22.71it/s]

FGSM with ϵ=16/255 has Accuracy: 63.14%





- What can be inferred from these results?

- How are the accuracies of the student and the surrogate under attack related?

- Does Defensive Distillation obfuscate the gradients? Why?


`your response:`

- The student model shows robustness to direct attacks but is vulnerable to surrogate-generated adversarial examples as the accuracy drops significantly with increasing ε. This result demonstrates that the student's robustness to adversarial examples is fake, as it fails to generalize against transferable adversarial attacks.

- Both the surrogate and student models' accuracies drop as ε increases, indicating that a more successful attack on the surrogate model is likely to transfer more effectively to the student model. However, the accuracy drop in the surrogate is more significant than in the student because the attack is applied directly to the surrogate, while the student model only experiences the transferred adversarial examples.

- Yes, defensive distillation obfuscates the gradients. The high confidence of the student model (due to T=100 during training and T=1 during test) results in smoother gradients, reducing the impact of direct FGSM and PGD attacks. However, adversarial examples generated from a surrogate exploit transferability, bypassing this obfuscation.

# ZOO Based Black-Box Attacks (25 points)

Based on [Black-box Adversarial Attacks with Limited Queries and Information](https://arxiv.org/abs/1804.08598) you must first calculate the estimate of the gradients, and next attack the model based on your estimates.

In [None]:
from torch.nn.functional import cross_entropy
def nes_gradient_estimate(model, x, y, epsilon, num_samples, sigma):
    # TODO: Return the estimated gradient
    grad_estimate = torch.zeros_like(x).to(x.device)
    model.eval()

    for _ in range(num_samples):
        u = torch.randn_like(x).to(x.device)
        perturbed_plus = torch.clamp(x + sigma * u, lower_limit, upper_limit)
        perturbed_minus = torch.clamp(x - sigma * u, lower_limit, upper_limit)

        with torch.no_grad():
            loss_plus = cross_entropy(model(perturbed_plus), y, reduction='none')
            loss_minus = cross_entropy(model(perturbed_minus), y, reduction='none')

        grad_estimate += (loss_plus - loss_minus).view(-1, 1, 1, 1) * u

    grad_estimate /= (2 * num_samples * sigma)
    return grad_estimate

In [None]:
def partial_information_attack(model, x, y, epsilon, num_samples, sigma, num_steps, alpha):
    # TODO: Return the perturbed image
    x_adv = x.clone().detach().requires_grad_(True).to(x.device)
    model.eval()

    for _ in range(num_steps):
        # Estimate the gradient using NES
        grad_estimate = nes_gradient_estimate(model, x_adv, y, epsilon, num_samples, sigma)

        # Update the adversarial image
        x_adv = x_adv + alpha * grad_estimate.sign()

        x_adv = torch.clamp(x_adv, x - epsilon, x + epsilon)
        x_adv = torch.clamp(x_adv, lower_limit, upper_limit)  # Ensure valid pixel range

    return x_adv.detach()

Now run this attack on your models and report the results. (You **DON'T** need to run the attack for the entire test dataset as this will take a lot of time!)

In [None]:
def partial_information_attack_batch(model, x, y, epsilon, num_samples, sigma, num_steps, alpha):
  batch_size = x.shape[0]
  x_adv_batch = []

  for i in range(batch_size):
      x_i = x[i:i+1]
      y_i = y[i:i+1]

      x_adv_i = partial_information_attack(model, x_i, y_i, epsilon, num_samples, sigma, num_steps, alpha)
      x_adv_batch.append(x_adv_i)

  x_adv_batch = torch.cat(x_adv_batch, dim=0)
  return x_adv_batch


In [None]:
epsilon = 16 / 255
num_samples = 500
sigma = 0.1
num_steps = 3
alpha = 0.2

test_samples = next(iter(testloader))
x_test, y_test = test_samples[0][:50].to(device), test_samples[1][:50].to(device)

# Perform the attack on the teacher and student models
x_adv_teacher = partial_information_attack_batch(teacher, x_test, y_test, epsilon, num_samples, sigma, num_steps, alpha)
x_adv_student = partial_information_attack_batch(student, x_test, y_test, epsilon, num_samples, sigma, num_steps, alpha)

In [None]:
# Evaluate the accuracy of the models on adversarial examples
with torch.no_grad():
    teacher_acc = (teacher(x_adv_teacher).argmax(1) == y_test).sum().item()/y_test.size(0) * 100
    student_acc = (student(x_adv_student).argmax(1) == y_test).sum().item()/y_test.size(0) * 100

print(f"Teacher Model Robust Accuracy under ZOO Attack: {teacher_acc:.2f}%")
print(f"Student Model Robust Accuracy under ZOO Attack: {student_acc:.2f}%")

Teacher Model Robust Accuracy under ZOO Attack: 68.00%
Student Model Robust Accuracy under ZOO Attack: 78.00%


# Adversarially Robust Distillation (15 points)

In this section we are going to test another type of distillation to see if this method is robust. This technique is [Adversarially Robust Distillation](https://arxiv.org/abs/1905.09747).



1.   We will try to distill a robsut teacher from [Robust Bench](https://robustbench.github.io/) onto a smaller architecture.
2.   We minimize the KL-Divergence between the logits of the student and teacher to ensure fidelity. (You can also incorporate the classification loss as mentioned in the paper but you can choose to ignore it as well)
3.   At each step of the distillation you will attack the student (you can use either FGSM or PGD) and find an adversarial example $X + \delta$ for data point $X$. Next you will minimize $t^2 \times \text{KL}(S(X+\delta), T(X))$ where $S$ and $T$ are the student and teacher networks respectively.



In [8]:
! pip install git+https://github.com/RobustBench/robustbench.git -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.8/54.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for robustbench (setup.py) ... [?25l[?25hdone
  Building wheel for autoattack (setup.py) ... [?25l[?25hdone


In [9]:
transform_train = transforms.Compose([
    transforms.ToTensor(),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
])

upper_limit = torch.tensor([1, 1, 1]).view(3, 1, 1).to(device)
lower_limit = torch.tensor([0, 0, 0]).view(3, 1, 1).to(device)

trainset = CIFAR10(root='./data', train=True, download=True, transform=transform_train)
train_dataloader = DataLoader(trainset, batch_size=128, shuffle=True, num_workers=1)

testset = CIFAR10(root='./data', train=False, download=True, transform=transform_test)
test_dataloader = DataLoader(testset, batch_size=128, shuffle=False, num_workers=1)

Files already downloaded and verified
Files already downloaded and verified


In [10]:
from robustbench.utils import load_model

teacher = load_model(model_name='Gowal2021Improving_R18_ddpm_100m', dataset='cifar10', threat_model='Linf')
teacher = teacher.to(device)



Downloading models/cifar10/Linf/Gowal2021Improving_R18_ddpm_100m.pt (gdrive_id=1-0EuCJashqSXEkkd1DOzFA4tH8KL2kim).


Downloading...
From (original): https://drive.google.com/uc?id=1-0EuCJashqSXEkkd1DOzFA4tH8KL2kim
From (redirected): https://drive.google.com/uc?id=1-0EuCJashqSXEkkd1DOzFA4tH8KL2kim&confirm=t&uuid=f62da137-eeba-4932-94c4-b0be15c0ed0e
To: /content/models/cifar10/Linf/Gowal2021Improving_R18_ddpm_100m.pt
100%|██████████| 50.3M/50.3M [00:01<00:00, 49.0MB/s]
  checkpoint = torch.load(model_path, map_location=torch.device('cpu'))


In [35]:
# Clean accurcy
print(f'Teacher Clean Accuracy {test_clean(teacher, dataloader=test_dataloader):.2f}%')

# FGSM with eps=8/255
acc = test_attack(teacher, 8/255, attack=attack_fgsm, loader=test_dataloader, temp=1)
print(f'FGSM with ϵ=8/255 has Accuracy: {acc:.2f}%')

# PGD with eps=8/255
acc = test_attack(teacher, 8/255, attack=attack_pgd, loader=test_dataloader, temp=1)
print(f'PGD with ϵ=8/255 has Accuracy: {acc:.2f}%')



Teacher Clean Accuracy 87.35%




FGSM with ϵ=8/255 has Accuracy: 66.38%


                                                                                             

PGD with ϵ=8/255 has Accuracy: 62.67%




In [31]:
def ard(student, teacher, dataloader, optimizer, eps, attack, t):
    # TODO
    KL_loss = nn.KLDivLoss()
    total_loss, correct, total = 0.0, 0, 0

    for x, y in dataloader:
        x, y = x.to(device), y.to(device)

        if attack == 'FGSM':
            x_adv = attack_fgsm(student, x, y, epsilon=eps, temp=t)
        elif attack == 'PGD':
            x_adv = attack_pgd(student, x, y, epsilon=eps, temp=t)

        student.train()

        student_logits = student(x_adv) / t
        teacher_logits = teacher(x) / t

        # Compute the distillation loss (KL divergence)
        kl_loss = KL_loss(F.log_softmax(student_logits, dim=1),
                          F.softmax(teacher_logits, dim=1)) * (t**2)

        loss = kl_loss #+ ce_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        _, predicted = student(x).max(1)
        correct += predicted.eq(y).sum().item()
        total += y.size(0)

    average_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return average_loss, accuracy


def adv_train_student(model, teacher, n_epochs, eps=8/255, temp=1, loader=train_dataloader):
    # TODO
    model.train()
    teacher.eval()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(n_epochs):
        loss, acc = ard(model, teacher, loader, optimizer, eps, attack='FGSM', t=temp)
        print(f"Epoch {epoch + 1}/{n_epochs} - Loss: {loss:.4f}, Accuracy: {acc:.4f}")

In [32]:
student = mobilenet_v2(weights=None, num_classes=10)
student = student.to(device)

temperature = 1
# TODO: Adjust and train the student
adv_train_student(student, teacher, n_epochs=15, temp=temperature, loader=train_dataloader)



Epoch 1/15 - Loss: 0.2038, Accuracy: 0.1529
Epoch 2/15 - Loss: 0.1800, Accuracy: 0.2334
Epoch 3/15 - Loss: 0.1736, Accuracy: 0.2657
Epoch 4/15 - Loss: 0.1703, Accuracy: 0.2744
Epoch 5/15 - Loss: 0.1665, Accuracy: 0.2965
Epoch 6/15 - Loss: 0.1637, Accuracy: 0.3105
Epoch 7/15 - Loss: 0.1605, Accuracy: 0.3357
Epoch 8/15 - Loss: 0.1582, Accuracy: 0.3653
Epoch 9/15 - Loss: 0.1556, Accuracy: 0.3809
Epoch 10/15 - Loss: 0.1536, Accuracy: 0.3922
Epoch 11/15 - Loss: 0.1525, Accuracy: 0.4044
Epoch 12/15 - Loss: 0.1505, Accuracy: 0.4191
Epoch 13/15 - Loss: 0.1490, Accuracy: 0.4292
Epoch 14/15 - Loss: 0.1480, Accuracy: 0.4409
Epoch 15/15 - Loss: 0.1467, Accuracy: 0.4456


Now report the accuracy of the student on the test dataset.

In [34]:
# TODO: Clean accurcy
print(f'Student Clean Accuracy {test_clean(student, dataloader=test_dataloader):.2f}%')

# TODO: FGSM with eps=8/255
acc = test_attack(student, 8/255, attack=attack_fgsm, loader=test_dataloader, temp=1)
print(f'FGSM with ϵ=8/255 has Accuracy: {acc:.2f}%')

# TODO: PGD with eps=8/255
acc = test_attack(student, 8/255, attack=attack_pgd, loader=test_dataloader, temp=1)
print(f'PGD with ϵ=8/255 has Accuracy: {acc:.2f}%')



Student Clean Accuracy 43.13%




FGSM with ϵ=8/255 has Accuracy: 31.23%


                                                                                             

PGD with ϵ=8/255 has Accuracy: 30.62%


