# 02 - FGSM & PGD Adversarial Attacks

**Goal**: Evaluate model vulnerability to adversarial attacks.

**Research Questions**:
- How does adversarial noise affect CNN accuracy?
- How does model confidence change on adversarial examples?
- Is PGD more effective than FGSM?

In [None]:
# Environment setup: make the project's `src` package importable, both on
# Google Colab and when the notebook is run from a local checkout.
import sys
import os

# The `google.colab` module is only loaded when running inside Colab.
if 'google.colab' in sys.modules:
    # Work from /content and fetch the repository there.
    %cd /content
    # stderr is discarded and a failed clone is ignored (`|| true`), so
    # re-running this cell after the repository already exists does not abort.
    !git clone https://github.com/cdm34/adversarial-robustness.git 2>/dev/null || true
    %cd adversarial-robustness
    # Put the cloned repository first on the import path.
    sys.path.insert(0, '/content/adversarial-robustness')
else:
    # Local run: put the parent of the current working directory first on the
    # import path (presumably the repository root when the notebook is started
    # from its own folder -- confirm against the repo layout).
    sys.path.insert(0, os.path.abspath('..'))

In [None]:
import torch
import matplotlib.pyplot as plt
import numpy as np

from src import (
    FashionMNISTNet,
    DataConfig, get_fashion_mnist_datasets, split_train_val, make_loaders,
    AttackConfig, fgsm, pgd_linf,
    accuracy, confidence_stats, attack_success_rate,
    get_device, set_seed,
    plot_adversarial_examples, plot_epsilon_vs_accuracy, save_figure,
    FASHION_MNIST_CLASSES,
)

# Record the PyTorch version in the notebook output for reproducibility.
print("PyTorch version: {}".format(torch.__version__))

## 1. Setup & Load Model

In [None]:
# Seed first, then pick the compute device used by every later cell.
set_seed(42)
device = get_device()
print(f"Using device: {device}")

# Only the test split is evaluated in this notebook, so the validation
# ratio is set to 0.0 and nothing is held out from the training data.
data_cfg = DataConfig(batch_size=128, val_ratio=0.0)
train_ds, test_ds = get_fashion_mnist_datasets()
train_subset, val_subset = split_train_val(train_ds, data_cfg.val_ratio)

# Build the three loaders; `test_loader` is the one the attacks iterate over.
train_loader, val_loader, test_loader = make_loaders(
    train_subset,
    val_subset,
    test_ds,
    data_cfg,
    device,
)

print(f"Test batches: {len(test_loader)}")

In [None]:
# Baseline model: restore the saved weights when a checkpoint exists,
# otherwise train a short stand-in so the rest of the notebook can still run.
checkpoint_path = 'checkpoints/baseline_cnn.pt'
model = FashionMNISTNet().to(device)

if not os.path.exists(checkpoint_path):
    print("WARNING: No checkpoint found. Training a quick model...")
    from src import TrainConfig, fit

    # The fallback run rebuilds the datasets and loaders, this time with a
    # validation ratio of 0.1; note that this also replaces `test_loader`.
    train_ds, test_ds = get_fashion_mnist_datasets()
    train_subset, val_subset = split_train_val(train_ds, 0.1)
    train_loader, val_loader, test_loader = make_loaders(
        train_subset, val_subset, test_ds, DataConfig(batch_size=128), device
    )
    fit(model, train_loader, val_loader, device, TrainConfig(epochs=5))
else:
    # The checkpoint is indexed by 'model_state_dict' (weights) and
    # 'test_accuracy' (the score printed below).
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded model with test accuracy: {checkpoint['test_accuracy']:.2f}%")

# Switch to evaluation mode and measure the unattacked reference accuracy.
model.eval()
clean_acc = accuracy(model, test_loader, device)
print(f"Clean accuracy: {clean_acc:.2f}%")

## 2. FGSM Attack Analysis

In [None]:
# FGSM sweep: accuracy on attacked test data for each perturbation budget.
epsilons = [0.0, 0.01, 0.03, 0.05, 0.1, 0.15, 0.2, 0.3]
fgsm_accuracies = []

print("FGSM Attack Results:")
print("-" * 40)

for eps in epsilons:
    # eps = 0 means no perturbation, so the clean accuracy is reused as-is.
    acc = clean_acc

    if eps != 0.0:
        cfg = AttackConfig(eps=eps)
        n_correct, n_seen = 0, 0

        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            # The attack is called outside no_grad; only the forward pass
            # used for scoring is gradient-free.
            adv_images = fgsm(model, images, labels, cfg)

            with torch.no_grad():
                adv_preds = torch.argmax(model(adv_images), dim=1)

            n_correct += int((adv_preds == labels).sum())
            n_seen += labels.shape[0]

        # Accuracy in percent over every test sample seen.
        acc = 100.0 * n_correct / n_seen

    fgsm_accuracies.append(acc)
    print(f"  ε = {eps:.2f}: {acc:.2f}%")

## 3. PGD Attack Analysis

In [None]:
# PGD sweep over the same `epsilons` list, so both curves share an x-axis.
pgd_accuracies = []

print("PGD Attack Results (10 steps):")
print("-" * 40)

for eps in epsilons:
    # eps = 0 means no perturbation, so the clean accuracy is reused as-is.
    acc = clean_acc

    if eps != 0.0:
        # 10 iterations with a per-step size of eps/4.
        cfg = AttackConfig(eps=eps, steps=10, step_size=eps/4)
        hits = 0
        seen = 0

        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            # The attack is called outside no_grad; only scoring is gradient-free.
            batch_adv = pgd_linf(model, batch_x, batch_y, cfg)

            with torch.no_grad():
                logits_adv = model(batch_adv)

            hits += int((logits_adv.argmax(dim=1) == batch_y).sum())
            seen += batch_y.shape[0]

        # Accuracy in percent over every test sample seen.
        acc = 100.0 * hits / seen

    pgd_accuracies.append(acc)
    print(f"  ε = {eps:.2f}: {acc:.2f}%")

In [None]:
# Plot accuracy vs epsilon.
# `fgsm_accuracies` and `pgd_accuracies` were each filled with one entry per
# value in `epsilons` by the sweep cells above, so the three lists are
# index-aligned; `clean_acc` is the unattacked reference accuracy.
fig = plot_epsilon_vs_accuracy(epsilons, clean_acc, fgsm_accuracies, pgd_accuracies)
# Persist the figure under the name 'accuracy_vs_epsilon' (output location and
# file format are decided inside save_figure, which is not visible here).
save_figure(fig, 'accuracy_vs_epsilon')
plt.show()

## 4. Visualize Adversarial Examples

In [None]:
# Take the first 10 samples of one test batch for the side-by-side figure.
first_images, first_labels = next(iter(test_loader))
x_batch = first_images[:10].to(device)
y_batch = first_labels[:10].to(device)

# Perturb them with FGSM at a single fixed budget.
eps_vis = 0.1
cfg_vis = AttackConfig(eps=eps_vis)
x_adv = fgsm(model, x_batch, y_batch, cfg_vis)

# Predicted classes before and after the attack.
with torch.no_grad():
    preds_clean = torch.argmax(model(x_batch), dim=1)
    preds_adv = torch.argmax(model(x_adv), dim=1)

# Draw the clean/adversarial comparison, save it, then display it.
fig = plot_adversarial_examples(
    x_batch,
    x_adv,
    y_batch,
    preds_clean,
    preds_adv,
    num_samples=5,
    eps=eps_vis,
)
save_figure(fig, 'fgsm_adversarial_examples')
plt.show()

In [None]:
# Same visualization for PGD, reusing the batch, clean predictions and
# epsilon defined by the FGSM visualization cell.
cfg_pgd = AttackConfig(eps=eps_vis, steps=10, step_size=eps_vis/4)
x_adv_pgd = pgd_linf(model, x_batch, y_batch, cfg_pgd)

with torch.no_grad():
    pgd_logits = model(x_adv_pgd)
    preds_adv_pgd = pgd_logits.argmax(dim=1)

fig = plot_adversarial_examples(
    x_batch,
    x_adv_pgd,
    y_batch,
    preds_clean,
    preds_adv_pgd,
    num_samples=5,
    eps=eps_vis,
)
# Set a figure-level title so this plot is distinguishable from the FGSM one.
fig.suptitle('PGD Adversarial Examples', fontsize=12)
save_figure(fig, 'pgd_adversarial_examples')
plt.show()

## 5. Confidence Analysis (AI Safety)

In [None]:
# Compare confidence (max softmax probability) on clean vs adversarial inputs.
# Besides the overall means, the confidence on adversarial inputs that the
# model actually misclassifies is tracked separately: that subset is what
# shows whether the model is confidently wrong, which the overall adversarial
# mean alone cannot tell.
eps_test = 0.1
cfg_test = AttackConfig(eps=eps_test)

clean_confs = []
adv_confs = []
adv_wrong_confs = []  # confidence on adversarial inputs predicted incorrectly

for x, y in test_loader:
    x, y = x.to(device), y.to(device)
    # The attack is called outside no_grad (it presumably needs input gradients).
    x_adv = fgsm(model, x, y, cfg_test)

    with torch.no_grad():
        clean_probs = torch.softmax(model(x), dim=1)
        adv_probs = torch.softmax(model(x_adv), dim=1)

        # max over the class dimension gives (confidence, predicted class).
        clean_conf, _ = clean_probs.max(dim=1)
        adv_conf, adv_pred = adv_probs.max(dim=1)

        clean_confs.extend(clean_conf.cpu().tolist())
        adv_confs.extend(adv_conf.cpu().tolist())
        # Keep only the samples whose adversarial prediction is wrong.
        adv_wrong_confs.extend(adv_conf[adv_pred != y].cpu().tolist())

print(f"Clean examples - Mean confidence: {np.mean(clean_confs):.3f}")
print(f"Adversarial (ε={eps_test}) - Mean confidence: {np.mean(adv_confs):.3f}")
if adv_wrong_confs:
    print(
        f"Misclassified adversarial (ε={eps_test}) - Mean confidence: "
        f"{np.mean(adv_wrong_confs):.3f} (n={len(adv_wrong_confs)})"
    )
else:
    # Avoid np.mean on an empty list, which yields nan plus a RuntimeWarning.
    print(f"Misclassified adversarial (ε={eps_test}) - none found")

In [None]:
# Confidence distribution comparison: overlaid density histograms of the
# max-softmax confidence on clean and adversarial inputs, with dashed
# vertical lines at the two means.
fig, ax = plt.subplots(figsize=(10, 5))

# Use one shared set of 50 bins over [0, 1] (confidence is a probability).
# Passing bins=50 to each call would derive the edges from each dataset's own
# min/max, giving the two histograms different bin widths and making the
# overlaid densities hard to compare.
conf_bins = np.linspace(0.0, 1.0, 51)

ax.hist(clean_confs, bins=conf_bins, alpha=0.6, label='Clean', color='blue', density=True)
ax.hist(adv_confs, bins=conf_bins, alpha=0.6, label=f'Adversarial (ε={eps_test})', color='red', density=True)

ax.axvline(np.mean(clean_confs), color='blue', linestyle='--', linewidth=2)
ax.axvline(np.mean(adv_confs), color='red', linestyle='--', linewidth=2)

ax.set_xlabel('Confidence (max softmax probability)', fontsize=12)
ax.set_ylabel('Density', fontsize=12)
ax.set_title('Model Confidence: Clean vs. Adversarial Examples', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
save_figure(fig, 'confidence_distribution')
plt.show()

In [None]:
# Attack success rate of FGSM and PGD (10 steps, step size eps/4) at a few
# selected perturbation budgets.
print("\nAttack Success Rates:")
print("-" * 40)

for eps in (0.05, 0.1, 0.15, 0.2):
    cfg = AttackConfig(eps=eps)
    fgsm_result = attack_success_rate(model, test_loader, fgsm, cfg, device)

    pgd_cfg = AttackConfig(eps=eps, steps=10, step_size=eps/4)
    pgd_result = attack_success_rate(model, test_loader, pgd_linf, pgd_cfg, device)

    # Each result is indexed by 'attack_success_rate' and printed as a percentage.
    fgsm_rate = fgsm_result['attack_success_rate']
    pgd_rate = pgd_result['attack_success_rate']
    print(f"ε = {eps:.2f}: FGSM = {fgsm_rate:.1f}%, PGD = {pgd_rate:.1f}%")

## Summary

**Key Findings**:
1. Model accuracy degrades rapidly with increasing ε
2. PGD attacks are stronger than FGSM (as expected)
3. Model remains overconfident even on misclassified adversarial examples ⚠️
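4. At ε=0.1, the attack success rate is TBD — fill in the FGSM and PGD values printed by the "Attack Success Rates" cell after running the notebook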

**AI Safety Implications**:
- High confidence on wrong predictions is dangerous in safety-critical applications
- Standard training provides no adversarial robustness

**Next**: Evaluate defenses (dropout, preprocessing, adversarial training)