# Boilerplate

Package installation, loading, and dataloaders. There's also a simple model defined. You can change it your favourite architecture if you want.

In [30]:
# !pip install tensorboardX

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import time
import matplotlib.pyplot as plt

from torchvision import datasets, transforms
# from tensorboardX import SummaryWriter

use_cuda = False
device = torch.device("cuda" if use_cuda else "cpu")
batch_size = 64 

np.random.seed(42)
torch.manual_seed(42)


## Dataloaders
train_dataset = datasets.MNIST('mnist_data/', train=True, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))
test_dataset = datasets.MNIST('mnist_data/', train=False, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Simple NN. You can change this if you want. If you change it, mention the architectural details in your report.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(28*28, 200)
        self.fc2 = nn.Linear(200,10)

    def forward(self, x):
        x = x.view((-1, 28*28))
        x = F.relu(self.fc(x))
        x = self.fc2(x)
        return x

class Normalize(nn.Module):
    def forward(self, x):
        return (x - 0.1307)/0.3081

# Add the data normalization as a first "layer" to the network
# this allows us to search for adverserial examples to the real image, rather than
# to the normalized image
model = nn.Sequential(Normalize(), Net())

model = model.to(device)
model.train()

Sequential(
  (0): Normalize()
  (1): Net(
    (fc): Linear(in_features=784, out_features=200, bias=True)
    (fc2): Linear(in_features=200, out_features=10, bias=True)
  )
)

# Implement the Attacks

Functions are given a simple useful signature that you can start with. Feel free to extend the signature as you see fit.

You may find it useful to create a 'batched' version of PGD that you can use to create the adversarial attack.

In [3]:
# The last argument 'targeted' can be used to toggle between a targeted and untargeted attack.
def fgsm(model, x, y, eps):
    #TODO: implement this as an intermediate step of PGD
    # Notes: put the model in eval() mode for this function
    x_grad = x.clone().detach().requires_grad_(True)
    output = model(x_grad)
    loss = F.cross_entropy(output, y)
    model.zero_grad()
    loss.backward()
    sign_data_grad = x_grad.grad.data.sign()
    perturbed_image = x_grad + eps * sign_data_grad
    return perturbed_image

def pgd_untargeted(model, x, y, k, eps, eps_step):
    #TODO: implement this 
    # Notes: put the model in eval() mode for this function
    # x: input image
    # y: ground truth label for x
    # k: steps of FGSM
    # eps: projection region for PGD (note the need for normalization before projection, as eps values are for inputs in [0,1])
    # eps_step: step for one iteration of FGSM

    x_adv_out = x.clone().detach().requires_grad_(True)

    for i in range(k):
        perturbed_image = fgsm(model, x_adv_out, y, eps_step)
        perturbed_image = torch.clamp(perturbed_image - x, -eps, eps)
        x_adv_out = torch.clamp(x + perturbed_image, 0, 1)

    return x_adv_out
# The last argument 'targeted' can be used to toggle between a targeted and untargeted attack.


# Implement Adversarial Training

In [4]:
def train_model(model, train_loader, test_loader, num_epochs, enable_defense=False, attack='pgd', eps=0.1):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(num_epochs):
        model.train()
        curr_loss = 0.0
        
        for i, (image, label) in enumerate(train_loader):
            image, label = image.to(device), label.to(device)
            
            if enable_defense:
                if attack == 'fgsm':
                    image_adv = fgsm(model, image, label, eps)
                elif attack == 'pgd':
                    image_adv = pgd_untargeted(model, image, label, 10, eps, 0.01)
                
                outputs = model(image_adv)
            else:
                outputs = model(image)
            
            loss = F.cross_entropy(outputs, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            curr_loss += loss.item()
        
        print(f"Epoch {epoch} of {num_epochs}, Loss: {curr_loss/len(train_loader):.3f}")
    # TODO: implement this function that trains a given model on the MNIST dataset.
    # this is a general-purpose function for both standard training and adversarial training.
    # (toggle enable_defense parameter to switch between training schemes) 

In [64]:
def test_model_on_attacks(model, test_loader, attack='pgd', eps=0.1):
    # TODO: implement this function to test the robust accuracy of the given model
    # use pgd_untargeted() within this function
    correct = 0
    adv_examples = []
    for j, (image, label) in enumerate(test_loader):
        image, label = image, label
        if attack == 'fgsm':
            adv_x = fgsm(model, image, label, eps)
        else:
            adv_x = pgd_untargeted(model, image, label, 10, eps, 0.01)        
        output = model(adv_x)
        _, pred = torch.max(output, 1)
        correct += (pred == label).sum().item()
    model.train()
    print('Accuracy = {}%'.format(float(correct) * 100 / 10000))
    

In [9]:
def test_std_acc(model, test_loader):
    correct = 0
    for j, (image, label) in enumerate(test_loader):
        image, label = image.to(device), label.to(device)
        output = model(image)
        _, pred = torch.max(output, 1)
        correct += (pred == label).sum().item()
    model.train()
    print('Accuracy = {}%'.format(float(correct) * 100 / 10000))

# Study Accuracy, Quality, etc.

Compare the various results and report your observations on the submission.

In [97]:
## train the original model
model.eval()
model = nn.Sequential(Normalize(), Net())
model = model.to(device)
model.train()

train_model(model, train_loader, test_loader, 20, False)
torch.save(model.state_dict(), 'weights.pt')

Epoch 0 of 20, Loss: 0.239
Epoch 1 of 20, Loss: 0.100
Epoch 2 of 20, Loss: 0.068
Epoch 3 of 20, Loss: 0.051
Epoch 4 of 20, Loss: 0.039
Epoch 5 of 20, Loss: 0.033
Epoch 6 of 20, Loss: 0.026
Epoch 7 of 20, Loss: 0.023
Epoch 8 of 20, Loss: 0.019
Epoch 9 of 20, Loss: 0.018
Epoch 10 of 20, Loss: 0.015
Epoch 11 of 20, Loss: 0.013
Epoch 12 of 20, Loss: 0.015
Epoch 13 of 20, Loss: 0.013
Epoch 14 of 20, Loss: 0.011
Epoch 15 of 20, Loss: 0.013
Epoch 16 of 20, Loss: 0.010
Epoch 17 of 20, Loss: 0.006
Epoch 18 of 20, Loss: 0.013
Epoch 19 of 20, Loss: 0.008


In [33]:
# standard accuracy

model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load('weights.pt'))

test_std_acc(model, test_loader)

  model.load_state_dict(torch.load('weights.pt'))


Accuracy = 98.12%


In [101]:
## PGD based adversarial training
model = nn.Sequential(Normalize(), Net())
eps = 0.1
train_model(model, train_loader, test_loader, 20, True, 'pgd', eps)
torch.save(model.state_dict(), f'weights_AT_{eps}.pt')

Epoch 0 of 20, Loss: 0.770
Epoch 1 of 20, Loss: 0.475
Epoch 2 of 20, Loss: 0.417
Epoch 3 of 20, Loss: 0.387
Epoch 4 of 20, Loss: 0.365
Epoch 5 of 20, Loss: 0.353
Epoch 6 of 20, Loss: 0.343
Epoch 7 of 20, Loss: 0.333
Epoch 8 of 20, Loss: 0.330
Epoch 9 of 20, Loss: 0.323
Epoch 10 of 20, Loss: 0.316
Epoch 11 of 20, Loss: 0.314
Epoch 12 of 20, Loss: 0.307
Epoch 13 of 20, Loss: 0.307
Epoch 14 of 20, Loss: 0.302
Epoch 15 of 20, Loss: 0.300
Epoch 16 of 20, Loss: 0.297
Epoch 17 of 20, Loss: 0.294
Epoch 18 of 20, Loss: 0.291
Epoch 19 of 20, Loss: 0.291


In [103]:
## PGD attack on adversarially trained model
model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load('weights_AT_0.1.pt'))

for eps in [0.05, 0.1, 0.15, 0.2]:
    #model = model.load_state_dict(torch.load(f'weights_AT_{eps}.pt'))
    test_model_on_attacks(model, test_loader, attack='pgd', eps=eps)

  model.load_state_dict(torch.load('weights_AT_0.1.pt'))


Accuracy = 95.43%
Accuracy = 89.02%
Accuracy = 72.67%
Accuracy = 43.79%


In [65]:
## FGSM attack on adversarially trained model
model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load('weights_AT_0.1.pt'))

for eps in [0.05, 0.1, 0.15, 0.2]:
    #model = model.load_state_dict(torch.load(f'weights_AT_{eps}.pt'))
    test_model_on_attacks(model, test_loader, attack='fgsm', eps=eps)

  model.load_state_dict(torch.load('weights_AT_0.1.pt'))


Accuracy = 93.77%
Accuracy = 83.76%
Accuracy = 64.59%
Accuracy = 40.13%
