In [1]:
import numpy as np
import logging
import tensorflow as tf

from cleverhans.utils_mnist import data_mnist
from cleverhans.utils import to_categorical
from cleverhans.utils import set_log_level
from cleverhans.utils_tf import model_train, model_eval, batch_eval
from cleverhans.attacks import FastGradientMethod, SaliencyMapMethod
from cleverhans.attacks_tf import jacobian_graph, jacobian_augmentation

from cleverhans_tutorials.tutorial_models import make_basic_cnn, MLP
from cleverhans_tutorials.tutorial_models import Flatten, Linear, ReLU, Softmax

from ipywidgets import interact, fixed
from pprint import pprint
import matplotlib.pyplot as plt

In [2]:
# NOTE(review): IPython `??` displays the source of make_basic_cnn. This is an
# exploratory aid only; nothing later in the notebook depends on this cell.
??make_basic_cnn

In [3]:
def prep_attacked_model(sess, x, y, X_train, Y_train, X_test, Y_test,
                        nb_epochs, batch_size, learning_rate,
                        rng):
    """
    Build the model that will be attacked, train it and report its
    accuracy on the test data.
    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param X_train: training inputs
    :param Y_train: training labels
    :param X_test: test inputs
    :param Y_test: test labels
    :param nb_epochs: number of training epochs
    :param batch_size: size of training and evaluation batches
    :param learning_rate: learning rate for training
    :param rng: random number generator forwarded to model_train
    :return: tuple (model, predictions, accuracy)
    """
    # Build the network and attach it to the input placeholder
    model = make_basic_cnn()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    # Fit the network on the training data
    model_train(
        sess, x, y, predictions, X_train, Y_train,
        verbose=False,
        args={
            'nb_epochs': nb_epochs,
            'batch_size': batch_size,
            'learning_rate': learning_rate,
        },
        rng=rng,
    )

    # Measure and report accuracy on the unmodified test examples
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args={'batch_size': batch_size})
    print('Test accuracy of black-box on legitimate test '
          'examples: ' + str(accuracy))

    return model, predictions, accuracy

In [4]:
# Ask cleverhans to log at DEBUG level
set_log_level(logging.DEBUG)
# Accuracy results are collected here, keyed by name (e.g. 'attacked')
accuracies = {}
# Single TF session shared by every later cell
sess = tf.Session()

In [5]:
# Slice bounds handed to data_mnist for the train and test splits
train_start, train_end = 0, 60000
test_start, test_end = 0, 10000

# Number of output classes
nb_classes = 10

# Training settings (batch size and learning rate are also reused for the
# substitute model)
batch_size = 128
learning_rate = 0.001
nb_epochs = 10

# Black-box attack settings
holdout = 150      # leading test samples set aside as the adversary's seed data
data_aug = 6       # number of times substitute training data is augmented
nb_epochs_s = 10   # number of epochs to train substitute model
lmbda = 0.1        # lambda from arxiv.org/abs/1602.02697

In [6]:
# Load the requested MNIST train/test ranges
X_train, Y_train, X_test, Y_test = data_mnist(
    train_start=train_start, train_end=train_end,
    test_start=test_start, test_end=test_end)

# The adversary's initial substitute training set is the first `holdout`
# test samples; its labels are kept as integer class ids (argmax of each row)
X_sub, Y_sub = X_test[:holdout], Y_test[:holdout].argmax(axis=1)

# The remaining samples, unavailable to the adversary, become the test set
X_test, Y_test = X_test[holdout:], Y_test[holdout:]

Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting /tmp/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting /tmp/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting /tmp/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)


In [None]:
# Label smoothing: clip the training labels into
# [label_smooth / (nb_classes - 1), 1 - label_smooth]. For one-hot rows this
# leaves 1 - label_smooth on the true class and spreads label_smooth evenly
# over the other classes (assumes Y_train is one-hot -- TODO confirm).
# The divisor is derived from nb_classes instead of a hard-coded 9 so it stays
# in sync with the config cell. Clipping is idempotent, so re-running this
# cell does not smooth twice.
label_smooth = .1
Y_train = Y_train.clip(label_smooth / (nb_classes - 1), 1. - label_smooth)

In [None]:
# Symbolic inputs: batches of 28x28x1 images and their 10-column label rows
x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
y = tf.placeholder(tf.float32, shape=(None, 10))

# Fixed seed so the tutorial is reproducible
rng = np.random.RandomState([2017, 12, 13])

# Build, train and evaluate the model that will be attacked
prep_attacked = prep_attacked_model(
    sess, x, y,
    X_train, Y_train,
    X_test, Y_test,
    nb_epochs, batch_size, learning_rate,
    rng=rng,
)

# Keep the model and its output tensor; record the clean test accuracy
model, bbox_preds, accuracies['attacked'] = prep_attacked

Defined TensorFlow model graph.




# FGSM

In [None]:
# FGSM attack object built directly on the attacked model, sharing the session
fgsm = FastGradientMethod(model, sess=sess)

In [None]:
# Display the attack object's repr as the cell output
fgsm

In [None]:
# Attack settings forwarded as keyword arguments to fgsm.generate
fgsm_params = dict(eps=0.3, clip_min=0., clip_max=1.)

# Adversarial-example tensor built on the input placeholder x
adv_x_fgsm = fgsm.generate(x, **fgsm_params)

In [None]:
# Display the object returned by fgsm.generate as the cell output
adv_x_fgsm

In [None]:
def pretty_print_list(l):
    """
    Pretty-print a sequence of numbers as an index -> value mapping.
    :param l: iterable of numbers
    :return: None (the mapping is written with pprint, each value formatted
             to two decimal places)
    """
    formatted = {}
    for position, value in enumerate(l):
        formatted[position] = f'{value:.2f}'
    pprint(formatted)

In [None]:
def plot(idx, adv_x):
    """
    Show one test image next to its adversarial version and compare the
    attacked model's outputs on both.

    Relies on the notebook-level names model, x, sess and X_test.
    :param idx: index into X_test of the example to display
    :param adv_x: adversarial-example tensor built on placeholder x
    :return: None (draws two figures and prints the model outputs)
    """
    original = X_test[idx:idx+1]

    # Build the model's output op once and reuse it for both evaluations.
    # Calling model(x) separately for each evaluation builds the forward
    # graph twice on every slider move.
    outputs = model(x)

    adv = adv_x.eval(feed_dict={x: original}, session=sess)
    results = outputs.eval(feed_dict={x: original}, session=sess)
    adv_results = outputs.eval(feed_dict={x: adv}, session=sess)

    # Original image
    plt.imshow(X_test[idx, ..., 0], cmap='gray')
    plt.show()

    # Adversarial image
    plt.imshow(adv[0, ..., 0], cmap='gray')
    plt.show()

    print("Targeted:", results.argmax(), "Adversarial:", adv_results.argmax())
    print("Targeted probabilities")
    pretty_print_list(results.reshape(-1).tolist())
    print("Adversarial probabilities")
    pretty_print_list(adv_results.reshape(-1).tolist())

In [None]:
# Slider over every test index. The trailing semicolon keeps the cell from
# echoing interact's return value (the plot function) below the widget.
interact(plot, idx=(0, len(X_test) - 1), adv_x=fixed(adv_x_fgsm));

# JSMA

In [None]:
def get_y_target(x=None, num_classes=None):
    """
    Build the one-hot target row for a targeted attack.
    :param x: integer class index to target, or None for no target
    :param num_classes: length of the one-hot row; when omitted, the
                        notebook-level nb_classes is used
    :return: float32 array of shape (1, num_classes) with a single 1 in
             column x, or None when x is None
    """
    if x is None:
        return None

    # Fall back to the notebook constant only when the caller gave no size,
    # so the helper can also be used without that global being defined
    if num_classes is None:
        num_classes = nb_classes

    one_hot_target = np.zeros((1, num_classes), dtype=np.float32)
    one_hot_target[0, x] = 1
    return one_hot_target


In [None]:
# Settings forwarded as keyword arguments to the JSMA generate call;
# the attack targets class 5
jsma_params = dict(
    theta=1.,
    gamma=0.1,
    clip_min=0.,
    clip_max=1.,
    y_target=get_y_target(5),
)


In [None]:
# JSMA attack object built directly on the attacked model, then the
# adversarial-example tensor on placeholder x using jsma_params
jsma = SaliencyMapMethod(model, sess=sess)
adv_x_jsma = jsma.generate(x, **jsma_params)

In [None]:
# Slider over every test index. The trailing semicolon keeps the cell from
# echoing interact's return value (the plot function) below the widget.
interact(plot, idx=(0, len(X_test) - 1), adv_x=fixed(adv_x_jsma));

# BLACK-BOX

In [None]:
def substitute_model(img_rows=28, img_cols=28, nb_classes=10):
    """
    Build the architecture used by the substitute, following the example
    model interface.
    :param img_rows: number of rows in input
    :param img_cols: number of columns in input
    :param nb_classes: number of classes in output
    :return: tensorflow model
    """
    # Fully connected network (deliberately different from the black-box):
    # flatten, two 200-unit Linear+ReLU stages, then a softmax output layer
    layers = [Flatten()]
    for _ in range(2):
        layers += [Linear(200), ReLU()]
    layers += [Linear(nb_classes), Softmax()]

    return MLP(layers, (None, img_rows, img_cols, 1))

In [None]:
def train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes,
              nb_epochs_s, batch_size, learning_rate, data_aug, lmbda,
              rng):
    """
    This function creates the substitute by alternately
    augmenting the training data and training the substitute.
    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param X_sub: initial substitute training data
    :param Y_sub: initial substitute training labels (integer class ids)
    :param nb_classes: number of output classes
    :param nb_epochs_s: number of epochs to train substitute model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param data_aug: number of times substitute training data is augmented
    :param lmbda: lambda from arxiv.org/abs/1602.02697
    :param rng: numpy.random.RandomState instance
    :return: tuple (model_sub, preds_sub): the substitute model and its
             output built on placeholder x
    """
    # Define TF model graph (for the substitute model), sized for the same
    # number of classes that is used for the labels below
    model_sub = substitute_model(nb_classes=nb_classes)
    preds_sub = model_sub(x)
    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow
    grads = jacobian_graph(preds_sub, x, nb_classes)

    # These settings do not change between rounds, so build them once
    train_params = {
        'nb_epochs': nb_epochs_s,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}

    # Train the substitute and augment dataset alternately
    for rho in range(data_aug):
        print("Substitute training epoch #" + str(rho))
        # Pass nb_classes explicitly so the one-hot width always matches the
        # label placeholder, even if the highest class id is absent from Y_sub
        model_train(sess, x, y, preds_sub, X_sub,
                    to_categorical(Y_sub, nb_classes),
                    init_all=False, verbose=False, args=train_params,
                    rng=rng)

        # If we are not at last substitute training iteration, augment dataset
        if rho < data_aug - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation
            X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads, lmbda)

            print("Labeling substitute training data.")
            # Label the newly generated synthetic points using the black-box.
            # The label vector is doubled first; its second half is then
            # overwritten with the black-box's answers.
            Y_sub = np.hstack([Y_sub, Y_sub])
            half = len(X_sub) // 2
            X_sub_prev = X_sub[half:]
            bbox_val = batch_eval(sess, [x], [bbox_preds], [X_sub_prev],
                                  args=eval_params)[0]
            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model
            Y_sub[half:] = np.argmax(bbox_val, axis=1)

    return model_sub, preds_sub

In [None]:
# Train the substitute against the black-box model's labels
train_sub_out = train_sub(
    sess, x, y, bbox_preds,
    X_sub, Y_sub, nb_classes,
    nb_epochs_s, batch_size, learning_rate,
    data_aug, lmbda,
    rng=rng,
)

# Substitute model and its output built on placeholder x
model_sub, preds_sub = train_sub_out

In [None]:
# FGSM crafted on the substitute model (black-box setting)
fgsm_bb = FastGradientMethod(model_sub, sess=sess)
fgsm_params_bb = {'eps': 0.6,
                  'clip_min': 0.,
                  'clip_max': 1.}
# Use the black-box settings defined just above. Passing fgsm_params here
# would silently reuse the white-box eps of 0.3 and leave fgsm_params_bb unused.
adv_x_fgsm_bb = fgsm_bb.generate(x, **fgsm_params_bb)

In [None]:
# Slider over every test index. The trailing semicolon keeps the cell from
# echoing interact's return value (the plot function) below the widget.
interact(plot, idx=(0, len(X_test) - 1), adv_x=fixed(adv_x_fgsm_bb));

In [None]:
# JSMA crafted on the substitute model (black-box setting), targeting class 0
jsma_bb = SaliencyMapMethod(model_sub, sess=sess)
jsma_params_bb = dict(
    theta=1.,
    gamma=0.1,
    clip_min=0.,
    clip_max=1.,
    y_target=get_y_target(0),
)

# Adversarial-example tensor built on the input placeholder x
adv_x_jsma_bb = jsma_bb.generate(x, **jsma_params_bb)

In [None]:
# Slider over every test index. The trailing semicolon keeps the cell from
# echoing interact's return value (the plot function) below the widget.
interact(plot, idx=(0, len(X_test) - 1), adv_x=fixed(adv_x_jsma_bb));