In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.special import softmax
from easydict import EasyDict
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D

In [2]:
from cleverhans.tf2.attacks.projected_gradient_descent import projected_gradient_descent
from cleverhans.tf2.attacks.fast_gradient_method import fast_gradient_method
from cleverhans.tf2.utils import optimize_linear, compute_gradient

In [3]:
class Net(Model):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = Conv2D(64, 8, strides=(2, 2), activation="relu", padding="same")
        self.conv2 = Conv2D(128, 6, strides=(2, 2), activation="relu", padding="valid")
        self.conv3 = Conv2D(128, 5, strides=(1, 1), activation="relu", padding="valid")
        self.dropout = Dropout(0.25)
        self.flatten = Flatten()
        self.dense1 = Dense(128, activation="relu")
        self.dense2 = Dense(10)

    def call(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.dropout(x)
        x = self.flatten(x)
        x = self.dense1(x)
        return self.dense2(x)

In [4]:
def ld_mnist():
    """Load training and test data."""

    def convert_types(image, label):
        image = tf.cast(image, tf.float32)
        image /= 255
        return image, label

    dataset, info = tfds.load(
        "mnist", data_dir="gs://tfds-data/datasets", with_info=True, as_supervised=True
    )
    mnist_train, mnist_test = dataset["train"], dataset["test"]
    mnist_train = mnist_train.map(convert_types).shuffle(10000).batch(128)
    mnist_test = mnist_test.map(convert_types).batch(128)
    return EasyDict(train=mnist_train, test=mnist_test)

In [5]:
data = ld_mnist()
model = Net()
loss_object = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.optimizers.Adam(learning_rate=0.001)

In [None]:
#two sets of loss and performance metrics for the two models 

In [38]:
train_loss1 = tf.metrics.Mean(name="train_loss")
test_acc_clean1 = tf.metrics.SparseCategoricalAccuracy()
test_acc_fgsm1 = tf.metrics.SparseCategoricalAccuracy()
test_acc_pgd1 = tf.metrics.SparseCategoricalAccuracy()
train_loss2 = tf.metrics.Mean(name="train_loss")
test_acc_clean2 = tf.metrics.SparseCategoricalAccuracy()
test_acc_fgsm2 = tf.metrics.SparseCategoricalAccuracy()
test_acc_pgd2 = tf.metrics.SparseCategoricalAccuracy()

In [39]:
def train_step(x, y):
      with tf.GradientTape() as tape:
           predictions = model(x)
           loss = loss_object(y, predictions)
      gradients = tape.gradient(loss, model.trainable_variables)
      optimizer.apply_gradients(zip(gradients, model.trainable_variables))
      train_loss1(loss)

In [None]:
#Define a second model with identical architecture and its training function

In [8]:
model2 = Net()

In [40]:
def train2_step(x, y):
      with tf.GradientTape() as tape:
           predictions = model2(x)
           loss = loss_object(y, predictions)
      gradients = tape.gradient(loss, model2.trainable_variables)
      optimizer.apply_gradients(zip(gradients, model2.trainable_variables))
      train_loss2(loss)

In [23]:
nb_epochs = 6
eps = 0.2
adv_train = False
loss_fn = tf.nn.sparse_softmax_cross_entropy_with_logits
targeted=False
norm = np.inf

In [None]:
#training of first model 

In [24]:
for epoch in range(nb_epochs):
        # keras like display of progress
        progress_bar_train = tf.keras.utils.Progbar(60000)
        for (x, y) in data.train:            
            train_step(x, y)
            progress_bar_train.add(x.shape[0], values=[("loss", train_loss.result())])



In [None]:
# check metrics for first model

In [41]:
progress_bar_test = tf.keras.utils.Progbar(10000)
for x, y in data.test:
    y_pred = model(x)
    test_acc_clean1(y, y_pred)

    x_fgm = fast_gradient_method(model, x, eps, np.inf)
    y_pred_fgm = model(x_fgm)
    test_acc_fgsm1(y, y_pred_fgm)

    #x_pgd = projected_gradient_descent(model, x,eps, 0.01, 40, np.inf)
    #y_pred_pgd = model(x_pgd)
    #test_acc_pgd(y, y_pred_pgd)

    progress_bar_test.add(x.shape[0])

print(
    "test acc on clean examples (%): {:.3f}".format(test_acc_clean1.result() * 100)
)
print(
    "test acc on FGM adversarial examples (%): {:.3f}".format(
        test_acc_fgsm1.result() * 100
    )
)


test acc on clean examples (%): 98.860
test acc on FGM adversarial examples (%): 19.020


In [None]:
# check metrics for second model 

In [42]:
progress_bar_test = tf.keras.utils.Progbar(10000)
for x, y in data.test:
    y_pred2 = model2(x)
    test_acc_clean2(y, y_pred2)

    x_fgm = fast_gradient_method(model2, x, eps, np.inf)
    y_pred_fgm2 = model2(x_fgm)
    test_acc_fgsm2(y, y_pred_fgm2)

    #x_pgd = projected_gradient_descent(model, x,eps, 0.01, 40, np.inf)
    #y_pred_pgd = model(x_pgd)
    #test_acc_pgd(y, y_pred_pgd)

    progress_bar_test.add(x.shape[0])

print(
    "test acc on clean examples (%): {:.3f}".format(test_acc_clean2.result() * 100)
)
print(
    "test acc on FGM adversarial examples (%): {:.3f}".format(
        test_acc_fgsm2.result() * 100
    )
)


test acc on clean examples (%): 8.830
test acc on FGM adversarial examples (%): 10.190


In [62]:
adv_train = True
nb_epochs = 2
eps = 0.2

In [None]:
# counter training
#train the second model(=model2) with adversarial data from the first model (=model)

In [67]:
for epoch in range(nb_epochs):
        # keras like display of progress
        progress_bar_train = tf.keras.utils.Progbar(60000)
        for (x2, y2) in data.train:
            if adv_train:
                # Replace clean example with adversarial example for adversarial training
                 x2 = fast_gradient_method(model, x2, eps, np.inf)
                 y_adv = np.argmax(model(x2),axis=1)
                 #print(y_adv.shape)
                 #x = projected_gradient_descent(model, x,eps, 0.01, 40, np.inf)
            train2_step(x2, y_adv)
            progress_bar_train.add(x2.shape[0], values=[("loss", train_loss2.result())])



In [None]:
#check on the first model again
# nothing should have changed after counter training

In [57]:
progress_bar_test = tf.keras.utils.Progbar(10000)
for x, y in data.test:
    y_pred = model(x)
    test_acc_clean1(y, y_pred)

    x_fgm = fast_gradient_method(model, x, eps, np.inf)
    y_pred_fgm = model(x_fgm)
    test_acc_fgsm1(y, y_pred_fgm)

    #x_pgd = projected_gradient_descent(model, x,eps, 0.01, 40, np.inf)
    #y_pred_pgd = model(x_pgd)
    #test_acc_pgd(y, y_pred_pgd)

    progress_bar_test.add(x.shape[0])

print(
    "test acc on clean examples (%): {:.3f}".format(test_acc_clean1.result() * 100)
)
print(
    "test acc on FGM adversarial examples (%): {:.3f}".format(
        test_acc_fgsm1.result() * 100
    )
)


test acc on clean examples (%): 98.860
test acc on FGM adversarial examples (%): 19.020


In [None]:
# first model after six epochs of training
# test acc on clean examples (%): 98.860
# test acc on FGM adversarial examples (%): 19.020

In [None]:
#now check on model 2

In [68]:
progress_bar_test = tf.keras.utils.Progbar(10000)
for x, y in data.test:
    y_pred = model2(x)
    test_acc_clean2(y, y_pred)

    x_fgm = fast_gradient_method(model2, x, eps, np.inf)
    y_pred_fgm = model2(x_fgm)
    test_acc_fgsm2(y, y_pred_fgm)

    #x_pgd = projected_gradient_descent(model, x,eps, 0.01, 40, np.inf)
    #y_pred_pgd = model(x_pgd)
    #test_acc_pgd(y, y_pred_pgd)

    progress_bar_test.add(x.shape[0])

print(
    "test acc on clean examples (%): {:.3f}".format(test_acc_clean2.result() * 100)
)
print(
    "test acc on FGM adversarial examples (%): {:.3f}".format(
        test_acc_fgsm2.result() * 100
    )
)


test acc on clean examples (%): 83.396
test acc on FGM adversarial examples (%): 7.331


In [None]:
#after one round of counter training
#test acc on clean examples (%): 51.555
#test acc on FGM adversarial examples (%): 6.970

#after three rounds of counter training
#test acc on clean examples (%): 66.373
#test acc on FGM adversarial examples (%): 5.690

# after five rounds of counter training 
#test acc on clean examples (%): 73.842
#test acc on FGM adversarial examples (%): 5.340

# after six rounds of counter training
# test acc on clean examples (%): 78.312
# test acc on FGM adversarial examples (%): 5.494

# after eight rounds of counter training
# test acc on clean examples (%): 81.265
# test acc on FGM adversarial examples (%): 6.242

# after ten rounds of counter training
# test acc on clean examples (%): 83.396
# test acc on FGM adversarial examples (%): 7.331