In [None]:
import tensorflow as tf

from tensorflow.keras.datasets import mnist, cifar10, cifar100
from tensorflow.keras.datasets import fashion_mnist

from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Dense, Flatten, Activation

from sklearn.model_selection import train_test_split

import numpy as np
import random
from random import randrange

import matplotlib.pyplot as plt

In [None]:
#load the data (swap the commented line in to use Fashion-MNIST instead)
(x, y), (x_test, y_test) = mnist.load_data()
#(x, y), (x_test, y_test) = fashion_mnist.load_data()

# dataset description: number of classes, image geometry and human-readable class names
num_classes = 10
img_rows, img_cols, channels = 28, 28, 1
labels = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

#process the data: divide pixel values by 255 and add an explicit channel axis
x = (x / 255.0).reshape((-1, img_rows, img_cols, channels))
x_test = (x_test / 255.0).reshape((-1, img_rows, img_cols, channels))

# one-hot encode the integer class labels
y, y_test = (tf.keras.utils.to_categorical(raw_labels, num_classes) for raw_labels in (y, y_test))

#split training set, attack set: 10% of the loaded training data is held out as x_attack / y_attack
x_train, x_attack, y_train, y_attack = train_test_split(x, y, test_size=0.1, random_state=0)

print("Data shapes:", x_test.shape, y_test.shape, x_train.shape, y_train.shape, x_attack.shape, y_attack.shape)

In [None]:
#training model
def create_model():
    """Build and compile the victim classifier.

    The network is two strided 3x3 convolution + 2x2 max-pooling stages
    (32 then 64 filters), a flatten, a 200-unit dense layer without an
    activation, and a softmax layer with `num_classes` outputs.

    Returns:
        A compiled Keras `Sequential` model (Adam optimizer, MSE loss,
        accuracy metric).
    """
    # Settings shared by both convolution layers.
    conv_settings = dict(kernel_size=(3, 3), strides=(3, 3), padding='same', activation='relu')

    classifier = Sequential([
        Conv2D(32, input_shape=(img_rows, img_cols, channels), **conv_settings),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, **conv_settings),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(200),
        Dense(num_classes, activation='softmax'),
    ])
    classifier.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

    return classifier

# Train the victim classifier on the training split (validating on the test set),
# then save it to disk.
victim_model = create_model()
victim_model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)
victim_model.save('model/victim_model')
#victim_model.save('model/victim_model_fashion')


In [None]:
#testing model
# evaluate() returns [loss, accuracy] here because the model is compiled with
# metrics=['accuracy']; unpack it so the printed value really is the accuracy.
base_loss, base_accuracy = victim_model.evaluate(x=x_test, y=y_test, verbose=0)
print("Base accuracy on regular images:", base_accuracy)

In [None]:
#Create attacker model architecture
def create_attack_model(model_arch):
    """Build and compile a substitute (attacker) model of the requested depth.

    Supported architectures, from shallowest to deepest:

        1: Flatten -> softmax
        2: Flatten -> Dense(200) -> softmax
        3: Conv(32) -> Pool -> Flatten -> Dense(200) -> softmax
        4: Conv(32) -> Pool -> Conv(64) -> Pool -> Flatten -> Dense(200) -> softmax
        5: same as 4 with an extra Dense(100) before the softmax layer

    Args:
        model_arch: Architecture identifier, one of 1, 2, 3, 4 or 5.

    Returns:
        A compiled Keras `Sequential` model (Adam optimizer, MSE loss,
        accuracy metric).

    Raises:
        ValueError: If `model_arch` is not a supported identifier.
    """
    # Validate first: previously an unknown value fell through every branch and
    # crashed at compile time with an UnboundLocalError on `model`.
    if model_arch not in (1, 2, 3, 4, 5):
        raise ValueError("model_arch must be one of 1, 2, 3, 4, 5; got %r" % (model_arch,))

    model = Sequential()

    # First convolution stage (architectures 3, 4 and 5).
    if model_arch >= 3:
        model.add(Conv2D(32, kernel_size=(3, 3), strides=(3, 3), padding='same', activation='relu', input_shape=(img_rows, img_cols, channels)))
        model.add(MaxPooling2D(pool_size=(2, 2)))

    # Second convolution stage (architectures 4 and 5).
    if model_arch >= 4:
        model.add(Conv2D(64, kernel_size=(3, 3), strides=(3, 3), padding='same', activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())

    # Hidden dense layers: 200 units for architectures 2-5, plus 100 units for 5.
    if model_arch >= 2:
        model.add(Dense(200))
    if model_arch == 5:
        model.add(Dense(100))

    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

    return model

In [None]:
#creating adversarial example
def adversarial_pattern(image, label, model):
    """Return the sign of the loss gradient with respect to the input image.

    The image is cast to float32, passed through `model`, and the MSE between
    `label` and the prediction is differentiated with respect to the image.

    Args:
        image: Input passed directly to `model`; callers in this notebook
            supply a single-image batch.
        label: Target compared against the model's prediction.
        model: Callable model producing the prediction.

    Returns:
        A tensor holding the element-wise sign of the gradient, with the same
        shape as the cast input.
    """
    image_f32 = tf.cast(image, tf.float32)

    with tf.GradientTape() as grad_tape:
        # The input is a plain tensor, not a variable, so it must be watched explicitly.
        grad_tape.watch(image_f32)
        loss_value = tf.keras.losses.MSE(label, model(image_f32))

    return tf.sign(grad_tape.gradient(loss_value, image_f32))

In [None]:
#create the detector's test set: half benign test images, half adversarial versions of the other half

#split benign set, adv seed set
x_test_split, x_test_split2, y_test_split, y_test_split2 = train_test_split(x_test, y_test, test_size=0.5, random_state=0)

#load the victim model
victim_model = tf.keras.models.load_model('model/victim_model')

#label normal image (label = 0)
y_benign_test = np.zeros((x_test_split.shape[0], 1))

#label adv image (label = 1)
y_adv_test = np.ones((x_test_split2.shape[0], 1))

y_detector_test = np.concatenate((y_benign_test, y_adv_test), axis=0)

#generate adversarial data
eps=64/255.0
# Collect the perturbed images in a list and concatenate once at the end;
# calling np.append inside the loop re-copies the whole image array on every
# iteration, which is quadratic in the number of images.
adversarial_test_batches = []
for i in range(x_test_split2.shape[0]):
    image = x_test_split2[i]
    image_label = y_test_split2[i]
    perturbations = adversarial_pattern(image.reshape((1, img_rows, img_cols, channels)), image_label, victim_model).numpy()
    adversarial = image + perturbations * eps
    adversarial_test_batches.append(adversarial)

# Benign images first, adversarial images after, matching the order of y_detector_test.
x_detector_test = np.concatenate([x_test_split] + adversarial_test_batches, axis=0)

print(x_detector_test.shape, y_detector_test.shape)

In [None]:
#generate adv dataset (training data for the detector)
# budget scales the share of training images turned into adversarial examples (5% per unit)
budget = 1

#split benign set, adv seed set
x_benign, x_seed, y_benign, y_seed = train_test_split(x_train, y_train, test_size=0.05*budget, random_state=0)

#label normal image (label = 0); this replaces the class labels returned by the split
y_benign = np.zeros((x_benign.shape[0], 1))

#label adv image (label = 1)
y_adv = np.ones((x_seed.shape[0], 1))

y_detector_train = np.concatenate((y_benign, y_adv), axis=0)

#generate adversarial data
eps=64/255.0
# Collect the perturbed images in a list and concatenate once at the end;
# calling np.append inside the loop re-copies the whole image array on every
# iteration, which is quadratic in the number of images.
adversarial_train_batches = []
for i in range(x_seed.shape[0]):
    image = x_seed[i]
    image_label = y_seed[i]
    perturbations = adversarial_pattern(image.reshape((1, img_rows, img_cols, channels)), image_label, victim_model).numpy()
    adversarial = image + perturbations * eps
    adversarial_train_batches.append(adversarial)

# Benign images first, adversarial images after, matching the order of y_detector_train.
x_detector_train = np.concatenate([x_benign] + adversarial_train_batches, axis=0)

print(x_detector_train.shape, y_detector_train.shape, x_detector_test.shape, y_detector_test.shape)

In [None]:
#train detector

def create_detector_model():
    """Build and compile the binary detector network.

    The layer stack is two strided 3x3 convolution + 2x2 max-pooling stages
    (32 then 64 filters), a flatten and a 200-unit dense layer without an
    activation, followed by a single sigmoid output unit.

    Returns:
        A compiled Keras `Sequential` model (Adam optimizer, MSE loss,
        accuracy metric).
    """
    layer_stack = [
        Conv2D(32, kernel_size=(3, 3), strides=(3, 3), padding='same', activation='relu',
               input_shape=(img_rows, img_cols, channels)),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, kernel_size=(3, 3), strides=(3, 3), padding='same', activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(200),
        Dense(1, activation='sigmoid'),
    ]

    detector = Sequential()
    for layer in layer_stack:
        detector.add(layer)

    detector.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

    return detector

# Fit the detector on the benign/adversarial training set, validate on the
# detector test set, and save it under a path that encodes the budget.
detector_model = create_detector_model()
detector_model.fit(
    x_detector_train,
    y_detector_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_detector_test, y_detector_test),
)
detector_model.save('model/detector_model/b'+str(budget))

In [None]:
#apply detector
def get_prediction(image, detect):
    """Answer one query against the victim model, optionally guarded by the detector.

    Args:
        image: Input passed directly to the models' `predict`; callers in this
            notebook supply a single-image batch.
        detect: When truthy, the detector is consulted first; if its output
            for the image exceeds 0.5 a uniformly random class index is
            returned instead of the victim model's answer.

    Returns:
        An integer class index: a random one in `[0, num_classes)` when the
        detector flags the image, otherwise the argmax of the victim model's
        prediction.
    """
    # verbose=0 keeps the one-image-at-a-time queries from flooding the
    # notebook output with a progress line per call.
    if detect and (detector_model.predict(image, verbose=0)[0][0] > 0.5):
        # Use num_classes rather than a literal 10 so the random answer always
        # covers exactly the label range the models were built with.
        return randrange(num_classes)
    return victim_model.predict(image, verbose=0).argmax()


In [None]:
#Training substitute model
def _label_with_victim(images, detector):
    """Query get_prediction once per image and return the answers one-hot encoded."""
    queried_labels = [
        get_prediction(image.reshape(1, img_rows, img_cols, channels), detector)
        for image in images
    ]
    return tf.one_hot(queried_labels, num_classes)


def train_attack_model(iteration, init_num, detector=False, model_arch=4, eps=0.25):
    """Train a substitute (attack) model from labels obtained by querying get_prediction.

    `init_num` images are sampled without replacement from `x_attack`, labeled
    through `get_prediction`, and used to fit an initial substitute model.
    Each round then creates one perturbed copy of every image in the current
    set (doubling it), relabels the whole set through `get_prediction`, and
    refits the substitute. After the last round the model is saved to a path
    derived from the arguments.

    Args:
        iteration: Number of augmentation rounds; must be a whole number >= 1.
        init_num: Number of seed images sampled from `x_attack`.
        detector: Forwarded to `get_prediction` as its `detect` flag. When
            truthy, the save path also includes the module-level `budget`.
        model_arch: Architecture identifier passed to `create_attack_model`.
        eps: Step size applied to the signed-gradient perturbation.

    Raises:
        ValueError: If `iteration` is less than 1 or not a whole number. The
            previous `while True` loop never terminated for such values.
    """
    # Validate before any work is done: the old loop compared a counter that
    # starts at 1 with `iteration`, so 0, negatives and fractions looped forever.
    if iteration < 1 or iteration != int(iteration):
        raise ValueError("iteration must be a whole number >= 1; got %r" % (iteration,))

    print("Attack start!")

    #Collect initial data
    x_attack_init_ind = random.sample(range(0, x_attack.shape[0]), init_num)
    x_attack_set = x_attack[x_attack_init_ind]

    #Label initial data
    y_attack_set = _label_with_victim(x_attack_set, detector)

    #select attack model architecture
    attack_model = create_attack_model(model_arch)
    attack_model.fit(x_attack_set, y_attack_set, batch_size=32, epochs=10, validation_data = (x_test, y_test))
    print("Finish inital model training!")
    print("Attack model accuracy: %f" % (attack_model.evaluate(x_test, y_test)[1]))
    print("Training substitute model start!")

    #start training
    for counter in range(1, int(iteration) + 1):
        print("Iteration: %d" % (counter))
        print("creating synthetic example...")

        # Collect the perturbed images and concatenate once; np.append inside
        # the loop re-copies the whole dataset for every image.
        synthetic_batches = []
        for i in range(x_attack_set.shape[0]):
            image = x_attack_set[i]
            image_label = y_attack_set[i]
            perturbations = adversarial_pattern(image.reshape((1, img_rows, img_cols, channels)), image_label, attack_model).numpy()
            synthetic_batches.append(image + perturbations * eps)
        x_attack_set = np.concatenate([x_attack_set] + synthetic_batches, axis=0)

        #Label new dataset (the whole set is relabeled, not only the new images)
        print("Labeling new dataset...")
        y_attack_set = _label_with_victim(x_attack_set, detector)

        #train the attack model with new dataset
        attack_model.fit(x_attack_set, y_attack_set, batch_size=32, epochs=10, validation_data = (x_test, y_test))

        #evaluate the attack model
        print("Number of query: %d" % (x_attack_set.shape[0]))
        print("Attack model accuracy: %f" % (attack_model.evaluate(x_test, y_test)[1]))

    print("Training finish!")
    if detector:
        # Reads the module-level `budget` set when the detector's training data was generated.
        model_path = 'model/attack_model/d'+str(budget)+'i'+str(iteration)+'s'+str(init_num)
    elif model_arch == 4:
        model_path = 'model/attack_model/i'+str(iteration)+'s'+str(init_num)+'c'+str(round(eps, 2))
        #model_path = 'model/attack_model_fashion/i'+str(iteration)+'s'+str(init_num)
    else:
        model_path = 'model/attack_model/mismatch_layer'+str(model_arch)
        #model_path = 'model/attack_model_fashion/mismatch_layer'+str(model_arch)
    attack_model.save(model_path)

In [None]:
#start attack: train_attack_model(iteration, init_num, detector, model_arch, eps)
# Four augmentation rounds from 400 seed images; detector, model_arch and eps keep their defaults.
train_attack_model(iteration=4, init_num=400)