In [1]:
import tensorflow as tf

from tensorflow.keras.datasets import mnist, cifar10, cifar100
from tensorflow.keras.datasets import fashion_mnist

from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Dense, Flatten, Activation

from sklearn.model_selection import train_test_split

import numpy as np
import random

import matplotlib.pyplot as plt

import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

In [2]:
# Load the dataset; swap in the commented line below to use Fashion-MNIST instead.
(x, y), (x_test, y_test) = mnist.load_data()
#(x, y), (x_test, y_test) = fashion_mnist.load_data()

# Class names, image geometry and number of classes used by the rest of the notebook.
labels = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
img_rows, img_cols, channels = 28, 28, 1
num_classes = 10

# Divide pixel values by 255 and reshape to (N, rows, cols, channels) in one step per split.
x = (x / 255.0).reshape((-1, img_rows, img_cols, channels))
x_test = (x_test / 255.0).reshape((-1, img_rows, img_cols, channels))

# One-hot encode the class labels.
y = tf.keras.utils.to_categorical(y, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

# Hold out 10% of the training portion as the attacker's data pool (x_attack / y_attack).
x_train, x_attack, y_train, y_attack = train_test_split(x, y, test_size=0.1)

print(
    "Data shapes:",
    *(array.shape for array in (x_test, y_test, x_train, y_train, x_attack, y_attack))
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
Data shapes: (10000, 28, 28, 1) (10000, 10) (54000, 28, 28, 1) (54000, 10) (6000, 28, 28, 1) (6000, 10)


In [3]:
#training model
def create_model():
    """Build and compile the victim CNN.

    The network is two Conv2D(3x3, stride 3, 'same', relu) + MaxPooling2D(2x2)
    stages, followed by Flatten, a Dense(200) layer with no activation, and a
    softmax output over `num_classes`. It is compiled with the Adam optimizer,
    mean-squared-error loss and an accuracy metric.

    Returns:
        The compiled (untrained) Keras Sequential model.
    """
    input_shape = (img_rows, img_cols, channels)

    model = Sequential([
        Conv2D(32, kernel_size=(3, 3), strides=(3, 3), padding='same', activation='relu', input_shape=input_shape),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, kernel_size=(3, 3), strides=(3, 3), padding='same', activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(200),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

    return model

# Train the victim on the training split, validating against the test split,
# then persist it to disk.
victim_model = create_model()
victim_model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)
victim_model.save('model/victim_model')
#victim_model.save('model/victim_model_fashion')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: model/victim_model_fashion\assets


In [4]:
#testing model
# evaluate() returns the loss followed by the compiled metrics, so the value
# printed here is the pair [loss, accuracy], not the accuracy alone.
base_metrics = victim_model.evaluate(x=x_test, y=y_test, verbose=0)
print("Base accuracy on regular images:", base_metrics)

Base accuracy on regular images: [0.01766335405409336, 0.8788999915122986]


In [5]:
#prediction API
def get_prediction(image):
    """Query the victim model and return only its predicted class index.

    This is the sole interface the attacker uses to reach `victim_model`: it
    exposes the arg-max label and hides the per-class scores.

    Args:
        image: Input passed straight to `victim_model.predict`; callers in
            this notebook pass a single image reshaped to a batch of one.

    Returns:
        The index of the largest value in the prediction output. The arg-max
        is taken over the flattened output, so this is only a class index
        when `image` holds exactly one sample.
    """
    # verbose=0: this function is called once per image, thousands of times,
    # and the default verbosity would emit a progress bar for every call.
    return victim_model.predict(image, verbose=0).argmax()

In [6]:
#Create attacker model architecture
def create_attack_model(model_arch):
    """Build and compile a substitute (attacker) model of the requested depth.

    Supported architectures, from deepest to shallowest:
        5: Conv32 + Pool, Conv64 + Pool, Flatten, Dense(200), Dense(100), softmax
        4: Conv32 + Pool, Conv64 + Pool, Flatten, Dense(200), softmax
        3: Conv32 + Pool, Flatten, Dense(200), softmax
        2: Flatten, Dense(200), softmax
        1: Flatten, softmax

    Architecture 4 has the same layer stack as `create_model`.

    Args:
        model_arch: Architecture identifier, one of 1, 2, 3, 4 or 5.

    Returns:
        The compiled (untrained) Keras Sequential model, using the Adam
        optimizer, mean-squared-error loss and an accuracy metric.

    Raises:
        ValueError: If `model_arch` is not one of the supported identifiers.
    """
    # Validate first: previously an unknown value fell through every branch and
    # crashed at compile time with an unbound `model` variable.
    if model_arch not in (1, 2, 3, 4, 5):
        raise ValueError("model_arch must be one of 1, 2, 3, 4, 5; got %r" % (model_arch,))

    model = Sequential()

    # First convolutional stage (architectures 3, 4 and 5).
    if model_arch >= 3:
        model.add(Conv2D(32, kernel_size=(3, 3), strides=(3, 3), padding='same', activation='relu', input_shape=(img_rows, img_cols, channels)))
        model.add(MaxPooling2D(pool_size=(2, 2)))

    # Second convolutional stage (architectures 4 and 5).
    if model_arch >= 4:
        model.add(Conv2D(64, kernel_size=(3, 3), strides=(3, 3), padding='same', activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())

    # Hidden dense layers: Dense(200) for every architecture except 1,
    # plus an extra Dense(100) for architecture 5 only.
    if model_arch >= 2:
        model.add(Dense(200))
    if model_arch == 5:
        model.add(Dense(100))

    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

    return model

In [7]:
#creating adversarial example
def adversarial_pattern(image, label, model):
    """Return the sign of the loss gradient with respect to the input image.

    The loss is the mean squared error between `label` and `model(image)`.

    Args:
        image: Model input; it is cast to float32 before the forward pass.
        label: Target compared against the model's prediction.
        model: Callable model used for the forward pass.

    Returns:
        A tensor holding the element-wise sign of d(loss)/d(image).
    """
    image_tensor = tf.cast(image, tf.float32)

    with tf.GradientTape() as grad_tape:
        # The input is explicitly watched so the tape records operations on it.
        grad_tape.watch(image_tensor)
        loss = tf.keras.losses.MSE(label, model(image_tensor))

    return tf.sign(grad_tape.gradient(loss, image_tensor))

In [8]:
#Training substitute model
def _query_victim_labels(images):
    """Return the victim's predicted class index for every image in `images`.

    Each image is reshaped to a batch of one and sent through `get_prediction`
    on its own, so labelling N images performs N calls to the prediction API.
    """
    return [
        get_prediction(image.reshape(1, img_rows, img_cols, channels))
        for image in images
    ]


def train_attack_model(iteration, init_num, model_arch=4, eps=64/255.0):
    """Train a substitute model using only the victim's predicted labels.

    Steps:
      1. Draw `init_num` random images from `x_attack` and label them with the
         victim via `get_prediction`.
      2. Train a fresh attack model (see `create_attack_model`) on that set.
      3. For `iteration` rounds: create one synthetic image per existing image
         by adding `eps` times the signed loss gradient of the attack model
         (see `adversarial_pattern`), label only the new images with the
         victim, append them to the dataset (doubling it) and retrain.
      4. Save the attack model under 'model/attack_model/'.

    Args:
        iteration: Number of augmentation rounds; must be an integer >= 1.
        init_num: Number of initial images sampled from `x_attack`; must be
            >= 1 (and no larger than the pool, or `random.sample` raises).
        model_arch: Architecture id forwarded to `create_attack_model`.
        eps: Step size multiplied with the signed gradient.

    Raises:
        ValueError: If `iteration` or `init_num` is smaller than 1.
    """
    # Reject bad arguments up front. Previously iteration <= 0 never matched
    # the stop condition and looped forever on an ever-doubling dataset.
    if iteration < 1:
        raise ValueError("iteration must be >= 1, got %r" % (iteration,))
    if init_num < 1:
        raise ValueError("init_num must be >= 1, got %r" % (init_num,))

    print("Attack start!")

    #Collect initial data
    x_attack_init_ind = random.sample(range(0, x_attack.shape[0]), init_num)
    x_attack_set = x_attack[x_attack_init_ind]

    #Label initial data (class indices are kept so later rounds only query new images)
    class_ids = _query_victim_labels(x_attack_set)
    y_attack_set = tf.one_hot(class_ids, num_classes)

    #select attack model architecture
    attack_model = create_attack_model(model_arch)
    attack_model.fit(x_attack_set, y_attack_set, batch_size=32, epochs=10, validation_data = (x_test, y_test))
    print("Finish inital model training!")
    print("Attack model accuracy: %f" % (attack_model.evaluate(x_test, y_test)[1]))
    print("Training substitute model start!")

    #start training
    for counter in range(1, iteration + 1):
        print("Iteration: %d" % (counter))
        print("creating synthetic example...")

        # Collect the synthetic images in a list and concatenate once, instead
        # of re-copying the whole dataset with np.append for every image.
        synthetic = []
        for i in range(x_attack_set.shape[0]):
            image = x_attack_set[i]
            image_label = y_attack_set[i]
            perturbations = adversarial_pattern(image.reshape((1, img_rows, img_cols, channels)), image_label, attack_model).numpy()
            # image broadcasts against the (1, rows, cols, channels) perturbation.
            synthetic.append(image + perturbations * eps)
        x_new = np.concatenate(synthetic, axis=0)

        #Label new dataset
        print("Labeling new dataset...")
        # Only the new images are sent to the victim; labels from earlier
        # rounds are reused, so the dataset size equals the queries made.
        class_ids.extend(_query_victim_labels(x_new))
        x_attack_set = np.concatenate([x_attack_set, x_new], axis=0)
        y_attack_set = tf.one_hot(class_ids, num_classes)

        #train the attack model with new dataset
        attack_model.fit(x_attack_set, y_attack_set, batch_size=32, epochs=10, validation_data = (x_test, y_test))

        #evaluate the attack model
        print("Number of query: %d" % (x_attack_set.shape[0]))
        print("Attack model accuracy: %f" % (attack_model.evaluate(x_test, y_test)[1]))

    print("Training finish!")
    if model_arch == 4:
        model_path = 'model/attack_model/i'+str(iteration)+'s'+str(init_num)
        #model_path = 'model/attack_model_fashion/i'+str(iteration)+'s'+str(init_num)
    else:
        model_path = 'model/attack_model/mismatch_layer'+str(model_arch)
        #model_path = 'model/attack_model_fashion/mismatch_layer'+str(model_arch)
    attack_model.save(model_path)

In [10]:
#start attack: train_attack_model(iteration, init_num, model_arch, eps)
# Three augmentation rounds from 200 initial samples, with the default
# architecture and step size.
train_attack_model(iteration=3, init_num=200)

Attack start!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Finish inital model training!
Attack model accuracy: 0.588500
Training substitute model start!
Iteration: 1
creating synthetic example...
Labeling new dataset...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Number of query: 400
Attack model accuracy: 0.692500
Iteration: 2
creating synthetic example...
Labeling new dataset...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Number of query: 800
Attack model accuracy: 0.690400
Iteration: 3
creating synthetic example...
Labeling new dataset...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Number of query: 1600
Attack model accuracy: 0.705200
Training finish!
INFO:tensorflow:Assets written to: model/attack_model_fashion/i3s200\