In [1]:
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="1"

import tensorflow as tf
import keras
from keras import backend as K
from keras.models import Model
from keras.utils import to_categorical
from keras.layers import Input, Dense, Lambda, merge, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.models import Sequential, model_from_json, load_model
from keras.layers.core import Activation
from keras import datasets

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from numpy import clip
import pickle

import pandas as pd

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# Utility Methods
def show(img):
    if len(img.shape) == 3:
        img = img.reshape(img.shape[0],img.shape[1])
    plt.tick_params(
        which='both',      
        left = False,
        right = False,
        bottom=False,      
        top=False,         
        labelleft=False,
        labelbottom=False) 
    plt.imshow(img,cmap='gray')
    
def Union(lst1, lst2): 
    final_list = list(set(lst1) | set(lst2)) 
    return final_list

# Configuration

In [3]:
# Datasets on which we conduct our evaluation
datasets = ['mnist', 'fmnist', 'cifar10']

# The explanation methods that will be used in the ensemble
exp_methods = ['lrp', 'guided_backprop', 'integrated_grad','pattern_attribution', 'grad_times_input']

# The directory of the detector models
# Format: 'data/defender/<dataset>/orig/train/<explation_method>/<target_class_index>/model/'
model_dir_structure = 'data/defender/{}/orig/train/{}/{}/model/'
model_name = 'exp_model.h5'

# The directory where we store the adversarial examples created using whitebox attack
# Format 'data/adv2/adversary/<dataset>/<target_exp_method>/<attack_method_to_test>/target_next/target_<class_idx>'
adv2_dir_structure = 'data/adv2/adversary/{}/{}/{}/target_next/target_{}/'
succ_examples_filename = 'succ_on_f.npy'

# The directory where we store the abnormal explanations created from adversarial examples which were inturn created using whitebox attack
adv2_exp_dir_structure = 'data/adversary/{}/adv2/{}/{}/{}/from_{}/'
explanations_filename = 'expls.npy'

src_exp_methods = exp_methods[:]
src_exp_methods.append('overall')

enable_detailed_logs = True


# Load the ExAD-CNN detector models

As discussed in Section 4.3.1 of our paper,  for every class in a dataset, we train 5 detector models- one corresponding to every explanation technique.

In [None]:
# Dictionary of models. 
# This avoids having to reload models multiple times, and therefore speeds up the analysis.
print('\n\tLoading detector models')
model_d = {}
for dataset in datasets:
    for class_idx in range(10):
        for exp_method in exp_methods:
            if enable_detailed_logs: print('Loading for dataset:{} target class:{} detector model for explanation technique:{}'.format(dataset,class_idx,exp_method))
            model_dir = model_dir_structure.format(dataset, exp_method, str(class_idx))
            model = load_model(model_dir + model_name)
            model_d[(dataset, class_idx, exp_method)] = model

In [4]:
df_list = list()

for dataset in datasets:
    print('Running whitebox attack evaluation on {} dataset'.format(dataset))
    
    # set the image dimension (side=H=W) based on the dataset
    if dataset in ['mnist', 'fmnist']:
        side = 28
    elif dataset in ['cifar10']:
        side = 32

    d = collections.defaultdict(list)
    total_adv_per_class_per_attack = 10
    
    table = []
    for target_exp_method in exp_methods: 
        print('\n\tTargeting {} explanation technique'.format(target_exp_method))
        row = []
        for class_idx in range(10):
            if class_idx == 0:
                src_class_index = 9
            else:
                src_class_index = class_idx-1
            
            if target_exp_method == 'integrated_grad':
                attack_methods_to_test = ['cwl2/conf_0']
            else:
                attack_methods_to_test = ['cwl2/conf_0', 'cwlinf/conf_0', 'cwl0/conf_0', 'bim', 'mim', 'jsma']

            for attack_method_to_test in attack_methods_to_test:
                # The directory of adv2 examples which fool both f(.) and the target_exp_method
                # we do this to ensure we compute the performance on successful adv2 examples only
                adv2_dir = adv2_dir_structure.format(dataset, target_exp_method, attack_method_to_test, str(class_idx))
                succ_on_f = np.load(adv2_dir + succ_examples_filename)
                retained_adv_len = len(np.where(succ_on_f == True)[0])

                # if say 8/10 adversarial examples were successful, then failed will have indices from 0 to 7
                failed = np.array([i for i in range(retained_adv_len)])

                for exp_method in exp_methods:
                    # Use the detector model for class_idx (target class) corresponding to exp_method (explanation technique)
                    model = model_d[(dataset, class_idx, exp_method)]
                    
                    adv2_exp_dir = adv2_exp_dir_structure.format(dataset, target_exp_method, attack_method_to_test, exp_method, str(src_class_index) )
                    adv2_exp = np.load(adv2_exp_dir + explanations_filename)
                    
                    # NOTE we retain successful adv2 examples
                    adv2_exp = adv2_exp[succ_on_f]
                    
                    # process images for classification
                    adv2_exp *= 255.0/np.max(adv2_exp)
                    adv2_exp = adv2_exp.astype(int)
                    adv2_exp = adv2_exp.reshape(-1,side,side,1)

                    #evaluate model on adv2_exp samples
                    result_test = model.predict(adv2_exp)

                    result_test_class = np.argmax(result_test, axis=1)
                    true_pos = len(np.where(result_test_class==1)[0])
                    total_pos = len(result_test_class)
                    det_rate = true_pos*100/total_pos

                    failed_cur_method = np.where(result_test_class==0)[0]
                    failed = np.intersect1d(failed, failed_cur_method)

                    d[(target_exp_method, exp_method)].append(det_rate)
                    d[(target_exp_method, attack_method_to_test, exp_method)].append(det_rate)

                true_pos_cumulative = total_adv_per_class_per_attack - len(failed) 
                det_rate_cumulative = true_pos_cumulative * 100 / total_adv_per_class_per_attack
                d[(target_exp_method, 'overall')].append(det_rate_cumulative)
                d[(target_exp_method, attack_method_to_test, 'overall')].append(det_rate_cumulative)

        
        for src_exp_method in src_exp_methods: 
            l = np.array(d[(target_exp_method, src_exp_method)])
            print('target_exp:{} src_exp:{} detection:{:.2f}'.format(target_exp_method, src_exp_method, np.mean(l)))
            mean_detection_rate = np.mean(l)
            row.append(mean_detection_rate)
        
        table.append(row)
    
    arr = np.array(table)
    df = pd.DataFrame(arr, index=exp_methods, columns=src_exp_methods)
    df_list.append(df)
    
    print('\n\bThis is the whitebox results for {}. These results should be nearly consistent with Figure 8 of the paper in Appendix.'.format(dataset))
    print('First (index) column shows the targeted explanaton technique. Columns 1-5 shows detection results by detector models corresponding to the explanation technique. Rightmost column shows overall detection results by ExAD.')
    print(df)
    


Running whitebox attack evaluation on mnist dataset

	Loading detector models


NameError: name 'collections' is not defined

In [None]:
for dataset_idx in len(df_list):
    print('\n\bThis is the whitebox results for {}. These results should be nearly consistent with Figure 8 of the paper in Appendix.'.format(datasets[dataset_idx]))
    print('First (index) column shows the targeted explanaton technique. Columns 1-5 shows detection results by detector models corresponding to the explanation technique. Rightmost column shows overall detection results by ExAD.')
    print(df_list[dataset_idx])
    
    
    