In [None]:
import numpy as np
##import tensorflow as tf
import tensorflow.compat.v1 as tf 
tf.compat.v1.disable_eager_execution()
import sys, os
sys.path.append("../")
import copy
import pandas as pd
from tensorflow.python.platform import flags
from scipy.optimize import basinhopping
import time
from adf_data.census import census_data
from adf_data.credit import credit_data
from adf_data.bank import bank_data
from adf_model.tutorial_models import dnn
from adf_utils.utils_tf import model_prediction, model_argmax
from adf_utils.config import census, credit, bank
from adf_tutorial.utils import cluster, gradient_graph

# Handle to the command-line flag values; the flags themselves are defined
# in the ``__main__`` section of this cell.
FLAGS = flags.FLAGS
# Step size applied to a feature by both the global and the local
# perturbation steps.
perturbation_size = 1

def check_for_error_condition(conf, sess, x, preds, t, sens1, sens2, sens3):
    """
    Score a test case by how far apart its sensitive-attribute variants land.

    All combinations of the three sensitive features are generated for ``t``,
    the model's class-1 probability is computed for each of them and the
    variants are bucketed by probability. As a side effect, when ``t`` (with
    its sensitive features zeroed) has not been seen before and at least one
    variant receives the opposite label, the module-level counter
    ``tot_labels`` is incremented and the zeroed sample is recorded in
    ``loc_samples``.

    :param conf: the configuration of dataset (currently unused here)
    :param sess: TF session
    :param x: input placeholder
    :param preds: the model's symbolic output
    :param t: test case (1-D array of feature values)
    :param sens1: 1-based index of the first sensitive feature
    :param sens2: 1-based index of the second sensitive feature
    :param sens3: 1-based index of the third sensitive feature
    :return: difference between the largest and the smallest bucket key
    """
    global tot_labels
    global loc_samples

    t = [t.astype('int')]

    samples = m_instance(np.array(t), sens1, sens2, sens3)
    pred = pred_prob(sess, x, preds, samples)
    partition = clustering(pred, samples, sens1, sens2, sens3)

    # Copy the array itself: a shallow ``list.copy()`` would alias ``t[0]``
    # and zero the sensitive features of the instance that is about to be
    # classified below.
    dis_sample = t[0].copy()
    dis_sample[[sens1 - 1, sens2 - 1, sens3 - 1]] = 0
    sample_key = tuple(dis_sample)

    if sample_key not in loc_samples:
        label = model_argmax(sess, x, preds, np.array(t))
        labels = pred_prob_train(sess, x, preds, samples)

        # Binary labels: ``1 - label`` is the opposite class.
        if (1 - label) in labels:
            tot_labels += 1
            loc_samples.add(sample_key)

    # The 'Seed' entry is bookkeeping, not a probability bucket.
    bucket_keys = [key for key in partition if key != 'Seed']
    return max(bucket_keys) - min(bucket_keys)
    
def seed_test_input(clusters, limit):
    """
    Pick seed inputs by visiting the clusters round-robin.

    The first member of every cluster is taken, then the second member of
    every cluster that has one, and so on, until ``limit`` seeds have been
    collected or every cluster is exhausted.

    :param clusters: the results of K-means clustering; each entry holds its
            member sequence at position 0
    :param limit: the size of seed inputs wanted
    :return: a sequence of seed inputs
    """
    longest = max([len(group[0]) for group in clusters])
    picked = []
    for depth in range(longest):
        if len(picked) == limit:
            break
        for group in clusters:
            members = group[0]
            if depth < len(members):
                picked.append(members[depth])
                if len(picked) == limit:
                    break
    return np.array(picked)

def clip(input, conf):
    """
    Force every feature of an instance into its allowed range, in place.

    :param input: generating instance (modified in place)
    :param conf: the configuration of dataset; ``conf.input_bounds[i]`` holds
            the (lower, upper) bounds of feature ``i``
    :return: the same ``input`` object, now a valid generating instance
    """
    for pos, _ in enumerate(input):
        lower = conf.input_bounds[pos][0]
        upper = conf.input_bounds[pos][1]
        # Raise to the lower bound first, then cap at the upper bound.
        input[pos] = max(input[pos], lower)
        input[pos] = min(input[pos], upper)
    return input

class Local_Perturbation(object):
    """
    The implementation of local perturbation for three sensitive features.

    Instances are callables intended to be used as the ``take_step`` argument
    of ``scipy.optimize.basinhopping``.
    """

    def __init__(self, sess, grad, x, n_value1, n_value2, n_value3, sens1, sens2, sens3, input_shape, conf):
        """
        Initial function of local perturbation
        :param sess: TF session
        :param grad: the gradient graph
        :param x: input placeholder
        :param n_value1: value of the first sensitive feature in the paired instance
        :param n_value2: value of the second sensitive feature in the paired instance
        :param n_value3: value of the third sensitive feature in the paired instance
        :param sens1: 1-based index of the first sensitive feature
        :param sens2: 1-based index of the second sensitive feature
        :param sens3: 1-based index of the third sensitive feature
        :param input_shape: the number of features of one instance
        :param conf: the configuration of dataset
        """
        self.sess = sess
        self.grad = grad
        self.x = x
        self.n_value1 = n_value1
        self.n_value2 = n_value2
        self.n_value3 = n_value3
        self.input_shape = input_shape
        self.sens1 = sens1
        self.sens2 = sens2
        self.sens3 = sens3
        self.conf = conf

    def __call__(self, x):
        """
        Local perturbation
        :param x: input instance for local perturbation
        :return: new potential individual discriminatory instance
        """

        # direction of the perturbation
        s = np.random.choice([1.0, -1.0]) * perturbation_size

        # the paired instance differs from x only in the sensitive features
        n_x = x.copy()
        n_x[self.sens1 - 1] = self.n_value1
        n_x[self.sens2 - 1] = self.n_value2
        n_x[self.sens3 - 1] = self.n_value3

        # compute the gradients of an individual discriminatory instance pairs
        ind_grad = self.sess.run(self.grad, feed_dict={self.x: np.array([x])})
        n_ind_grad = self.sess.run(self.grad, feed_dict={self.x: np.array([n_x])})

        impact = np.abs(ind_grad[0]) + np.abs(n_ind_grad[0])

        # sensitive features must never be perturbed
        candidates = np.ones(self.input_shape, dtype=bool)
        candidates[[self.sens1 - 1, self.sens2 - 1, self.sens3 - 1]] = False

        zero_impact = candidates & (impact == 0)
        if zero_impact.any():
            # A zero gradient would give an infinite reciprocal (and NaN
            # probabilities after normalisation). Such features are the least
            # impactful ones, so choose uniformly among them. This also covers
            # the case where both gradients vanish entirely.
            weights = zero_impact.astype(float)
        else:
            # normalize the reciprocal of gradients (prefer the low impactful feature)
            weights = np.zeros(self.input_shape)
            weights[candidates] = 1.0 / impact[candidates]
        probs = weights / weights.sum()

        # randomly choose the feature for local perturbation
        index = np.random.choice(range(self.input_shape), p=probs)
        local_cal_grad = np.zeros(self.input_shape)
        local_cal_grad[index] = 1.0

        x = clip(x + s * local_cal_grad, self.conf).astype("int")

        return x
    
    
    

    
#--------------------------------------
def m_instance(sample, sensitive_param1, sensitive_param2, sensitive_param3, conf=None):
    """
    Enumerate every combination of the three sensitive features for a sample.

    :param sample: array of shape (1, n_features); it is not modified
    :param sensitive_param1: 1-based index of the first sensitive feature
    :param sensitive_param2: 1-based index of the second sensitive feature
    :param sensitive_param3: 1-based index of the third sensitive feature
    :param conf: dataset configuration providing ``input_bounds``; defaults to
            the census configuration, which is what this function always used
    :return: array stacking one copy of ``sample`` per value combination, with
            the first sensitive feature varying slowest and the third fastest
    """
    if conf is None:
        conf = census
    bounds = conf.input_bounds

    idx1 = sensitive_param1 - 1
    idx2 = sensitive_param2 - 1
    idx3 = sensitive_param3 - 1

    m_sample = []
    # Bounds are inclusive on both ends, hence the "+ 1".
    for v1 in range(bounds[idx1][0], bounds[idx1][1] + 1):
        for v2 in range(bounds[idx2][0], bounds[idx2][1] + 1):
            for v3 in range(bounds[idx3][0], bounds[idx3][1] + 1):
                variant = sample.copy()
                variant[0][idx1] = v1
                variant[0][idx2] = v2
                variant[0][idx3] = v3
                m_sample.append(variant)
    return np.array(m_sample)

def global_sample_select(clus_dic, sens1, sens2, sens3):
    """
    Draw a sample and a pairing from the most populated cluster.

    :param clus_dic: mapping produced by ``clustering``: the ``'Seed'`` entry
            holds the seed instance, every other key maps to a list of
            ``[sens3 value, sens2 value, sens1 value]`` triples
    :param sens1: 1-based index of the first sensitive feature
    :param sens2: 1-based index of the second sensitive feature
    :param sens3: 1-based index of the third sensitive feature
    :return: ``(sample, n_values)`` where ``sample`` is a (1, n_features) array
            built from the seed with the sensitive features of one randomly
            chosen member, and ``n_values`` is another randomly chosen member
            triple of the same cluster
    :raises ValueError: if ``clus_dic`` contains no cluster besides ``'Seed'``
    """
    cluster_keys = [key for key in clus_dic if key != 'Seed']
    if not cluster_keys:
        raise ValueError("clus_dic contains no clusters to select from")

    # ``max`` keeps the first key on ties, like a strict ">" scan would.
    largest = max(cluster_keys, key=lambda key: len(clus_dic[key]))
    members = clus_dic[largest]

    sample_ind = np.random.randint(len(members))
    n_sample_ind = np.random.randint(len(members))

    # Work on a copy so the caller's 'Seed' entry is left untouched.
    sample = np.array(clus_dic['Seed'])
    sample[sens3 - 1] = members[sample_ind][0]
    sample[sens2 - 1] = members[sample_ind][1]
    sample[sens1 - 1] = members[sample_ind][2]

    # returns one sample of largest partition and its pair
    return np.array([sample]), members[n_sample_ind]

def local_sample_select(clus_dic, sens1, sens2, sens3):
    """
    Draw a sample from the lowest cluster and a pairing from the highest one.

    :param clus_dic: mapping produced by ``clustering``: the ``'Seed'`` entry
            holds the seed instance, every other key maps to a list of
            ``[sens3 value, sens2 value, sens1 value]`` triples
    :param sens1: 1-based index of the first sensitive feature
    :param sens2: 1-based index of the second sensitive feature
    :param sens3: 1-based index of the third sensitive feature
    :return: ``(sample, n_values)`` where ``sample`` is a (1, n_features) array
            built from the seed with the sensitive features of a random member
            of the smallest-key cluster, and ``n_values`` is a random member
            triple of the largest-key cluster
    """
    # Select clusters by key rather than by position, so the result does not
    # depend on where 'Seed' sits in the dictionary.
    cluster_keys = [key for key in clus_dic if key != 'Seed']
    k_1 = min(cluster_keys)
    k_2 = max(cluster_keys)

    sample_ind = np.random.randint(len(clus_dic[k_1]))
    n_sample_ind = np.random.randint(len(clus_dic[k_2]))

    # Work on a copy so the caller's 'Seed' entry is left untouched.
    sample = np.array(clus_dic['Seed'])
    sample[sens3 - 1] = clus_dic[k_1][sample_ind][0]
    sample[sens2 - 1] = clus_dic[k_1][sample_ind][1]
    sample[sens1 - 1] = clus_dic[k_1][sample_ind][2]

    return np.array([sample]), clus_dic[k_2][n_sample_ind]
    
def clustering(probs,m_sample, sens1, sens2, sens3):
    """
    Bucket sensitive-attribute variants by their predicted probability.

    Each probability is mapped to the integer bucket ``int(p / 0.025)``; a
    probability of exactly 1.0 is folded into the bucket just below so that it
    does not open a bucket of its own.

    :param probs: one probability per entry of ``m_sample``
    :param m_sample: variants, indexed as ``m_sample[i][0][feature]``
    :param sens1: 1-based index of the first sensitive feature
    :param sens2: 1-based index of the second sensitive feature
    :param sens3: 1-based index of the third sensitive feature
    :return: dict whose ``'Seed'`` entry is ``m_sample[0][0]`` and whose other
            keys are bucket numbers mapping to lists of
            ``[sens3 value, sens2 value, sens1 value]``
    """
    bin_width = 0.025
    buckets = {'Seed': m_sample[0][0]}

    for pos, prob in enumerate(probs):
        bucket = int(prob / bin_width)
        if prob == 1.0:
            # keep a probability of exactly 1.0 inside the top bucket
            bucket = bucket - 1

        row = m_sample[pos][0]
        member = [row[sens3 - 1], row[sens2 - 1], row[sens1 - 1]]
        buckets.setdefault(bucket, []).append(member)

    return buckets

def pred_prob_train(sess, x, preds, m_sample):
    """
    Return the predicted class label of every sample in ``m_sample``.

    :param sess: TF session
    :param x: input placeholder
    :param preds: the model's symbolic output
    :param m_sample: iterable of samples, each fed to the model on its own
    :return: list with the arg-max label of the first row of each prediction
    """
    return [
        list(np.argmax(model_prediction(sess, x, preds, np.array(one)), axis=1))[0]
        for one in m_sample
    ]
    
def pred_prob(sess, x, preds, m_sample):
    """
    Return the model output at position [0][1] for every sample in ``m_sample``.

    The result list is also published as the module-level ``probs``.

    :param sess: TF session
    :param x: input placeholder
    :param preds: the model's symbolic output
    :param m_sample: iterable of samples, each fed to the model on its own
    :return: list with one value per sample
    """
    global probs
    probs = []
    probs.extend(
        model_prediction(sess, x, preds, np.array(one))[0][1]
        for one in m_sample
    )
    return probs
#-------------------------------------------
    
def dnn_fair_testing(dataset, sensitive_param, sensitive_param2, sensitive_param3, model_path, cluster_num, max_global, max_local, max_iter):
    """
    Fairness testing driven by three sensitive features at once.

    For every seed input the sensitive-attribute variants are bucketed by
    predicted probability. While at least two buckets exist, a local search
    (basin hopping with ``Local_Perturbation``) is run and one statistics row
    is appended to the module-level ``training_df``; the seed is then moved by
    a gradient-sign global perturbation. Results are written to
    ``'Traingni Data.csv'`` and to ``../results/<dataset>/<sensitive_param>/``.

    :param dataset: the name of testing dataset
    :param sensitive_param: 1-based index of the first sensitive feature
    :param sensitive_param2: 1-based index of the second sensitive feature
    :param sensitive_param3: 1-based index of the third sensitive feature
    :param model_path: the path of testing model
    :param cluster_num: the number of clusters to form as well as the number of
            centroids to generate
    :param max_global: the maximum number of samples for global search
    :param max_local: the maximum number of samples for local search
    :param max_iter: the maximum iteration of global perturbation
    """
    # Module-level state shared with check_for_error_condition and inspected
    # from later notebook cells.
    global training_data, tot_labels, loc_samples, training_df
    global clus_dic, list_dic

    data = {"census": census_data, "credit": credit_data, "bank": bank_data}
    data_config = {"census": census, "credit": credit, "bank": bank}
    conf = data_config[dataset]
    sens_indices = [sensitive_param - 1, sensitive_param2 - 1, sensitive_param3 - 1]

    # prepare the testing data and model
    X, Y, input_shape, nb_classes = data[dataset]()
    tf.set_random_seed(1234)

    # run on CPU only
    config = tf.ConfigProto(device_count={'GPU': 0})
    config.allow_soft_placement = True

    sess = tf.Session(config=config)
    x = tf.placeholder(tf.float32, shape=input_shape)
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    model = dnn(input_shape, nb_classes)

    preds = model(x)
    saver = tf.train.Saver()
    model_path = model_path + dataset + "/test.model"
    saver.restore(sess, model_path)

    # construct the gradient graph; build the sign op once so the graph does
    # not grow by two nodes on every global iteration
    grad_0 = gradient_graph(x, preds)
    sign_grad = tf.sign(grad_0)

    # build the clustering model
    clf = cluster(dataset, cluster_num)
    clusters = [np.where(clf.labels_ == i) for i in range(cluster_num)]

    # store the result of fairness testing
    tot_inputs = set()
    global_disc_inputs = set()
    global_disc_inputs_list = []
    local_disc_inputs = set()
    local_disc_inputs_list = []
    value_list = []
    suc_idx = []
    training_data = set()
    loc_samples = set()
    tot_labels = 0
    # Must exist before the first "+=" below; it is reset after every
    # global iteration.
    num_disc = 0
    training_df = pd.DataFrame(columns=['sample', '#K', 'max_num_elem', 'min_max_dist', 'initial K', 'label'])

    def print_fun(x, f, accepted):
        # optional basinhopping callback, kept for debugging
        print("at minimum %f accepted %d" % (-1 * f, int(accepted)))

    def evaluate_local(inp):
        """
        Objective for the local search: the negated bucket spread of ``inp``.
        :param inp: test input
        :return: ``-1 *`` the value returned by check_for_error_condition
        """
        result = check_for_error_condition(conf, sess, x, preds, inp,
                                           sensitive_param, sensitive_param2, sensitive_param3)
        return (-1 * result)

    def pick_non_sensitive(n_features):
        # Draw a random feature index and step past the sensitive positions.
        idx = np.random.randint(n_features - 1)
        for sens in (sensitive_param3, sensitive_param2, sensitive_param):
            if idx == sens - 1:
                idx = idx + 1
        return idx

    # select the seed input for fairness testing
    inputs = seed_test_input(clusters, min(max_global, len(X)))
    for num_seed, index in enumerate(inputs, start=1):
        print(num_seed)

        sample = X[index:index + 1]

        # number of probability buckets of the untouched seed
        m_sample = m_instance(sample, sensitive_param, sensitive_param2, sensitive_param3)
        pred = pred_prob(sess, x, preds, m_sample)
        clus_dic = clustering(pred, m_sample, sensitive_param, sensitive_param2, sensitive_param3)
        init_k = len(clus_dic) - 1
        list_dic = []
        max_k = init_k
        max_k_iter = 30

        # start global perturbation
        for iteration in range(max_iter + 1):
            m_sample = m_instance(sample, sensitive_param, sensitive_param2, sensitive_param3)
            pred = pred_prob(sess, x, preds, m_sample)
            clus_dic = clustering(pred, m_sample, sensitive_param, sensitive_param2, sensitive_param3)
            cur_k = len(clus_dic) - 1
            if cur_k >= max_k:
                # remember the best sample so the search can fall back to it
                max_k = cur_k
                max_sample = sample.copy()

            sample, n_values = global_sample_select(clus_dic, sensitive_param, sensitive_param2, sensitive_param3)

            if cur_k >= 2:
                loc_x, n_values = local_sample_select(clus_dic, sensitive_param, sensitive_param2, sensitive_param3)
                # member triples are stored as [sens3, sens2, sens1]
                n_value1 = n_values[2]
                n_value2 = n_values[1]
                n_value3 = n_values[0]

                minimizer = {"method": "L-BFGS-B"}
                local_perturbation = Local_Perturbation(sess, grad_0, x, n_value1, n_value2, n_value3,
                                                        sensitive_param, sensitive_param2,
                                                        sensitive_param3, input_shape[1],
                                                        conf)

                basinhopping(evaluate_local, loc_x, stepsize=1.0, take_step=local_perturbation,
                             minimizer_kwargs=minimizer, niter=max_local)

                # identify the sample independently of its sensitive features
                dis_sample = sample.copy()
                dis_sample[0][sens_indices] = 0
                sample_key = tuple(dis_sample[0].astype('int'))
                if sample_key not in global_disc_inputs:
                    num_disc += tot_labels
                    cluster_keys = list(clus_dic.keys())[1:]
                    min_max_dist = max(cluster_keys) - min(cluster_keys)
                    max_num_elem = max((len(clus_dic[k]) for k in cluster_keys), default=0)

                    global_disc_inputs.add(sample_key)
                    # DataFrame.append was removed in pandas 2.0; concat is
                    # the supported way to add a row.
                    new_row = pd.DataFrame([{'sample': sample_key,
                                             '#K': cur_k,
                                             'min_max_dist': min_max_dist,
                                             'max_num_elem': max_num_elem,
                                             'initial K': init_k,
                                             'label': num_disc}])
                    training_df = pd.concat([training_df, new_row], ignore_index=True)

            num_disc = 0
            tot_labels = 0
            list_dic.append(clus_dic)
            clus_dic = {}

            if iteration == max_iter:
                break
            if iteration >= max_k_iter and init_k >= max_k:
                break
            if cur_k < max_k - 5:
                # the bucket count dropped too far: return to the best sample
                print('Back')
                sample = max_sample.copy()

            # Making up n_sample
            n_sample = sample.copy()
            n_sample[0][sensitive_param3 - 1] = n_values[0]
            n_sample[0][sensitive_param2 - 1] = n_values[1]
            n_sample[0][sensitive_param - 1] = n_values[2]

            # global perturbation
            s_grad = sess.run(sign_grad, feed_dict={x: sample})
            n_grad = sess.run(sign_grad, feed_dict={x: n_sample})
            # find the feature with same impact
            zero_row = np.zeros(conf.params).tolist()
            if zero_row == s_grad[0].tolist():
                g_diff = n_grad[0]
            elif zero_row == n_grad[0].tolist():
                g_diff = s_grad[0]
            else:
                g_diff = np.array(s_grad[0] == n_grad[0], dtype=float)
            g_diff[sens_indices] = 0

            cal_grad = s_grad * g_diff
            if np.zeros(input_shape[1]).tolist() == cal_grad.tolist()[0]:
                # no usable direction: perturb three random non-sensitive
                # features; each index is adjusted on its own
                n_features = len(cal_grad[0])
                feat_a = pick_non_sensitive(n_features)
                feat_b = pick_non_sensitive(n_features)
                feat_c = pick_non_sensitive(n_features)
                cal_grad[0][feat_a] = np.random.choice([1.0, -1.0])
                cal_grad[0][feat_b] = np.random.choice([1.0, -1.0])
                cal_grad[0][feat_c] = np.random.choice([1.0, -1.0])

            sample[0] = clip(sample[0] + perturbation_size * cal_grad[0], conf).astype("int")

        print('Num Instances ', len(training_df))

    training_df.to_csv('Traingni Data.csv')

    # create the folder for storing the fairness testing result
    result_dir = '../results/' + dataset + '/' + str(sensitive_param) + '/'
    os.makedirs(result_dir, exist_ok=True)

    # storing the fairness testing result
    np.save(result_dir + 'suc_idx.npy', np.array(suc_idx))
    np.save(result_dir + 'global_samples.npy', np.array(global_disc_inputs_list))
    np.save(result_dir + 'local_samples.npy', np.array(local_disc_inputs_list))
    np.save(result_dir + 'disc_value.npy', np.array(value_list))

    # print the overview information of result
    print("Total Inputs are " + str(len(tot_inputs)))
    print("Total discriminatory inputs of global search- " + str(len(global_disc_inputs)))
    print("Total discriminatory inputs of local search- " + str(len(local_disc_inputs)))

def main(argv=None):
    """
    Entry point: forward the parsed command-line flags to dnn_fair_testing.

    :param argv: accepted for the app runner; not used here
    """
    settings = dict(
        dataset=FLAGS.dataset,
        sensitive_param=FLAGS.sens_param,
        sensitive_param2=FLAGS.sens_param2,
        sensitive_param3=FLAGS.sens_param3,
        model_path=FLAGS.model_path,
        cluster_num=FLAGS.cluster_num,
        max_global=FLAGS.max_global,
        max_local=FLAGS.max_local,
        max_iter=FLAGS.max_iter,
    )
    dnn_fair_testing(**settings)

if __name__ == '__main__':
    # The three sensitive-feature flags share one help text.
    sens_help = 'sensitive index, index start from 1 for age, 9 for gender, 8 for race'

    flags.DEFINE_string("dataset", "census", "the name of dataset")
    for flag_name, default_index in (('sens_param', 9), ('sens_param2', 8), ('sens_param3', 1)):
        flags.DEFINE_integer(flag_name, default_index, sens_help)
    flags.DEFINE_string('model_path', '../models/', 'the path for testing model')

    # remaining search-budget flags: (name, default, help)
    budget_flags = (
        ('cluster_num', 4, 'the number of clusters to form as well as the number of centroids to generate'),
        ('max_global', 1000, 'maximum number of samples for global search'),
        ('max_local', 1000, 'maximum number of samples for local search'),
        ('max_iter', 10, 'maximum iteration of global perturbation'),
    )
    for flag_name, default_value, description in budget_flags:
        flags.DEFINE_integer(flag_name, default_value, description)

    tf.app.run()


INFO:tensorflow:Restoring parameters from ../models/census/test.model


I0602 12:17:56.216112 140693737744192 saver.py:1399] Restoring parameters from ../models/census/test.model


1
Num Instances  0
2
Back


In [None]:
# Number of distinct samples recorded in the module-level `loc_samples` set.
len(loc_samples)

In [None]:
# Display the per-sample statistics table accumulated in `training_df`.
training_df

In [65]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import tree


# Load the census data set (first line is a header) and fit a decision tree.
X = []
Y = []

with open("../datasets/census", "r") as ins:
    for line_no, raw in enumerate(ins):
        if line_no == 0:
            # skip the header row
            continue
        fields = raw.strip().split(',')
        # every column but the last is a feature
        X.append([int(value) for value in fields[:-1]])
        # the last column is the label, reduced to 0 / 1
        Y.append([0] if int(fields[-1]) == 0 else [1])

X = np.array(X, dtype=float)
Y = np.array(Y, dtype=float)
model = tree.DecisionTreeClassifier()
#model =  LogisticRegression(random_state =1)
model.fit(X, Y)
model.predict(X)
tree.plot_tree(model)

array([0., 0., 0., ..., 0., 0., 1.])

In [193]:


from multiprocessing import Process, Queue, Event
from queue import Empty
from threading import Thread
import time
import pandas as pd
import random
import tempfile


# Simulated task durations in seconds, one per process.
fake_runtimes = [5, 2, 4, 1, 5]


def long_running_process(process_id: int, task_time: int):
    """Stand in for a long-running job: sleep ``task_time`` seconds, then report."""
    time.sleep(task_time)
    report = "Process id {} - Complete - Task time: {}s".format(process_id, task_time)
    print(report)






Process id 0 - Complete - Task time: 5s
Process id 1 - Complete - Task time: 2s
Process id 2 - Complete - Task time: 4s
Process id 3 - Complete - Task time: 1s
Process id 4 - Complete - Task time: 5s


In [198]:
%%timeit -r1 -n1
# Baseline: run the five simulated jobs one after another in this process.
# The captured output for this cell reports 17 s, the sum of fake_runtimes.
for i, task_time in zip(range(5), fake_runtimes):
    long_running_process(i, task_time)

Process id 0 - Complete - Task time: 5s
Process id 1 - Complete - Task time: 2s
Process id 2 - Complete - Task time: 4s
Process id 3 - Complete - Task time: 1s
Process id 4 - Complete - Task time: 5s
17 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [199]:
%%timeit -r1 -n1
# Use multiple processes to run all 5 jobs concurrently.
processes = []
for i, task_time in zip(range(5), fake_runtimes):
    # one child process per job, started immediately
    p = Process(target=long_running_process, args=(i, task_time))
    p.start()
    processes.append(p)

# wait for every child to finish before the timing stops
for p in processes:
    p.join()

Process id 3 - Complete - Task time: 1s
Process id 1 - Complete - Task time: 2s
Process id 2 - Complete - Task time: 4s
Process id 0 - Complete - Task time: 5s
Process id 4 - Complete - Task time: 5s
5.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
