# This notebook demonstrates the use of Naive bayes calssification on k-anonymous de-identified data

In [1]:
from statistics import stdev, mean
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
%matplotlib inline

# Load all necessary packages
import sys
import time
sys.path.append("../")
import numpy as np
from tqdm import tqdm
import pandas as pd

from aif360.datasets import BinaryLabelDataset
from aif360.datasets.artificial_all import Artificial_all

from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric

from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.algorithms.preprocessing.optim_preproc_helpers.preproc_artificial_all import \
    load_preproc_artificial_all_deid, load_preproc_artificial_all_og, load_preproc_data_german_deid4, load_preproc_data_german_full
from aif360.algorithms.preprocessing.adult_data_deid_debias import *


from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

import diffprivlib.models as dp


from IPython.display import Markdown, display
import matplotlib.pyplot as plt

from common_utils import compute_metrics

# import importlib
# pm_utils = importlib.import_module('/home/andrew/python-virtual-environments/de-id-demo-v1-master/privacy_model_utils2')
# importlib.reload(pm_utils)

In [2]:
biased_class = 'Gender'
privileged_groups = [{biased_class: 1}]
unprivileged_groups = [{biased_class: 0}]
target = 'Income Binary'
numeric = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
file_loc = '/home/andrew/python-virtual-environments/AIF360/aif360/data/raw/adult-og/subsample/'
file_name = 'adult_combined_subsample.data'

og_german = load_preproc_artificial_all_og(file_name, file_loc, protected_attributes = [biased_class])
og_german_df = og_german.convert_to_dataframe()[0]
og_german_df = scale_it(og_german_df, numeric)
og_X_train, og_X_test, og_y_train, og_y_test = split_it(og_german_df, target)
og_german_df


Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week,native_country,Ethnicity,Gender,workclass=Federal-gov,workclass=Local-gov,...,occupation=Sales,occupation=Tech-support,occupation=Transport-moving,relationship=Husband,relationship=Not-in-family,relationship=Other-relative,relationship=Own-child,relationship=Unmarried,relationship=Wife,Income Binary
0,-0.588572,-0.444277,-0.153887,-0.228119,-0.334171,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-1.649950,-1.626914,-0.153887,-0.228119,-1.733756,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-1.498325,-2.415338,-0.153887,3.692169,-0.745814,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.245368,1.132572,-0.153887,-0.228119,-0.087186,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.119261,1.132572,-0.153887,-0.228119,0.736099,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.057883,1.132572,0.732897,-0.228119,0.324457,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1996,-0.436947,1.132572,1.579079,-0.228119,-0.087186,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1997,-0.133696,0.738359,-0.153887,-0.228119,0.736099,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1998,-0.816010,-0.444277,0.178888,-0.228119,-0.087186,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
file_loc = '/home/andrew/python-virtual-environments/AIF360/aif360/data/raw/adult-og/'
comp_orig = True
test_type = 'test_on_DEID'
biased_class = 'Gender'
privileged_groups = [{biased_class: 1}]
unprivileged_groups = [{biased_class: 0}]

numeric = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

deid_list = ['og', 'deid4', 'deid8', 'deid16']

indexer = []
deidr_list = []

accu_list = []
maj_acc_list = []
min_acc_list = []
min_chg_pos_outcome_list = []
maj_chg_pos_outcome_list = []
disp_imp_list = []
num_pos_outcome = []

accu_list_std_dev = []
maj_acc_list_std_dev = []
min_acc_list_std_dev = []
min_chg_pos_outcome_list_std_dev = []
maj_chg_pos_outcome_list_std_dev = []
disp_imp_list_std_dev = []
num_pos_outcome_std_dev = []

all_datasets = {}


tic = time.clock()

for deid in deid_list:
    file_name = 'adult_' + deid
    indexer.append(file_name)
    deidr_list.append(deid)
    origin = file_loc + file_name
    
    if deid == 'og':      
        all_datasets[file_name] = {}
        all_datasets[file_name]['base_data'] = load_preproc_artificial_all_og(file_name, file_loc, protected_attributes = [biased_class])
    else:
        all_datasets[file_name] = {}
        all_datasets[file_name]['base_data'] = load_preproc_artificial_all_deid(file_name, file_loc, protected_attributes = [biased_class])


    df = all_datasets[file_name]['base_data'].convert_to_dataframe()[0]

    
    if comp_orig:
        test_type = 'test_on_ORIG'
    validation = file_loc + 'validation/' + file_name + '_df_VALIDATION_' + test_type
    #df.to_csv(validation, index=False)

    df = scale_it(df, numeric)

    validation = file_loc + 'validation/' + file_name + '_df_VALIDATION+SCALED_' + test_type
    #df.to_csv(validation, index=False)
    
    accuracy = []
    accuracy_std_dev = []
    disparate = []
    disparate_std_dev = []
    min_acc = []
    min_acc_std_dev = []
    maj_acc = []
    maj_acc_std_dev = []
    min_change = []
    min_change_std_dev = []
    maj_change = []
    maj_change_std_dev = []
    num_pos = []
    num_pos_std_dev = []

    test = 10
    for i in range(test):
        X_train, X_test, y_train, y_test = split_it(df, target)
        if comp_orig:
            X_test = og_X_test
            y_test = og_y_test
            test_type = 'test_on_ORIG'
        
        if i == 0:
            df_test = X_test.copy()
            df_test['Income Binary'] = y_test.values
            test_set = file_loc + 'test/' + file_name + '_df_TEST_'
            #df_test.to_csv(test_set, index=False)
            
            df_train = X_train.copy()
            df_train['Income Binary'] = y_train.values
            train_set = file_loc + 'test/' + file_name + '_df_TRAIN_'
            #df_train.to_csv(train_set, index=False)
            #print(X_test)
        num_pos_og, accuracy_test, accuracy_train, acc_min_test, acc_maj_test, min_chg_pos_outcome, maj_chg_pos_outcome, disp_imp_test = gaussian_nb(X_train, X_test, y_train, y_test, protected_att=biased_class)

        accuracy.append(accuracy_test)
        disparate.append(disp_imp_test)
        min_acc.append(acc_min_test)
        maj_acc.append(acc_maj_test)
        min_change.append(min_chg_pos_outcome)
        maj_change.append(maj_chg_pos_outcome)
        num_pos.append(num_pos_og)


    accu_list.append(mean(accuracy))
    maj_acc_list.append(mean(min_acc))
    min_acc_list.append(mean(maj_acc))
    min_chg_pos_outcome_list.append(mean(min_change))
    maj_chg_pos_outcome_list.append(mean(maj_change))
    disp_imp_list.append(mean(disparate))
    num_pos_outcome.append(mean(num_pos))

    accu_list_std_dev.append(stdev(accuracy))
    maj_acc_list_std_dev.append(stdev(min_acc))
    min_acc_list_std_dev.append(stdev(maj_acc))
    min_chg_pos_outcome_list_std_dev.append(stdev(min_change))
    maj_chg_pos_outcome_list_std_dev.append(stdev(maj_change))
    disp_imp_list_std_dev.append(stdev(disparate))
    num_pos_outcome_std_dev.append(stdev(num_pos))
            
toc = time.clock()            
tic_toc = toc-tic       
            

matrix = pd.DataFrame({'dataset': indexer})
matrix['deid'] = deidr_list   
matrix['accu'] = accu_list
matrix['maj_accu'] = maj_acc_list
matrix['min_accu'] = min_acc_list
matrix['min_chg_pos_outcome'] = min_chg_pos_outcome_list
matrix['maj_chg_pos_outcome'] = maj_chg_pos_outcome_list
matrix['disp_imp'] = disp_imp_list
matrix['num_pos_outcome'] = num_pos_outcome

matrix['accu_std_dev'] = accu_list_std_dev
matrix['maj_accu_std_dev'] = maj_acc_list_std_dev
matrix['min_accu_std_dev'] = min_acc_list_std_dev
matrix['min_chg_pos_outcome_std_dev'] = min_chg_pos_outcome_list_std_dev
matrix['maj_chg_pos_outcome_std_dev'] = maj_chg_pos_outcome_list_std_dev
matrix['disp_imp_std_dev'] = disp_imp_list_std_dev
matrix['num_pos_outcome_std_dev'] = num_pos_outcome_std_dev

file_dest = '/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Outputs/'
file_name = file_dest + 'adult_data_combined' + str(tic_toc)
#matrix.to_csv(file_name, index=False)
print(f'COMPLETE IN: {tic_toc}')

ValueError: operands could not be broadcast together with shapes (600,57) (58,) 

In [10]:
file_dest = '/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison (copy)/Outputs/'
file_name = file_dest + 'all_data_matrix_exclude_protected_imb_dif_GAUSIAN_NB_217.06204700000004'
matrix = pd.read_csv(file_name)
matrix

Unnamed: 0,dataset,imbr,maj_pos,min_pos,deid,accu,maj_accu,min_accu,min_chg_pos_outcome,maj_chg_pos_outcome,disp_imp,num_pos_outcome
0,1_50_40_og,1,50,40,og,0.697917,0.689223,0.706567,1.617464,1.495784,0.872739,481
1,1_50_40_deid4,1,50,40,deid4,0.646250,0.675021,0.617623,1.856549,1.554806,0.963716,481
2,1_50_40_deid8,1,50,40,deid8,0.727083,0.723475,0.730673,1.444906,1.311973,0.888861,481
3,1_50_40_deid16,1,50,40,deid16,0.738333,0.714286,0.762261,1.340956,1.347386,0.803233,481
4,1_50_20_og,1,50,20,og,0.626250,0.669173,0.583541,2.946281,1.576728,0.758764,242
...,...,...,...,...,...,...,...,...,...,...,...,...
415,100_10_1_deid16,100,10,1,deid16,0.280000,0.271308,0.966667,0.000000,8.053279,0.000000,1
416,100_5_1_og,100,5,1,og,0.376667,0.369198,0.966667,0.000000,12.105263,0.000000,1
417,100_5_1_deid4,100,5,1,deid4,0.332083,0.324051,0.966667,0.000000,12.984962,0.000000,1
418,100_5_1_deid8,100,5,1,deid8,0.345833,0.337975,0.966667,0.000000,12.766917,0.000000,1


In [11]:
df_imb_deid = matrix.copy()
accu_og_min = 0
accu_og_maj = 0
def accu_min_loss(x):
    global accu_og_min
    if x['deid'] == 'og' or x['deid'] == 10.0:
        accu_og_min = x['min_accu']
    cats = accu_og_min - x['min_accu'] 
    return cats

def accu_maj_loss(x):
    global accu_og_maj
    if x['deid'] == 'og' or x['deid'] == 10.0:
        accu_og_maj = x['maj_accu']
    cats = accu_og_maj - x['maj_accu']
    return cats
    
df_imb_deid['accu_maj_loss'] = df_imb_deid.apply(lambda x: accu_maj_loss(x), axis=1)
df_imb_deid['accu_min_loss'] = df_imb_deid.apply(lambda x: accu_min_loss(x), axis=1)
matrix = df_imb_deid.copy()
matrix

Unnamed: 0,dataset,imbr,maj_pos,min_pos,deid,accu,maj_accu,min_accu,min_chg_pos_outcome,maj_chg_pos_outcome,disp_imp,num_pos_outcome,accu_maj_loss,accu_min_loss
0,1_50_40_og,1,50,40,og,0.697917,0.689223,0.706567,1.617464,1.495784,0.872739,481,0.000000,0.000000
1,1_50_40_deid4,1,50,40,deid4,0.646250,0.675021,0.617623,1.856549,1.554806,0.963716,481,0.014202,0.088944
2,1_50_40_deid8,1,50,40,deid8,0.727083,0.723475,0.730673,1.444906,1.311973,0.888861,481,-0.034252,-0.024106
3,1_50_40_deid16,1,50,40,deid16,0.738333,0.714286,0.762261,1.340956,1.347386,0.803233,481,-0.025063,-0.055694
4,1_50_20_og,1,50,20,og,0.626250,0.669173,0.583541,2.946281,1.576728,0.758764,242,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,100_10_1_deid16,100,10,1,deid16,0.280000,0.271308,0.966667,0.000000,8.053279,0.000000,1,0.135865,0.000000
416,100_5_1_og,100,5,1,og,0.376667,0.369198,0.966667,0.000000,12.105263,0.000000,1,0.000000,0.000000
417,100_5_1_deid4,100,5,1,deid4,0.332083,0.324051,0.966667,0.000000,12.984962,0.000000,1,0.045148,0.000000
418,100_5_1_deid8,100,5,1,deid8,0.345833,0.337975,0.966667,0.000000,12.766917,0.000000,1,0.031224,0.000000


In [12]:
file_loc = '/home/andrew/python-virtual-environments/AIF360/aif360/data/raw/adult-og/output/'
file_name = 'adult_table_' + test_type
dest = file_loc+file_name
matrix['De-identification Level'] = ['None', 'k=4', 'k=8', 'k=16']

matrix.set_index('De-identification Level', inplace=True)
new_df = matrix.filter(['accu', 'accu_std_dev','disp_imp', 'maj_chg_pos_outcome', 'min_chg_pos_outcome', \
                       'accu_maj_loss', 'accu_min_loss'], axis=1)
new_df.columns = ['Accuracy', 'Accuracy StdDev', "Disparate Impact", 'Fairness Loss Priv', \
                  'Fairness Loss Unpriv', 'Utility Loss Priv.', 'Utility Loss Unpriv']
#new_df.to_csv(dest)
new_df

ValueError: Length of values (4) does not match length of index (420)

In [None]:
sdfasdf

In [13]:
matrix['De-identification Level'] = ['None', 'k=4', 'k=8', 'k=16']
matrix.set_index('De-identification Level', inplace=True)
new_df = matrix.filter(['maj_chg_pos_outcome','min_chg_pos_outcome'], axis=1)
new_df.columns = ['Majority Positive Loss', "Minority Positive Loss"]
new_df.plot(kind='bar')

#plt.savefig('/home/andrew/python-virtual-environments/AIF360/aif360/data/raw/adult-og/adult_pos_loss_percentage_TEST_ON_ORIG_dp.png')
plt.show()

ValueError: Length of values (4) does not match length of index (420)

In [None]:
matrix['De-identification Level'] = ['None','k=4', 'k=8', 'k=16']
matrix.set_index('De-identification Level', inplace=True)
new_df = matrix.filter(['accu_maj_loss','accu_min_loss'], axis=1)
new_df.columns = ['Utlity Loss Majority', "Utility Loss Minority"]
new_df.plot(kind='bar')

#plt.savefig('/home/andrew/python-virtual-environments/AIF360/aif360/data/raw/adult-og/adult_utility_loss_TEST_ON_ORIG_dp.png')
plt.show()

In [None]:
df_log_reg = pd.read_csv(r'/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Outputs/all_data_matrix_exclude_protected_imb_dif_LOGREG_1657.62894')
df_gauss_NB = pd.read_csv(r'/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Outputs/all_data_matrix_exclude_protected_imb_dif_GAUSIAN_NB_311.2461709999999')
df_dp_NB = pd.read_csv(r'/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Outputs/all_data_matrix_exclude_protected_imb_dif_dif_priv_GAUSIAN_NB_712.423939')
df_dp_lr = pd.read_csv(r'/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Outputs/all_data_matrix_exclude_protected_imb_dif_DIF_PRIV_LOG_REG_936.4032239999999')

df_log_reg.head()

In [None]:
data_type = 'Artifical'
classifier = 'DP_Gauss_NB'
param = 'maj_pos'

if param == 'imbr':
    title = 'Imbalance Ratio'
if param == 'maj_pos':
    title = 'Majority Positive Percentage'
if param == 'min_pos':
    title = 'Minority Positive Percentage'
    
df_og = df_dp_NB
df_og['accu_maj_loss'] = [0] * len(df_og)
df_og['accu_min_loss'] = [0] * len(df_og)

imb_list = ['1', '2', '5', '10', '20', '50', '100']
dif_list = ['50_40', '50_20', '50_10', '50_5', '50_1', '40_20', '40_10', '40_5', '40_1', '20_10', '20_5', '20_1', '10_5', '10_1', '5_1']
deid_list = ['og', 'deid4', 'deid8', 'deid16']
epsilons = [0.1, 1.0, 5.0, 10.0]
dif_vals = [50, 40, 20, 10, 5, 1]

df_imb_deid = df_og.loc[:0]
#print(df_imb_deid)

for x in dif_vals:
    for deid in epsilons:
        df_imb1_og = df_og[(df_og[param] == int(x)) & (df_og['deid'] == deid)]
        df_imb1_og = df_imb1_og.reset_index()
        del df_imb1_og['index']

        temp_df = df_imb1_og.loc[:0]
        
        for column in df_imb1_og.columns:
            if column not in ['deid', 'dataset']:
                temp_df[column] = df_imb1_og[column].mean()
                
        df_imb_deid = pd.concat([df_imb_deid, temp_df], ignore_index=True, sort=False)


df_imb_deid = df_imb_deid.drop(0)

accu_og_min = 0
accu_og_maj = 0
def accu_min_loss(x):
    global accu_og_min
    if x['deid'] == 'og' or x['deid'] == 10.0:
        accu_og_min = x['min_accu']
    cats = x['min_accu'] - accu_og_min
    return cats

def accu_maj_loss(x):
    global accu_og_maj
    if x['deid'] == 'og' or x['deid'] == 10.0:
        accu_og_maj = x['maj_accu']
    cats = x['maj_accu'] - accu_og_maj
    return cats
    
df_imb_deid['accu_maj_loss'] = df_imb_deid.apply(lambda x: accu_maj_loss(x), axis=1)
df_imb_deid['accu_min_loss'] = df_imb_deid.apply(lambda x: accu_min_loss(x), axis=1)


df_imb_deid

In [None]:
# og =df_imb_deid[df_imb_deid['deid'] == 'og']
# deid4 =df_imb_deid[df_imb_deid['deid'] == 'deid4']
# deid8 =df_imb_deid[df_imb_deid['deid'] == 'deid8']
# deid16 =df_imb_deid[df_imb_deid['deid'] == 'deid16']

og =df_imb_deid[df_imb_deid['deid'] == 10.0]
deid4 =df_imb_deid[df_imb_deid['deid'] == 5.0]
deid8 =df_imb_deid[df_imb_deid['deid'] == 1.0]
deid16 =df_imb_deid[df_imb_deid['deid'] == 0.1]

In [None]:
fig, ax1 = plt.subplots(figsize=(10,7))
ax1.set_xlabel(title, fontsize=16, fontweight='bold')
ax1.set_ylabel('Accuracy', color='k', fontsize=16, fontweight='bold')

ax1.plot(og[param], og['accu'], color='r', marker='o')
ax1.plot(deid4[param], deid4['accu'], color='y', marker='o')
ax1.plot(deid8[param], deid8['accu'], color='g', marker='o')
ax1.plot(deid16[param], deid16['accu'], color='b', marker='o')

ax2 = ax1.twinx()
ax2.plot(og[param], og['disp_imp'], '--', color='r', marker='|')
ax2.plot(deid4[param], deid4['disp_imp'], '--', color='y', marker='|')
ax2.plot(deid8[param], deid8['disp_imp'], '--', color='g', marker='|')
ax2.plot(deid16[param], deid16['disp_imp'], '--', color='b', marker='|')
ax2.set_ylabel('Disparate-Impact', color='k', fontsize=16, fontweight='bold')

ax2.grid(True)

#file_loc = '/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Graphs/'
file_name = data_type + '_' + classifier + '_' + param + '_' + 'Accuracy_DisparateImpact.png'
destination = file_loc + file_name

plt.savefig(destination)

In [None]:
from matplotlib import rc

fig, ax1 = plt.subplots(figsize=(10,7))
ax1.set_xlabel(title, fontsize=16, fontweight='bold')
ax1.set_ylabel('Privacy-Fainess Loss', color='k', fontsize=16, fontweight='bold')

ax1.plot(og[param], og['maj_chg_pos_outcome'], color='r', marker='o', label= r'Privileged: $\epsilon$=10.0')
ax1.plot(og[param], og['min_chg_pos_outcome'], '--', color='r', marker='|', label= 'Unprivileged: $\epsilon$=10.0')

ax1.plot(deid4[param], deid4['maj_chg_pos_outcome'], color='y', marker='o',label= 'Privileged: $\epsilon$=5.0')
ax1.plot(deid4[param], deid4['min_chg_pos_outcome'], '--', color='y', marker='|', label= 'Unprivileged: $\epsilon$=5.0')

ax1.plot(deid8[param], deid8['maj_chg_pos_outcome'], color='g', marker='o', label= 'Privileged: $\epsilon$=1.0')
ax1.plot(deid8[param], deid8['min_chg_pos_outcome'], '--', color='g', marker='|', label= 'Unprivileged: $\epsilon$=1.0')

ax1.plot(deid16[param], deid16['maj_chg_pos_outcome'], color='b', marker='o', label= 'Privileged: $\epsilon$=0.1')
ax1.plot(deid16[param], deid16['min_chg_pos_outcome'], '--', color='b', marker='|', label= 'Unprivileged: $\epsilon$=0.1')
legend = plt.legend(loc=2, prop={'size': 15}, ncol=4, framealpha=1, shadow=True, borderpad=1)
#ax2 = ax1.twinx()
ax1.axis('off')

#ax2.set_ylabel('Achieved Minority Positive Percentage', color='k', fontsize=16, fontweight='bold')
#ax2.yaxis.set_tick_params(labelsize=5)
#ax1.grid(True)

def export_legend(legend, filename="legend.png", expand=[-1,-1,1,1]):
    fig  = legend.figure
    fig.canvas.draw()
    bbox  = legend.get_window_extent()
    bbox = bbox.from_extents(*(bbox.extents + np.array(expand)))
    bbox = bbox.transformed(fig.dpi_scale_trans.inverted())
    dest = file_loc + filename
    fig.savefig(dest, dpi="figure", bbox_inches=bbox)

file_loc = '/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Graphs/'
file_name = data_type + '_' + classifier + '_' + param + '_' + 'Privacy_Fainess_Loss.png'
destination = file_loc + file_name
leg_dest = file_loc + 'legend.png'
#plt.savefig(destination)
export_legend(legend)

In [None]:
from matplotlib import rc

fig, ax1 = plt.subplots(figsize=(10,7))
ax1.set_xlabel(title, fontsize=16, fontweight='bold')
ax1.set_ylabel('Privacy-Fainess Loss', color='k', fontsize=16, fontweight='bold')

ax1.plot(og[param], og['maj_chg_pos_outcome'], color='r', marker='o', label= r'Privileged: No Deid')
ax1.plot(og[param], og['min_chg_pos_outcome'], '--', color='r', marker='|', label= 'Unprivileged: No Deid')

ax1.plot(deid4[param], deid4['maj_chg_pos_outcome'], color='y', marker='o',label= 'Privileged: Deid k=4')
ax1.plot(deid4[param], deid4['min_chg_pos_outcome'], '--', color='y', marker='|', label= 'Unprivileged: Deid k=4')

ax1.plot(deid8[param], deid8['maj_chg_pos_outcome'], color='g', marker='o', label= 'Privileged: Deid k=8')
ax1.plot(deid8[param], deid8['min_chg_pos_outcome'], '--', color='g', marker='|', label= 'Unprivileged: Deid k=8')

ax1.plot(deid16[param], deid16['maj_chg_pos_outcome'], color='b', marker='o', label= 'Privileged: Deid k=16')
ax1.plot(deid16[param], deid16['min_chg_pos_outcome'], '--', color='b', marker='|', label= 'Unprivileged: Deid k=16')
legend = plt.legend(loc=2, prop={'size': 15}, ncol=4, framealpha=1, shadow=True, borderpad=1)
#ax2 = ax1.twinx()
ax1.axis('off')

#ax2.set_ylabel('Achieved Minority Positive Percentage', color='k', fontsize=16, fontweight='bold')
#ax2.yaxis.set_tick_params(labelsize=5)
#ax1.grid(True)

def export_legend(legend, filename="legend-k.png", expand=[-1,-1,1,1]):
    fig  = legend.figure
    fig.canvas.draw()
    bbox  = legend.get_window_extent()
    bbox = bbox.from_extents(*(bbox.extents + np.array(expand)))
    bbox = bbox.transformed(fig.dpi_scale_trans.inverted())
    dest = file_loc + filename
    fig.savefig(dest, dpi="figure", bbox_inches=bbox)

file_loc = '/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Graphs/'
file_name = data_type + '_' + classifier + '_' + param + '_' + 'Privacy_Fainess_Loss.png'
destination = file_loc + file_name
leg_dest = file_loc + 'legend.png'
#plt.savefig(destination)
export_legend(legend)

In [None]:
param

In [None]:
fig, ax1 = plt.subplots(figsize=(10,7))
ax1.set_xlabel(title, fontsize=16, fontweight='bold')
ax1.set_ylabel('Accuracy Loss: Privileged Unprivileged', color='k', fontsize=16, fontweight='bold')

ax1.plot(og[param], og['accu_maj_loss'], color='r', marker='o')
ax1.plot(deid4[param], deid4['accu_maj_loss'], color='y', marker='o')
ax1.plot(deid8[param], deid8['accu_maj_loss'], color='g', marker='o')
ax1.plot(deid16[param], deid16['accu_maj_loss'], color='b', marker='o')
ax1.plot(og[param], og['accu_min_loss'], '--', color='r', marker='|')
ax1.plot(deid4[param], deid4['accu_min_loss'], '--', color='y', marker='|')
ax1.plot(deid8[param], deid8['accu_min_loss'], '--', color='g', marker='|')
ax1.plot(deid16[param], deid16['accu_min_loss'], '--', color='b', marker='|')


#ax2 = ax1.twinx()

#ax2.set_ylabel('Achieved Minority Positive Percentage', color='k', fontsize=16, fontweight='bold')
#ax2.yaxis.set_tick_params(labelsize=5)
ax1.grid(True)

file_loc = '/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Graphs/'
file_name = data_type + '_' + classifier + '_' + param + '_' + 'Accuracy_Loss.png'
destination = file_loc + file_name

plt.savefig(destination)

#### Load dataset and set options

In [None]:
from scipy.stats import pointbiserialr, spearmanr
df = matrix.copy()

df = df.fillna(0)

param_df = pd.DataFrame()
first = True

target_att = 'accu'
col_names = ['maj_pos', 'min_pos', 'imbr', 'deid']
metrics = ['accu', 'bal_accu', 'disp-imp', 'stat-par']

for met in metrics:
    param = []
    correlation = []
    abs_corr = []
    for c in col_names:
        if c != target_att:
            if len(df[c].unique()) <= 2:
                corr = spearmanr(df[met], df[c])[0]

            else:
                corr = spearmanr(df[met], df[c])[0]
            param.append(c)
            correlation.append(corr)
            abs_corr.append(abs(corr))
            
    if first == True:
        param_df = pd.DataFrame({'parameter': param, f'abs_corr_{met}': abs_corr})
        first = False
        #print(param_df)
    else:
        param_temp = pd.DataFrame({f'abs_corr_{met}': abs_corr})
#         print(param_df)
#         print(param_temp)    
        param_df = pd.concat([param_df, param_temp], axis=1)
    abs_corr = []

    param_df = param_df.sort_values(by=['abs_corr_accu'], ascending=False)
param_df = param_df.set_index('parameter')
param_df

### Find the optimal classification threshold from the validation set

In [None]:
fig, ax1 = plt.subplots(figsize=(10,7))
ax1.plot(class_thresh_arr, bal_acc_arr_transf)
ax1.set_xlabel('Classification Thresholds', fontsize=16, fontweight='bold')
ax1.set_ylabel('Balanced Accuracy', color='b', fontsize=16, fontweight='bold')
ax1.xaxis.set_tick_params(labelsize=14)
ax1.yaxis.set_tick_params(labelsize=14)


ax2 = ax1.twinx()
ax2.plot(class_thresh_arr, np.abs(1.0-np.array(disp_imp_arr_transf)), color='r')
ax2.set_ylabel('abs(1-disparate impact)', color='r', fontsize=16, fontweight='bold')
ax2.axvline(best_class_thresh, color='k', linestyle=':')
ax2.yaxis.set_tick_params(labelsize=14)
ax2.grid(True)

In [None]:
%matplotlib
import scipy.optimize
import functools
from numpy import *


df_og = df_deid16

fig = plt.figure(figsize=(10,10))
#ax = fig.add_subplot(111, projection='3d')
ax = fig.gca(projection='3d')

X = df_og['imbr'].to_list()
Y = df_og['min_pos'].to_list()
Z = df_og['accu'].to_list()

X = [float(x) for x in X]
Y = [float(y) for y in Y]
Z = [float(z) for z in Z]

# points = np.array((X, Y, Z), dtype=float)

# points.shape

# print(points)

ax.axes.set_yticks(Y)

# ax = plt.gca()
# ax.hold(True)


ax.scatter(X, Y, Z, c ='r', marker='o')

ax.set_xlabel('imbr')
ax.set_ylabel('min_pos')
ax.set_zlabel('accu')

plt.show()

In [None]:
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

xs = wines['residual sugar']
ys = wines['fixed acidity']
zs = wines['alcohol']
ax.scatter(xs, ys, zs, s=50, alpha=0.6, edgecolors='w')

ax.set_xlabel('Residual Sugar')
ax.set_ylabel('Fixed Acidity')
ax.set_zlabel('Alcohol')

shape.colors

```average odds difference = 0.5((FPR_unpriv-FPR_priv)+(TPR_unpriv-TPR_priv))``` must be close to zero for the classifier to be fair.

For a classifier trained with reweighted training data, at the best classification rate, this is indeed the case.
This implies fairness.

In [None]:
import matplotlib.cm as cmx
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

def scatter3d(x,y,z, cs, colorsMap='jet'):
    cm = plt.get_cmap(colorsMap)
    cNorm = matplotlib.colors.Normalize(vmin=min(cs), vmax=max(cs))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
    fig = plt.figure()
    ax = Axes3D(fig)
    ax.scatter(x, y, z, c=scalarMap.to_rgba(cs))
    scalarMap.set_array(cs)
    fig.colorbar(scalarMap)
    plt.show()

bingle = scatter3d(df_og['imbr'], df_og['difr'], df_og['accu'], )
xs = df_og['imbr']
ys = df_og['difr']
zs = df_og['accu']

print(xs.shape)
# col = [cm(float(i)/(40)) for i in xrange(40)]

col = np.arange(40)
ax.scatter(xs, ys, zs, s=20, c=col)



ax.set_xlabel('Imbalance')
ax.set_ylabel('Differential Positives')
ax.set_zlabel('Accuracy')

# Summary of Results
We show the optimal classification thresholds, and the fairness and accuracy metrics.

### Classification Thresholds

| Dataset |Classification threshold|
|-|-|
|Adult||0.2674|
|German|0.6732|
|Compas|0.5148|

### Fairness Metric: Disparate impact, Accuracy Metric: Balanced accuracy

#### Performance

| Dataset |Sex (Acc-Bef)|Sex (Acc-Aft)|Sex (Fair-Bef)|Sex (Fair-Aft)|Race/Age (Acc-Bef)|Race/Age (Acc-Aft)|Race/Age (Fair-Bef)|Race/Age (Fair-Aft)|
|-|-|-|-|-|-|-|-|-|
|Adult (Test)|0.7417|0.7128|0.2774|0.7625|0.7417|0.7443|0.4423|0.7430|
|German (Test)|0.6524|0.6460|0.9948|1.0852|0.6524|0.6460|0.3824|0.5735|
|Compas (Test)|0.6774|0.6562|0.6631|0.8342|0.6774|0.6342|0.6600|1.1062|



### Fairness Metric: Average odds difference, Accuracy Metric: Balanced accuracy

#### Performance

| Dataset |Sex (Acc-Bef)|Sex (Acc-Aft)|Sex (Fair-Bef)|Sex (Fair-Aft)|Race/Age (Acc-Bef)|Race/Age (Acc-Aft)|Race/Age (Fair-Bef)|Race/Age (Fair-Aft)|
|-|-|-|-|-|-|-|-|-|
|Adult (Test)|0.7417|0.7128|-0.3281|-0.0266|0.7417|0.7443|-0.1991|-0.0395|
|German (Test)|0.6524|0.6460|0.0071|0.0550|0.6524|0.6460|-0.3278|-0.1944|
|Compas (Test)|0.6774|0.6562|-0.2439|-0.0946|0.6774|0.6342|-0.1927|0.1042|

In [None]:
biased_class = 'Gender'
privileged_groups = [{biased_class: 1}]
unprivileged_groups = [{biased_class: 0}]

numeric = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

imb_list = ['1', '2', '5', '10', '20', '50', '100']
dif_list = ['50_40', '50_20', '50_10', '50_5', '50_1', '40_20', '40_10', '40_5', '40_1', '20_10', '20_5', '20_1', '10_5', '10_1', '5_1']
deid_list = ['og', 'deid4', 'deid8', 'deid16']

#matrix = pd.read_csv(r'/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Outputs/all_data_matrix_exclude_protected.csv', dtype=str)
file_loc = '/home/andrew/python-virtual-environments/AIF360/aif360/data/raw/artificial/TESTY/'

indexer = []
imbr_list = []
deidr_list = []
maj_pos_list  = []
min_pos_list = []
accu_list = []
maj_acc_list = []
min_acc_list = []
min_chg_pos_outcome_list = []
maj_chg_pos_outcome_list = []
disp_imp_list = []
num_pos_outcome = []

accu_list_std_dev = []
maj_acc_list_std_dev = []
min_acc_list_std_dev = []
min_chg_pos_outcome_list_std_dev = []
maj_chg_pos_outcome_list_std_dev = []
disp_imp_list_std_dev = []
num_pos_outcome_std_dev = []

all_datasets = {}


tic = time.clock()
for imbr in imb_list:
    print(f'imbr: {imbr}')
    for difr in dif_list:
        for deid in deid_list:
            file_name = imbr + '_' + difr + '_' + deid
            indexer.append(file_name)
            imbr_list.append(imbr)
            maj_pos_list.append(int(difr.split('_')[0]))
            min_pos_list.append(int(difr.split('_')[1]))
            deidr_list.append(deid)
            origin = file_loc + file_name

            if deid == 'og':      
                all_datasets[file_name] = {}
                all_datasets[file_name]['base_data'] = load_preproc_artificial_all_og(file_name, file_loc, protected_attributes = [biased_class])
            else:
                all_datasets[file_name] = {}
                all_datasets[file_name]['base_data'] = load_preproc_artificial_all_deid(file_name, file_loc, protected_attributes = [biased_class])


            df = all_datasets[file_name]['base_data'].convert_to_dataframe()[0]
            #print(df.head())
            
            df = scale_it(df, numeric)
            

            accuracy = []
            accuracy_std_dev = []
            disparate = []
            disparate_std_dev = []
            min_acc = []
            min_acc_std_dev = []
            maj_acc = []
            maj_acc_std_dev = []
            min_change = []
            min_change_std_dev = []
            maj_change = []
            maj_change_std_dev = []
            num_pos = []
            num_pos_std_dev = []

            test = 10
            for i in range(test):
                #print(f'imbr: {imbr}, difr: {difr}')
                X_train, X_test, y_train, y_test = split_it(df) 
                num_pos_og, accuracy_test, accuracy_train, acc_min_test, acc_maj_test, min_chg_pos_outcome, maj_chg_pos_outcome, disp_imp_test = gaussian_nb(X_train, X_test, y_train, y_test)

                accuracy.append(accuracy_test)
                disparate.append(disp_imp_test)
                min_acc.append(acc_min_test)
                maj_acc.append(acc_maj_test)
                min_change.append(min_chg_pos_outcome)
                maj_change.append(maj_chg_pos_outcome)
                num_pos.append(num_pos_og)
                
                
                
            
            accu_list.append(mean(accuracy))
            maj_acc_list.append(mean(min_acc))
            min_acc_list.append(mean(maj_acc))
            min_chg_pos_outcome_list.append(mean(min_change))
            maj_chg_pos_outcome_list.append(mean(maj_change))
            disp_imp_list.append(mean(disparate))
            num_pos_outcome.append(mean(num_pos))
            
            accu_list_std_dev.append(stdev(accuracy))
            maj_acc_list_std_dev.append(stdev(min_acc))
            min_acc_list_std_dev.append(stdev(maj_acc))
            min_chg_pos_outcome_list_std_dev.append(stdev(min_change))
            maj_chg_pos_outcome_list_std_dev.append(stdev(maj_change))
            disp_imp_list_std_dev.append(stdev(disparate))
            num_pos_outcome_std_dev.append(stdev(num_pos))
            
toc = time.clock()            
tic_toc = toc-tic       
            

matrix = pd.DataFrame({'dataset': indexer})
matrix['imbr'] = imbr_list
matrix['maj_pos'] = maj_pos_list
matrix['min_pos'] = min_pos_list
matrix['deid'] = deidr_list   
matrix['accu'] = accu_list
matrix['maj_accu'] = maj_acc_list
matrix['min_accu'] = min_acc_list
matrix['min_chg_pos_outcome'] = min_chg_pos_outcome_list
matrix['maj_chg_pos_outcome'] = maj_chg_pos_outcome_list
matrix['disp_imp'] = disp_imp_list
matrix['num_pos_outcome'] = num_pos_outcome

matrix['accu_std_dev'] = accu_list_std_dev
matrix['maj_accu_std_dev'] = maj_acc_list_std_dev
matrix['min_accu_std_dev'] = min_acc_list_std_dev
matrix['min_chg_pos_outcome_std_dev'] = min_chg_pos_outcome_list_std_dev
matrix['maj_chg_pos_outcome_std_dev'] = maj_chg_pos_outcome_list_std_dev
matrix['disp_imp_std_dev'] = disp_imp_list_std_dev
matrix['num_pos_outcome_std_dev'] = num_pos_outcome_std_dev

file_dest = '/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Outputs/'
file_name = file_dest + 'all_data_matrix_exclude_protected_imb_dif_' + 'GAUSIAN_NB_' + str(tic_toc)
# matrix.to_csv(file_name, index=False)
matrix
print(f'COMPLETE IN: {tic_toc}')

In [None]:
biased_class = 'Gender'
privileged_groups = [{biased_class: 1}]
unprivileged_groups = [{biased_class: 0}]

numeric = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

imb_list = ['1']#, '2', '5', '10', '20', '50', '100']
dif_list = ['50_40']#, '50_20', '50_10', '50_5', '50_1', '40_20', '40_10', '40_5', '40_1', '20_10', '20_5', '20_1', '10_5', '10_1', '5_1']
deid_list = ['og']#, 'deid4', 'deid8', 'deid16']

#matrix = pd.read_csv(r'/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Outputs/all_data_matrix_exclude_protected.csv', dtype=str)
file_loc = '/home/andrew/python-virtual-environments/AIF360/aif360/data/raw/artificial/TESTY/'

indexer = []
imbr_list = []
deidr_list = []
maj_pos_list  = []
min_pos_list = []
accu_list = []
maj_acc_list = []
min_acc_list = []
min_chg_pos_outcome_list = []
maj_chg_pos_outcome_list = []
disp_imp_list = []
num_pos_outcome = []

accu_list_std_dev = []
maj_acc_list_std_dev = []
min_acc_list_std_dev = []
min_chg_pos_outcome_list_std_dev = []
maj_chg_pos_outcome_list_std_dev = []
disp_imp_list_std_dev = []
num_pos_outcome_std_dev = []

all_datasets = {}


tic = time.clock()
for imbr in imb_list:
    print(f'imbr: {imbr}')
    for difr in dif_list:
        for deid in deid_list:
            file_name = imbr + '_' + difr + '_' + deid
            indexer.append(file_name)
            imbr_list.append(imbr)
            maj_pos_list.append(int(difr.split('_')[0]))
            min_pos_list.append(int(difr.split('_')[1]))
            deidr_list.append(deid)
            origin = file_loc + file_name

            if deid == 'og':      
                all_datasets[file_name] = {}
                all_datasets[file_name]['base_data'] = load_preproc_artificial_all_og(file_name, file_loc, protected_attributes = [biased_class])
            else:
                all_datasets[file_name] = {}
                all_datasets[file_name]['base_data'] = load_preproc_artificial_all_deid(file_name, file_loc, protected_attributes = [biased_class])


            df = all_datasets[file_name]['base_data'].convert_to_dataframe()[0]
            #print(df.head())
            
            df = scale_it(df, numeric)
            
            test = 1
            accuracy = []
            accuracy_std_dev = []
            disparate = []
            disparate_std_dev = []
            min_acc = []
            min_acc_std_dev = []
            maj_acc = []
            maj_acc_std_dev = []
            min_change = []
            min_change_std_dev = []
            maj_change = []
            maj_change_std_dev = []
            num_pos = []
            num_pos_std_dev = []
            
            for i in range(test):
                X_train, X_test, y_train, y_test = split_it(df) 
                num_pos_og, accuracy_test, accuracy_train, acc_min_test, acc_maj_test, min_chg_pos_outcome, maj_chg_pos_outcome, disp_imp_test = dif_priv_log(X_train, X_test, y_train, y_test)

                
                
                accuracy = []
                epsilons = np.logspace(-3, 1, 500)

                for eps in epsilons:
                    dp_clf = dp.LogisticRegression(epsilon=eps, data_norm=100)
                    dp_clf.fit(X_train, y_train)
                    accuracy.append(dp_clf.score(X_test, y_test))
                
#                 accuracy.append(accuracy_test)
#                 disparate.append(disp_imp_test)
#                 min_acc.append(acc_min_test)
#                 maj_acc.append(acc_maj_test)
#                 min_change.append(min_chg_pos_outcome)
#                 maj_change.append(maj_chg_pos_outcome)
#                 num_pos.append(num_pos_og)
                
                
                
            
#             accu_list.append(mean(accuracy))
#             maj_acc_list.append(mean(min_acc))
#             min_acc_list.append(mean(maj_acc))
#             min_chg_pos_outcome_list.append(mean(min_change))
#             maj_chg_pos_outcome_list.append(mean(maj_change))
#             disp_imp_list.append(mean(disparate))
#             num_pos_outcome.append(mean(num_pos))
            
#             accu_list_std_dev.append(stdev(accuracy))
#             maj_acc_list_std_dev.append(stdev(min_acc))
#             min_acc_list_std_dev.append(stdev(maj_acc))
#             min_chg_pos_outcome_list_std_dev.append(stdev(min_change))
#             maj_chg_pos_outcome_list_std_dev.append(stdev(maj_change))
#             disp_imp_list_std_dev.append(stdev(disparate))
#             num_pos_outcome_std_dev.append(stdev(num_pos))
            
toc = time.clock()            
tic_toc = toc-tic       
            

# matrix = pd.DataFrame({'dataset': indexer})
# matrix['imbr'] = imbr_list
# matrix['maj_pos'] = maj_pos_list
# matrix['min_pos'] = min_pos_list
# matrix['deid'] = deidr_list   
# matrix['accu'] = accu_list
# matrix['maj_accu'] = maj_acc_list
# matrix['min_accu'] = min_acc_list
# matrix['min_chg_pos_outcome'] = min_chg_pos_outcome_list
# matrix['maj_chg_pos_outcome'] = maj_chg_pos_outcome_list
# matrix['disp_imp'] = disp_imp_list
# matrix['num_pos_outcome'] = num_pos_outcome

# matrix['accu_std_dev'] = accu_list_std_dev
# matrix['maj_accu_std_dev'] = maj_acc_list_std_dev
# matrix['min_accu_std_dev'] = min_acc_list_std_dev
# matrix['min_chg_pos_outcome_std_dev'] = min_chg_pos_outcome_list_std_dev
# matrix['maj_chg_pos_outcome_std_dev'] = maj_chg_pos_outcome_list_std_dev
# matrix['disp_imp_std_dev'] = disp_imp_list_std_dev
# matrix['num_pos_outcome_std_dev'] = num_pos_outcome_std_dev

# file_dest = '/home/andrew/python-virtual-environments/Deid-Mitigation-Comparison/Outputs/'
# file_name = file_dest + 'all_data_matrix_exclude_protected_imb_dif_' + 'dif_priv_GAUSIAN_NB_' + str(tic_toc)
# matrix.to_csv(file_name, index=False)
# matrix
print(f'COMPLETE IN: {tic_toc}')
print(max(accuracy))
print(accuracy.index(max(accuracy)))
epsilons[accuracy.index(max(accuracy))]

import matplotlib.pyplot as plt
import pickle

pickle.dump((epsilons, accuracy), open("lr_accuracy_500.p", "wb" ) )

epsilons, accuracy = pickle.load(open("lr_accuracy_500.p", "rb"))

plt.semilogx(epsilons, accuracy, label="Differentially private")
#plt.plot(epsilons, np.ones_like(epsilons) * baseline, dashes=[2,2], label="Non-private")
plt.title("Differentially private logistic regression accuracy")
plt.xlabel("epsilon")
plt.ylabel("Accuracy")
plt.ylim(0, 1)
plt.xlim(epsilons[0], epsilons[-1])
plt.legend(loc=3)
plt.show()