In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import sys
sys.path.append("../")

from aif360.sklearn.metrics import statistical_parity_difference
from aif360.sklearn.metrics import equal_opportunity_difference
from sklearn.metrics import accuracy_score,f1_score
from scipy.stats import hmean

from lib.utils import *

In [2]:
def get_results(source,priv_group,pos_label,suf=''):
    """Average per-fold accuracy and fairness metrics for every option set.

    Scans ``../log/cv/None/{source}{suf}/<option_set>/<fold>/results/`` and,
    for each fold, reads ``test_mask.csv``, ``pred.csv`` and ``y.csv``. The
    boolean mask selects the rows that are scored; the same mask is applied
    to the flattened array returned by ``load_prot('../data/{source}.mat')``,
    which is passed to the fairness metrics as the protected attribute.

    Parameters
    ----------
    source : str
        Dataset name; used for both the log directory and the ``.mat`` file.
    priv_group
        Protected-attribute value of the privileged group, forwarded
        unchanged to the aif360 fairness metrics.
    pos_label
        Label treated as the positive outcome, forwarded unchanged to the
        aif360 fairness metrics.
    suf : str, optional
        Suffix appended to ``source`` when building the log directory name.

    Returns
    -------
    pandas.DataFrame
        One row per complete option set with the columns ``set``, ``macro``,
        ``micro``, ``sp``, ``eo`` and ``eval`` (means over the folds), sorted
        by ``eval`` in descending order. ``sp`` and ``eo`` are means of the
        absolute per-fold differences. When no option set yields a row the
        frame is empty but still carries these columns.

    Raises
    ------
    FileNotFoundError
        If the log directory for ``source``/``suf`` does not exist.
    """
    columns = ['set', 'macro', 'micro', 'sp', 'eo', 'eval']
    root_path = f'../log/cv/None/{source}{suf}/'
    prot_all = load_prot('../data/'+str(source)+'.mat').flatten()

    # Collect plain records and build the frame once at the end instead of
    # re-concatenating a growing DataFrame on every iteration.
    rows = []
    for option_set in os.listdir(root_path):
        path = f'{root_path}{option_set}/'
        if not os.path.isdir(path):
            continue

        macro_list = []
        micro_list = []
        sp_list = []
        eo_list = []
        score_list = []
        missing = False
        for fold in os.listdir(path):
            results_dir = f'{path}{fold}/results'
            if not os.path.isdir(results_dir):
                # An incomplete option set is discarded below, so scoring
                # its remaining folds would be wasted work.
                missing = True
                break

            # test_mask.csv flags the rows of this fold that are evaluated
            # (the original comment refers to it as the validation mask).
            mask = np.loadtxt(f'{results_dir}/test_mask.csv').astype(bool)
            pred = np.loadtxt(f'{results_dir}/pred.csv').astype(np.int64)[mask]
            label = np.loadtxt(f'{results_dir}/y.csv').astype(np.int64)[mask]
            prot = prot_all[mask]

            macro = f1_score(label, pred, average='macro')
            micro = f1_score(label, pred, average='micro')
            # Only the magnitude of the disparity is kept, not its direction.
            sp = abs(statistical_parity_difference(label, pred, prot_attr=prot,
                                                   priv_group=priv_group, pos_label=pos_label))
            eo = abs(equal_opportunity_difference(label, pred, prot_attr=prot,
                                                  priv_group=priv_group, pos_label=pos_label))

            # Ranking score. Currently plain macro-F1; an accuracy/fairness
            # trade-off such as hmean([macro, 1.0-eo]) could be used instead.
            score = macro

            macro_list.append(macro)
            micro_list.append(micro)
            sp_list.append(sp)
            eo_list.append(eo)
            score_list.append(score)

        # A set with a fold lacking results, or with no folds at all, has
        # nothing meaningful to average (the latter would give a NaN row).
        if missing or not score_list:
            print(f'WARNING: no results for {option_set}')
            continue

        rows.append({
            'set':option_set,
            'macro':np.average(macro_list),
            'micro':np.average(micro_list),
            'sp':np.average(sp_list),
            'eo':np.average(eo_list),
            'eval':np.average(score_list),
        })

    # Passing `columns` keeps the schema even when `rows` is empty, so the
    # sort below cannot fail with a KeyError on a missing 'eval' column.
    results = pd.DataFrame(rows, columns=columns)
    return results.sort_values(by=['eval'], ascending=False)

In [3]:
def get_ppmi_results(source,priv_group,pos_label,suf=''):
    """Average per-fold accuracy and fairness metrics for every ppmi option set.

    Scans ``../log/cv/ppmi/{source}{suf}/<option_set>/<fold>/results/`` and,
    for each fold, reads ``test_mask.csv``, ``pred.csv`` and ``y.csv``. The
    boolean mask selects the rows that are scored; the same mask is applied
    to the flattened array returned by ``load_prot('../data/{source}.mat')``,
    which is passed to the fairness metrics as the protected attribute.

    Parameters
    ----------
    source : str
        Dataset name; used for both the log directory and the ``.mat`` file.
    priv_group
        Protected-attribute value of the privileged group, forwarded
        unchanged to the aif360 fairness metrics.
    pos_label
        Label treated as the positive outcome, forwarded unchanged to the
        aif360 fairness metrics.
    suf : str, optional
        Suffix appended to ``source`` when building the log directory name.

    Returns
    -------
    pandas.DataFrame
        One row per complete option set with the columns ``set``, ``macro``,
        ``micro``, ``sp``, ``eo`` and ``eval`` (means over the folds), sorted
        by ``eval`` in descending order. ``sp`` and ``eo`` are means of the
        absolute per-fold differences. When no option set yields a row the
        frame is empty but still carries these columns.

    Raises
    ------
    FileNotFoundError
        If the log directory for ``source``/``suf`` does not exist.
    """
    columns = ['set', 'macro', 'micro', 'sp', 'eo', 'eval']
    root_path = f'../log/cv/ppmi/{source}{suf}/'
    prot_all = load_prot('../data/'+str(source)+'.mat').flatten()

    # Collect plain records and build the frame once at the end instead of
    # re-concatenating a growing DataFrame on every iteration.
    rows = []
    for option_set in os.listdir(root_path):
        path = f'{root_path}{option_set}/'
        if not os.path.isdir(path):
            continue

        macro_list = []
        micro_list = []
        sp_list = []
        eo_list = []
        score_list = []
        missing = False
        for fold in os.listdir(path):
            results_dir = f'{path}{fold}/results'
            if not os.path.isdir(results_dir):
                # An incomplete option set is discarded below, so scoring
                # its remaining folds would be wasted work.
                missing = True
                break

            # test_mask.csv flags the rows of this fold that are evaluated
            # (the original comment refers to it as the validation mask).
            mask = np.loadtxt(f'{results_dir}/test_mask.csv').astype(bool)
            pred = np.loadtxt(f'{results_dir}/pred.csv').astype(np.int64)[mask]
            label = np.loadtxt(f'{results_dir}/y.csv').astype(np.int64)[mask]
            prot = prot_all[mask]

            macro = f1_score(label, pred, average='macro')
            micro = f1_score(label, pred, average='micro')
            # Only the magnitude of the disparity is kept, not its direction.
            sp = abs(statistical_parity_difference(label, pred, prot_attr=prot,
                                                   priv_group=priv_group, pos_label=pos_label))
            eo = abs(equal_opportunity_difference(label, pred, prot_attr=prot,
                                                  priv_group=priv_group, pos_label=pos_label))

            # Ranking score. Currently plain macro-F1; an accuracy/fairness
            # trade-off such as hmean([macro, 1.0-eo]) could be used instead.
            score = macro

            macro_list.append(macro)
            micro_list.append(micro)
            sp_list.append(sp)
            eo_list.append(eo)
            score_list.append(score)

        # A set with a fold lacking results, or with no folds at all, has
        # nothing meaningful to average (the latter would give a NaN row).
        if missing or not score_list:
            print(f'WARNING: no results for {option_set}')
            continue

        rows.append({
            'set':option_set,
            'macro':np.average(macro_list),
            'micro':np.average(micro_list),
            'sp':np.average(sp_list),
            'eo':np.average(eo_list),
            'eval':np.average(score_list),
        })

    # Passing `columns` keeps the schema even when `rows` is empty, so the
    # sort below cannot fail with a KeyError on a missing 'eval' column.
    results = pd.DataFrame(rows, columns=columns)
    return results.sort_values(by=['eval'], ascending=False)

In [4]:
# ABIDE, runs logged under ../log/cv/None/: show the ten option sets with the
# highest 'eval' score (get_results returns the frame sorted by 'eval', descending).
source = 'abide_large'
PRIV_GROUP = 1 # female=0, male=1
POS_LABEL = 1 # is autism: 1, not autism: 0

results = get_results(source,PRIV_GROUP,POS_LABEL)
results.head(10)

Unnamed: 0,set,macro,micro,sp,eo,eval
6,22,0.959093,0.959894,0.181077,0.085868,0.959093
13,20,0.95894,0.959747,0.193898,0.125337,0.95894
22,21,0.95894,0.959747,0.193898,0.125337,0.95894
0,24,0.958862,0.959731,0.181634,0.089952,0.958862
3,12,0.957974,0.958676,0.198238,0.119201,0.957974
1,23,0.957588,0.958449,0.180163,0.092516,0.957588
20,26,0.957581,0.958419,0.188636,0.086381,0.957581
16,11,0.956914,0.957634,0.20538,0.134201,0.956914
4,13,0.956775,0.957506,0.202815,0.136833,0.956775
7,25,0.956551,0.957463,0.178329,0.088945,0.956551


In [5]:
# ABIDE, runs logged under ../log/cv/ppmi/: same summary as for the 'None'
# runs, read through get_ppmi_results. This overwrites the global `results`.
source = 'abide_large'
PRIV_GROUP = 1 # female=0, male=1
POS_LABEL = 1 # is autism: 1, not autism: 0

results = get_ppmi_results(source,PRIV_GROUP,POS_LABEL)
results.head(10)

Unnamed: 0,set,macro,micro,sp,eo,eval
57,10,0.603872,0.650657,0.092454,0.226221,0.603872
59,19,0.602188,0.649028,0.09462,0.226221,0.602188
36,1,0.59963,0.647037,0.082719,0.220149,0.59963
5,46,0.420968,0.576774,0.044618,0.055714,0.420968
35,37,0.388786,0.561274,0.009602,0.007143,0.388786
44,20,0.373301,0.543346,0.00122,0.0425,0.373301
47,11,0.373301,0.543346,0.00122,0.0425,0.373301
74,2,0.372299,0.542304,0.003484,0.04,0.372299
13,25,0.351562,0.539179,0.00122,0.0025,0.351562
62,4,0.351562,0.539179,0.00122,0.0025,0.351562


In [6]:
# Restrict the current `results` frame to three hand-picked groups of option-set
# ids and rank the remaining rows by 'eval'.
# NOTE: `results` is whatever the most recently executed cell assigned (the ppmi
# frame in the recorded run); the sorted view is only displayed, not stored.
# NOTE(review): what the three groups have in common is not visible here — confirm.
alpha_sets_0 = ['010','011','012','013','014','015','016','017','018']
alpha_sets_1 = ['037','038','039','040','041','042','043','044','045']
alpha_sets_2 = ['064','065','066','067','068','069','070','071','072']
tmp = results[results['set'].isin(alpha_sets_0+alpha_sets_1+alpha_sets_2)]
tmp.sort_values(by=['eval'], ascending=False)

Unnamed: 0,set,macro,micro,sp,eo,eval
57,10,0.603872,0.650657,0.092454,0.226221,0.603872
35,37,0.388786,0.561274,0.009602,0.007143,0.388786
47,11,0.373301,0.543346,0.00122,0.0425,0.373301
17,40,0.349101,0.538137,0.0,0.0,0.349101
32,64,0.349101,0.538137,0.0,0.0,0.349101
37,39,0.349101,0.538137,0.0,0.0,0.349101
39,42,0.349101,0.538137,0.0,0.0,0.349101
20,65,0.349101,0.538137,0.0,0.0,0.349101
2,15,0.349101,0.538137,0.0,0.0,0.349101
3,12,0.349101,0.538137,0.0,0.0,0.349101


In [7]:
# compas_0, runs logged under ../log/cv/None/: ten option sets with the highest
# 'eval' score. Overwrites the global `results`.
source = 'compas_0'
PRIV_GROUP = 0 # Caucasian:0, African American:1
POS_LABEL = 1 # is recid: 1, not recid: 0

results = get_results(source,PRIV_GROUP,POS_LABEL)
results.head(10)

Unnamed: 0,set,macro,micro,sp,eo,eval
64,35,0.660093,0.664563,0.195961,0.217511,0.660093
71,34,0.659497,0.663183,0.18666,0.199132,0.659497
73,5,0.658619,0.663069,0.181704,0.203218,0.658619
63,32,0.658258,0.662188,0.181987,0.194195,0.658258
28,31,0.658254,0.661648,0.185914,0.194525,0.658254
27,36,0.656814,0.663287,0.19724,0.2231,0.656814
34,8,0.656396,0.661798,0.190684,0.218601,0.656396
65,61,0.656294,0.661768,0.186972,0.211589,0.656294
62,4,0.656067,0.660338,0.186684,0.207184,0.656067
38,6,0.655631,0.660802,0.182309,0.20564,0.655631


In [8]:
# compas_1, runs logged under ../log/cv/None/: ten option sets with the highest
# 'eval' score. Overwrites the global `results`.
source = 'compas_1'
PRIV_GROUP = 0 # Caucasian:0, African American:1
POS_LABEL = 1 # is recid: 1, not recid: 0

results = get_results(source,PRIV_GROUP,POS_LABEL)
results.head(10)

Unnamed: 0,set,macro,micro,sp,eo,eval
63,32,0.657494,0.661589,0.155029,0.186987,0.657494
25,7,0.657017,0.660885,0.14644,0.172531,0.657017
71,34,0.65353,0.657275,0.156125,0.183807,0.65353
62,4,0.652871,0.656793,0.129588,0.148406,0.652871
34,8,0.652349,0.657338,0.154973,0.173222,0.652349
28,31,0.651954,0.655661,0.156638,0.1709,0.651954
72,33,0.650521,0.655734,0.159877,0.180237,0.650521
33,30,0.650391,0.655105,0.161384,0.172568,0.650391
49,29,0.649932,0.654243,0.153097,0.175287,0.649932
56,28,0.647947,0.652096,0.155762,0.185191,0.647947


In [9]:
# pokec_trim_n_s__norm with the '__l01' log-directory suffix.
# NOTE: in the recorded run this cell raised FileNotFoundError because
# '../log/cv/None/pokec_trim_n_s__norm__l01/' did not exist, so `results`
# kept the value assigned by the previously executed cell.
source = 'pokec_trim_n_s__norm'
suf = '__l01'
PRIV_GROUP = 0 # MALE:0, FEMALE:1
POS_LABEL = 1 # SMOKE REGULARLY: 1, DOES NOT SMOKE: 0

results = get_results(source,PRIV_GROUP,POS_LABEL,suf)
results.head(10)

FileNotFoundError: [Errno 2] No such file or directory: '../log/cv/None/pokec_trim_n_s__norm__l01/'

In [None]:
# pokec_trim_n_s__norm without a suffix, i.e. read from
# '../log/cv/None/pokec_trim_n_s__norm/'. Overwrites the global `results`.
source = 'pokec_trim_n_s__norm'
suf = ''
PRIV_GROUP = 0 # MALE:0, FEMALE:1
POS_LABEL = 1 # SMOKE REGULARLY: 1, DOES NOT SMOKE: 0
 
results = get_results(source,PRIV_GROUP,POS_LABEL,suf)
results.head(10)

In [None]:
# Show the row of option set '031' in whichever `results` frame was computed last.
results[results['set']=='031']

In [None]:
# 'sp' is the fold mean of the absolute statistical parity difference, so
# ascending order lists the option sets closest to parity first.
results.sort_values(by=['sp'], ascending=True).head(5)

In [None]:
# 'eo' is the fold mean of the absolute equal opportunity difference, so
# ascending order lists the option sets with the smallest gap first.
results.sort_values(by=['eo'], ascending=True).head(5)

In [None]:
# pokec_trim_z_s__norm with the '__l01' log-directory suffix.
# NOTE(review): the analogous '__l01' directory for pokec_trim_n_s__norm was
# missing in the recorded run — confirm this one exists before relying on it.
source = 'pokec_trim_z_s__norm'
suf = '__l01'
PRIV_GROUP = 0 # MALE:0, FEMALE:1
POS_LABEL = 1 # SMOKE REGULARLY: 1, DOES NOT SMOKE: 0

results = get_results(source,PRIV_GROUP,POS_LABEL,suf)
results.head(10)

In [None]:
# pokec_trim_z_s__norm without a suffix, i.e. read from
# '../log/cv/None/pokec_trim_z_s__norm/'. Overwrites the global `results`.
source = 'pokec_trim_z_s__norm'
suf=''
PRIV_GROUP = 0 # MALE:0, FEMALE:1
POS_LABEL = 1 # SMOKE REGULARLY: 1, DOES NOT SMOKE: 0

results = get_results(source,PRIV_GROUP,POS_LABEL,suf)
results.head(10)

In [None]:
# Show the row of option set '058' in whichever `results` frame was computed last.
results[results['set']=='058']

In [None]:
# compas_1 again, but with POS_LABEL = 0: the fairness metrics (sp, eo) now
# treat 'not recid' as the positive outcome. The F1 columns are computed
# without pos_label and are therefore not affected by this setting.
# NOTE(review): the earlier compas_1 cell uses POS_LABEL = 1 — confirm that
# the switch to 0 here is intentional.
source = 'compas_1'
PRIV_GROUP = 0 # Caucasian:0, African American:1
POS_LABEL = 0 # is recid: 1, not recid: 0

results = get_results(source,PRIV_GROUP,POS_LABEL)
results.head(10)