In [1]:
def calculate_mean_std(df_names, dfs):
    """Summarise each frame's train and fair rows by column-wise mean and std.

    For every ``(name, frame)`` pair the rows whose ``data`` column equals
    ``'train_cont_ord_cat'`` and those equal to ``'fair_cont_ord_cat'`` are
    reduced separately. Each reduction yields one output row carrying the
    statistic for every remaining column plus two label columns:
    ``dataset`` (the frame's name) and ``type`` (``'mean'`` or ``'std'``).

    Parameters
    ----------
    df_names : iterable of str
        Names paired position-wise with ``dfs``.
    dfs : iterable of pandas.DataFrame
        Frames containing a ``data`` column.

    Returns
    -------
    (perf_train, perf_fair) : tuple of pandas.DataFrame
        Two rows (mean, then std) per input frame, with a fresh 0..n-1 index.
    """
    split_markers = {'train': 'train_cont_ord_cat', 'fair': 'fair_cont_ord_cat'}
    collected = {'train': [], 'fair': []}

    def summary_row(subset, statistic, df_name):
        # ``statistic`` names the pandas reduction to call ('mean' / 'std').
        values = getattr(subset, statistic)()
        values['dataset'] = df_name
        values['type'] = statistic
        return values.to_frame().T

    for df_name, df in zip(df_names, dfs):
        for split, marker in split_markers.items():
            subset = df[df['data'] == marker].drop(columns=['data'])
            for statistic in ('mean', 'std'):
                collected[split].append(summary_row(subset, statistic, df_name))

    perf_train = pd.concat(collected['train']).reset_index(drop=True)
    perf_fair = pd.concat(collected['fair']).reset_index(drop=True)
    return perf_train, perf_fair

In [2]:
from scipy import stats

def _metric_by_iteration(dataset, classifier, algorithm, feature):
    """Return ``|feature|`` for one classifier/algorithm pair, indexed by iteration."""
    rows = dataset[(dataset['classifier'] == classifier) & (dataset['algorithm'] == algorithm)]
    # Keep one row per iteration so the iteration id is a unique pairing key.
    rows = rows.drop_duplicates(subset=['iteration']).sort_values(by=['iteration'])
    return rows.set_index('iteration')[feature].abs()


def _paired_verdict(other, reference, lower_is_better, alpha):
    """Classify a paired comparison as '+', '-', '=' or '?' (see callers)."""
    # Pair observations on the iterations both algorithms actually have.
    paired = pd.concat([other, reference], axis=1, join='inner',
                       keys=['other', 'reference']).dropna()
    if len(paired) < 3:
        # scipy's Shapiro-Wilk test needs at least three observations.
        return '?'
    other, reference = paired['other'], paired['reference']

    # shapiro() returns (statistic, pvalue); the result object itself is always
    # truthy, so normality has to be judged from the p-value.
    if stats.shapiro(other)[1] <= alpha or stats.shapiro(reference)[1] <= alpha:
        return '?'

    p_value = stats.ttest_rel(other, reference)[1]
    if p_value < alpha:
        if lower_is_better:
            reference_better = reference.mean() < other.mean()
        else:
            reference_better = reference.mean() > other.mean()
        return '+' if reference_better else '-'
    # Also reached when the p-value is NaN (e.g. identical samples).
    return '='


def _compare_against_reference(dataset, algorithm, features, lower_is_better, alpha):
    """Build the verdict table shared by the fairness and performance tests."""
    results = {'classifier': [], 'algorithm': []}
    results.update({feature: [] for feature in features})

    for classifier in pd.unique(dataset['classifier']):
        for other_algorithm in pd.unique(dataset['algorithm']):
            if other_algorithm == algorithm:
                continue
            results['classifier'].append(classifier)
            results['algorithm'].append(other_algorithm)
            for feature in features:
                other = _metric_by_iteration(dataset, classifier, other_algorithm, feature)
                reference = _metric_by_iteration(dataset, classifier, algorithm, feature)
                results[feature].append(
                    _paired_verdict(other, reference, lower_is_better, alpha)
                )
    return pd.DataFrame(results)


def calculate_test_fair(dataset, algorithm, alpha=0.05):
    """Paired t-tests of ``algorithm`` against every other algorithm on fairness metrics.

    For each classifier and each other algorithm, the absolute values of
    ``statistical_parity``, ``average_absolute_odds`` and ``equal_opportunity``
    are paired by ``iteration`` and compared with ``scipy.stats.ttest_rel``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs ``classifier``, ``algorithm``, ``iteration`` and the three
        metric columns.
    algorithm : str
        Reference algorithm; it is compared against all others.
    alpha : float, default 0.05
        Threshold used for both the Shapiro-Wilk normality check and the t-test.

    Returns
    -------
    pandas.DataFrame
        One row per (classifier, other algorithm). Each metric cell holds
        ``'+'`` (significant, reference has the lower mean absolute value),
        ``'-'`` (significant, reference does not), ``'='`` (not significant)
        or ``'?'`` (a sample fails the normality check, or fewer than three
        paired iterations are available).
    """
    features = ['statistical_parity', 'average_absolute_odds', 'equal_opportunity']
    return _compare_against_reference(dataset, algorithm, features,
                                      lower_is_better=True, alpha=alpha)


def calculate_test_pred(dataset, algorithm, alpha=0.05):
    """Paired t-tests of ``algorithm`` against every other algorithm on predictive metrics.

    Same procedure as ``calculate_test_fair`` but for ``accuracy``, ``f1`` and
    ``gmean``, where ``'+'`` means the difference is significant and the
    reference algorithm has the *higher* mean.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs ``classifier``, ``algorithm``, ``iteration`` and the three
        metric columns.
    algorithm : str
        Reference algorithm; it is compared against all others.
    alpha : float, default 0.05
        Threshold used for both the Shapiro-Wilk normality check and the t-test.

    Returns
    -------
    pandas.DataFrame
        Columns ``classifier``, ``algorithm``, ``accuracy``, ``f1``, ``gmean``
        with verdict symbols ``'+'``, ``'-'``, ``'='`` or ``'?'``.
    """
    features = ['accuracy', 'f1', 'gmean']
    return _compare_against_reference(dataset, algorithm, features,
                                      lower_is_better=False, alpha=alpha)

In [3]:
def aggregate_runs(dfs, drop_columns = None):
    """Mean and std of absolute metric values per (classifier, algorithm).

    The frames in ``dfs`` are stacked, the ``iteration`` column and any
    ``drop_columns`` are removed, every remaining non-key column is replaced
    by its absolute value, and the rows are grouped by ``classifier`` and
    ``algorithm``.

    Returns a frame indexed by (classifier, algorithm) whose columns are a
    two-level index of (metric, 'mean' | 'std').
    """
    group_keys = ['classifier', 'algorithm']
    stacked = pd.concat(dfs).reset_index(drop=True).drop(columns=['iteration'])
    if drop_columns is not None:
        stacked = stacked.drop(columns=drop_columns)
    metric_cols = [name for name in stacked.columns if name not in group_keys]
    stacked.loc[:, metric_cols] = stacked.loc[:, metric_cols].abs()
    return stacked.groupby(group_keys).agg(['mean', 'std'])

In [4]:
def print_to_latex(dfs_f, dfs_p=None, drop_columns_f=None, drop_columns_p=None):
    """Print the aggregated mean/std table(s) as LaTeX.

    Parameters
    ----------
    dfs_f : list of pandas.DataFrame
        Runs aggregated with ``aggregate_runs`` to form the table.
    dfs_p : list of pandas.DataFrame, optional
        Second set of runs. When given, its aggregate is joined to the first
        on the (classifier, algorithm) index; when omitted, only the first
        table is printed, so the function can be called with a single list.
    drop_columns_f, drop_columns_p : list of str, optional
        Columns removed from the respective runs before aggregating.

    Floats are rendered with four decimals.
    """
    table = aggregate_runs(dfs_f, drop_columns=drop_columns_f)
    if dfs_p is not None:
        table = table.join(aggregate_runs(dfs_p, drop_columns=drop_columns_p))
    print(table.to_latex(index=True, float_format="{:.4f}".format))

In [5]:
def aggregate_runs_c(dfs, drop_columns=None, fair=True):
    """
    Aggregate runs into a compact table with "mean (±std)" cells.

    The frames in ``dfs`` are concatenated, ``iteration`` and any
    ``drop_columns`` are removed, classifier/algorithm identifiers and metric
    column names are replaced by display labels, every metric is replaced by
    its absolute value, and mean/std are computed per (classifier, algorithm).

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Runs with ``classifier``, ``algorithm``, ``iteration`` and metric columns.
    drop_columns : list of str, optional
        Columns to discard before aggregating.
    fair : bool, default True
        Selects which header mapping is applied: fairness names
        (SPD / AAO / EOD) when True, performance names
        (Accuracy / F1 / G-mean) when False.

    Returns
    -------
    pandas.DataFrame
        Indexed by (classifier, algorithm); one column ``(metric, 'mean')``
        per metric holding strings such as ``'0.2000 (±0.141)'``.
    """
    classifier_labels = {'decision_tree': 'Decision Tree', 'logistic_regression': 'Logistic Regression', 'mlp': 'MLP'}
    algorithm_labels = {'fair_rbo': 'Fair-RBO', 'fair_rbh': 'Fair-RBH', 'fos': 'FOS', 'fawos': 'FAWOS', 'hfos': 'HFOS'}
    fairness_headers = {'statistical_parity': 'SPD', 'average_absolute_odds': 'AAO', 'equal_opportunity': 'EOD'}
    performance_headers = {'accuracy': 'Accuracy', 'f1': 'F1', 'gmean': 'G-mean'}

    df = pd.concat(dfs).reset_index(drop=True).drop(columns=['iteration'])
    if drop_columns is not None:
        df = df.drop(columns=drop_columns)

    # Assign the relabelled columns back instead of calling
    # ``df[col].replace(..., inplace=True)``: an in-place method on a selected
    # column is chained assignment, which pandas warns about and which does
    # not update ``df`` once Copy-on-Write is active.
    df['classifier'] = df['classifier'].replace(classifier_labels)
    df['algorithm'] = df['algorithm'].replace(algorithm_labels)
    df = df.rename(columns=fairness_headers if fair else performance_headers)

    key_cols = ['classifier', 'algorithm']
    metric_cols = [c for c in df.columns if c not in key_cols]
    df.loc[:, metric_cols] = df.loc[:, metric_cols].abs()

    grouped = df.groupby(key_cols).agg(['mean', 'std'])

    # Fold each metric's std into its mean column as text and drop the std column.
    combined = grouped.copy()
    for col in metric_cols:
        mean_text = grouped[(col, 'mean')].map('{:.4f}'.format)
        std_text = grouped[(col, 'std')].map('{:.3f}'.format)
        combined[(col, 'mean')] = mean_text + ' (±' + std_text + ')'
        combined = combined.drop((col, 'std'), axis=1)

    return combined

def print_to_latex_c(dfs_f, dfs_p, drop_columns_f=None, drop_columns_p=None):
    """
    Print a single LaTeX table built from two sets of runs.

    ``dfs_f`` is aggregated with ``aggregate_runs_c(..., fair=True)`` and
    ``dfs_p`` with ``fair=False``; the second table is joined onto the first
    by index and the result is printed with LaTeX escaping disabled.
    """
    fairness_table = aggregate_runs_c(dfs_f, drop_columns=drop_columns_f, fair=True)
    performance_table = aggregate_runs_c(dfs_p, drop_columns=drop_columns_p, fair=False)
    latex_source = fairness_table.join(performance_table).to_latex(index=True, escape=False)

    print(latex_source)

In [6]:
from matplotlib import pyplot as plt
import seaborn as sns

def aggregate_runs_no_mean(dfs, y):
    """Plot ``y`` per iteration without averaging, one panel per classifier.

    The frames in ``dfs`` are stacked, every column other than ``classifier``
    and ``algorithm`` is replaced by its absolute value, and a seaborn facet
    grid draws one line per algorithm (x = ``iteration``, y = ``y``) with
    error bars disabled. The figure is shown immediately; nothing is returned.
    """
    label_cols = ('classifier', 'algorithm')
    runs = pd.concat(dfs).reset_index(drop=True)
    value_cols = [name for name in runs.columns if name not in label_cols]
    runs.loc[:, value_cols] = runs.loc[:, value_cols].abs()

    grid = sns.FacetGrid(runs, col="classifier")
    grid.map_dataframe(sns.lineplot, x='iteration', y=y, hue='algorithm', errorbar=None)
    grid.add_legend()
    plt.show()

In [7]:
import numpy as np


def average_ranking(dfs, drop_columns=None, ascending=True):
    """Average per-iteration rank of each algorithm, per classifier.

    Within every (iteration, classifier) group the algorithms are ranked on
    the absolute value of each metric, and the ranks are then averaged over
    iterations for each (classifier, algorithm) pair.

    The absolute value is taken *before* ranking so that signed metrics are
    ordered by magnitude, as the other aggregation helpers in this notebook
    do; taking it after ranking has no effect because ranks are positive.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Runs with ``classifier``, ``algorithm``, ``iteration`` and metric columns.
    drop_columns : list of str, optional
        Columns to discard before ranking.
    ascending : bool, default True
        Passed to ``rank``: with True the smallest magnitude gets rank 1.

    Returns
    -------
    pandas.DataFrame
        Indexed by (classifier, algorithm) with columns ``(metric, 'mean')``.
    """
    df = pd.concat(dfs).reset_index(drop=True)
    if drop_columns is not None:
        df = df.drop(columns=drop_columns)

    key_cols = ['classifier', 'algorithm', 'iteration']
    metric_cols = [c for c in df.columns if c not in key_cols]

    # Keep the first row of each algorithm within an (iteration, classifier) group.
    df = df.drop_duplicates(subset=['iteration', 'classifier', 'algorithm'])

    magnitudes = df[metric_cols].abs()
    ranks = magnitudes.groupby([df['iteration'], df['classifier']]).rank(ascending=ascending)

    ranked = pd.concat([df[['classifier', 'algorithm']], ranks], axis=1)
    return ranked.groupby(['classifier', 'algorithm']).agg(['mean'])

# Cont ord cat

In [8]:
import os
import pandas as pd

# Experiment grid. Results are read from
# <main_path>/<algorithm>_<dataset>_<classifier>/<folder>/{fairness,performance}_<i>.csv
dataset_names = ['heart_disease']  # disabled: 'german', 'adult', 'bank'
# Alternative (disabled): one 'adult_<file stem>' entry per file in
# ../data/adult_census/sampled_all/new
classifier_names = ['decision_tree', 'mlp', 'logistic_regression']
algorithm_names = ['fair_rbh', 'fos', 'hfos', 'fawos', 'fair_rbo']  # disabled: 'fawos_hybrid'
# Result sub-folder per algorithm, position-aligned with algorithm_names.
folder_names = ['mean', '2024-06-29', '2024-06-29', '2024-06-29', '2024-06-29']
main_path = '../results'

# One inner list of per-run frames for each dataset, in dataset_names order.
dfs_fairness = []
dfs_performance = []

print(dataset_names)


def _tag_run(frame, classifier_name, algorithm_name, iteration):
    """Label a freshly read result frame in place and drop its ``data`` column.

    Rows whose ``data`` value contains 'fair' get the algorithm name; all
    other rows are labelled '-'.
    """
    row_count = len(frame)
    frame['classifier'] = [classifier_name] * row_count
    frame['algorithm'] = [f'{algorithm_name}' if 'fair' in split else '-' for split in frame['data']]
    frame['iteration'] = [iteration] * row_count
    frame.drop(columns=['data'], inplace=True)
    return frame


for dataset_name in dataset_names:
    df_data_fair = []
    df_data_perf = []
    for classifier_name in classifier_names:
        for folder_name, algorithm_name in zip(folder_names, algorithm_names):
            run_dir = os.path.join(main_path, f'{algorithm_name}_{dataset_name}_{classifier_name}', folder_name)
            for i in range(10):
                fair_path = os.path.join(run_dir, f'fairness_{i}.csv')
                # A run is loaded only when its fairness file is present.
                if not os.path.exists(fair_path):
                    continue
                perf_path = os.path.join(run_dir, f'performance_{i}.csv')
                df_fair = pd.read_csv(fair_path)
                df_performance = pd.read_csv(perf_path)
                df_fair = _tag_run(df_fair, classifier_name, algorithm_name, i)
                df_performance = _tag_run(df_performance, classifier_name, algorithm_name, i)
                df_data_fair.append(df_fair)
                df_data_perf.append(df_performance)
    dfs_fairness.append(df_data_fair)
    dfs_performance.append(df_data_perf)
                

['heart_disease']


In [9]:
# Fairness summary: mean/std of the absolute metrics per classifier and algorithm.
fairness_columns_to_drop = ['accuracy', 'disparate_impact', 'average_odds', 'adapted_disparate_impact']
for dataset_name, fairness_runs in zip(dataset_names, dfs_fairness):
    fairness_summary = aggregate_runs(fairness_runs, drop_columns=fairness_columns_to_drop)
    print(dataset_name)
    display(fairness_summary)
    print('----------------------------------------------------------------')

heart_disease


Unnamed: 0_level_0,Unnamed: 1_level_0,statistical_parity,statistical_parity,equal_opportunity,equal_opportunity,average_absolute_odds,average_absolute_odds
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std
classifier,algorithm,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
decision_tree,-,0.187026,0.070335,0.138645,0.031341,0.103817,0.022661
decision_tree,fair_rbh,0.168719,0.112431,0.148718,0.087703,0.115297,0.063776
decision_tree,fair_rbo,0.26636,0.083231,0.191392,0.103971,0.159094,0.075241
decision_tree,fawos,0.20455,0.103114,0.156136,0.090918,0.123378,0.072907
decision_tree,fos,0.301848,0.095786,0.194322,0.098854,0.188554,0.097012
decision_tree,hfos,0.147467,0.096148,0.092582,0.065081,0.113232,0.044032
logistic_regression,-,0.309201,0.030572,0.078297,0.052041,0.12443,0.022497
logistic_regression,fair_rbh,0.217744,0.020793,0.083516,0.083867,0.072833,0.04876
logistic_regression,fair_rbo,0.242682,0.067479,0.078205,0.055904,0.097688,0.041572
logistic_regression,fawos,0.266995,0.038424,0.109066,0.062138,0.122087,0.031306


----------------------------------------------------------------


In [10]:
# Performance summary: mean/std per classifier and algorithm
# (the fairness runs are iterated alongside but not used here).
performance_columns_to_drop = ['balanced_accuracy',]
for dataset_name, _fairness_runs, performance_runs in zip(dataset_names, dfs_fairness, dfs_performance):
    performance_summary = aggregate_runs(performance_runs, drop_columns=performance_columns_to_drop)
    print(dataset_name)
    display(performance_summary)
    print('----------------------------------------------------------------')

heart_disease


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,accuracy,f1,f1,gmean,gmean
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std
classifier,algorithm,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
decision_tree,-,0.759088,0.034672,0.739812,0.043781,0.75617,0.03682
decision_tree,fair_rbh,0.741611,0.017079,0.691875,0.034463,0.725346,0.025653
decision_tree,fair_rbo,0.74006,0.066977,0.708587,0.08694,0.7338,0.073149
decision_tree,fawos,0.74162,0.035152,0.716344,0.042306,0.737709,0.036878
decision_tree,fos,0.729476,0.020329,0.710089,0.017434,0.728202,0.019153
decision_tree,hfos,0.745529,0.047335,0.726773,0.063001,0.742586,0.051612
logistic_regression,-,0.811555,0.0196,0.790592,0.027208,0.806881,0.02184
logistic_regression,fair_rbh,0.816905,0.023061,0.793091,0.030962,0.810654,0.025605
logistic_regression,fair_rbo,0.804852,0.010462,0.787574,0.017107,0.801438,0.010252
logistic_regression,fawos,0.791402,0.024965,0.775312,0.027505,0.789474,0.024259


----------------------------------------------------------------


In [10]:
# Combined fairness + performance table per dataset, emitted as LaTeX.
latex_drop_options = {
    'drop_columns_f': ['accuracy', 'disparate_impact', 'average_odds', ],
    'drop_columns_p': ['balanced_accuracy',],
}
for dataset_name, fairness_runs, performance_runs in zip(dataset_names, dfs_fairness, dfs_performance):
    print(dataset_name)
    print_to_latex_c(fairness_runs, performance_runs, **latex_drop_options)
    print('----------------------------------------------------------------')

german
\begin{tabular}{llllllll}
\toprule
 &  & SPD & EOD & AAO & Accuracy & F1 & G-mean \\
 &  & mean & mean & mean & mean & mean & mean \\
classifier & algorithm &  &  &  &  &  &  \\
\midrule
\multirow[t]{5}{*}{Decision Tree} & - & 0.1233 (±0.035) & 0.1551 (±0.073) & 0.1319 (±0.031) & 0.6560 (±0.011) & 0.7511 (±0.008) & 0.5817 (±0.014) \\
 & FAWOS & 0.1031 (±0.032) & 0.1155 (±0.093) & 0.1252 (±0.039) & 0.6616 (±0.019) & 0.7554 (±0.018) & 0.5868 (±0.019) \\
 & Fair-RBH & 0.1217 (±0.047) & 0.1657 (±0.088) & 0.1744 (±0.080) & 0.6024 (±0.018) & 0.6810 (±0.021) & 0.5979 (±0.021) \\
 & Fair-RBO & 0.1954 (±0.085) & 0.1972 (±0.066) & 0.2062 (±0.079) & 0.5704 (±0.012) & 0.6871 (±0.012) & 0.4694 (±0.024) \\
 & HFOS & 0.0942 (±0.053) & 0.1353 (±0.064) & 0.1128 (±0.035) & 0.6384 (±0.021) & 0.7317 (±0.019) & 0.5831 (±0.022) \\
\cline{1-8}
\multirow[t]{5}{*}{Logistic Regression} & - & 0.1332 (±0.035) & 0.1024 (±0.057) & 0.1619 (±0.053) & 0.7224 (±0.015) & 0.8166 (±0.015) & 0.5491 (±0.021) \\
 & FA

In [29]:
# Significance of 'fair_rbh' against every other algorithm on the fairness metrics.
for dataset_name, fairness_runs in zip(dataset_names, dfs_fairness):
    all_fairness_runs = pd.concat(fairness_runs).reset_index(drop=True)
    print(dataset_name)
    display(calculate_test_fair(all_fairness_runs, 'fair_rbh'))

heart_disease


Unnamed: 0,classifier,algorithm,statistical_parity,average_absolute_odds,equal_opportunity
0,decision_tree,-,=,=,=
1,decision_tree,fos,=,=,=
2,decision_tree,hfos,=,=,=
3,decision_tree,fawos,=,=,=
4,decision_tree,fair_rbo,=,=,=
5,mlp,-,=,=,=
6,mlp,fos,=,+,=
7,mlp,hfos,-,=,=
8,mlp,fawos,=,=,=
9,mlp,fair_rbo,=,=,=


In [30]:
# Significance of 'fair_rbh' against every other algorithm on the predictive metrics.
for dataset_name, performance_runs in zip(dataset_names, dfs_performance):
    all_performance_runs = pd.concat(performance_runs).reset_index(drop=True)
    print(dataset_name)
    display(calculate_test_pred(all_performance_runs, 'fair_rbh'))

heart_disease


Unnamed: 0,classifier,algorithm,accuracy,f1,gmean
0,decision_tree,-,=,=,=
1,decision_tree,fos,=,=,=
2,decision_tree,hfos,=,=,=
3,decision_tree,fawos,=,=,=
4,decision_tree,fair_rbo,=,=,=
5,mlp,-,+,+,+
6,mlp,fos,+,=,+
7,mlp,hfos,=,=,=
8,mlp,fawos,+,=,+
9,mlp,fair_rbo,=,=,=


In [None]:

# LaTeX table of the performance runs per dataset.
# NOTE(review): only one list of runs is passed, so print_to_latex has to
# accept a call without a second set of runs for this cell to execute.
for dataset_name, performance_runs in zip(dataset_names, dfs_performance):
    print(dataset_name)
    print_to_latex(performance_runs)
    print('----------------------------------------------------------------')

In [None]:
# Per-iteration curves of average_absolute_odds, one figure per dataset.
for dataset_name, fairness_runs in zip(dataset_names, dfs_fairness):
    print(dataset_name)
    aggregate_runs_no_mean(fairness_runs, y='average_absolute_odds')

In [None]:
# Per-iteration curves of gmean, one figure per dataset.
for dataset_name, performance_runs in zip(dataset_names, dfs_performance):
    print(dataset_name)
    aggregate_runs_no_mean(performance_runs, y='gmean')