In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from pingouin import wilcoxon

# Create the output folders up front so later cells can write into them
# without checking; existing folders are left untouched.
for output_dir in ('stats', 'imgs'):
    os.makedirs(output_dir, exist_ok=True)

# Apply seaborn's white-grid theme to every figure drawn in this notebook.
sns.set_theme(style="whitegrid")


In [18]:
def process_results(df):
    """Average the absolute fairness metrics per (variable, fold).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``variable``, ``fold``,
        ``statistical_parity``, ``equal_opportunity`` and ``average_odds``.
        Every remaining column is averaged as well, so it has to be numeric.

    Returns
    -------
    pandas.DataFrame
        One row per (variable, fold) pair with ``variable`` and ``fold`` as
        regular columns, followed by the mean of each remaining column. The
        three fairness metrics are converted to absolute values before
        averaging.
    """
    metric_cols = ['statistical_parity', 'equal_opportunity', 'average_odds']

    # Work on a copy: assigning the absolute values straight into `df` would
    # silently overwrite the signed metrics in the caller's frame.
    result = df.copy()
    result[metric_cols] = result[metric_cols].abs()

    grouped = result.groupby(['variable', 'fold']).agg('mean').reset_index()
    return grouped

In [19]:
# Make sure the 'grouped' folder exists.
os.makedirs('grouped', exist_ok=True)

# Only CSV files in results/ are treated as result tables.
result_files = [name for name in os.listdir('results') if name.endswith('.csv')]

for file in result_files:
    source_path = os.path.join('results', file)
    df = pd.read_csv(source_path, index_col=0)
    grouped_df = process_results(df)
    # NOTE(review): the aggregated table is written back over the input file
    # in results/, while the 'grouped' folder created above is never written
    # to. Confirm that overwriting the raw results is intended.
    grouped_df.to_csv(f"results/{file}")
    print(f"Processed {file}:")
    print(grouped_df.head())

Processed transformed_bank_default_add_noise_rf.csv:
  variable  fold  statistical_parity  equal_opportunity  average_odds
0      age     0            0.213237           0.157446      0.085196
1      age     1            0.228311           0.104517      0.078790
2      age     2            0.236424           0.144051      0.093749
3      age     3            0.230739           0.094268      0.092470
4      age     4            0.238206           0.161487      0.086730
Processed transformed_bank_poutcome_add_noise_rf.csv:
  variable  fold  statistical_parity  equal_opportunity  average_odds
0      age     0            0.207770           0.147127      0.082605
1      age     1            0.254228           0.095968      0.070946
2      age     2            0.231442           0.138892      0.105436
3      age     3            0.244155           0.107038      0.091112
4      age     4            0.242160           0.170953      0.106742
Processed transformed_bank_month_add_noise_rf.csv:
  

In [20]:
def make_plot(age_trans, base_age, feature_name, model_name, transformation):
    """Draw side-by-side box plots of the fairness metrics and save them as a PDF.

    The left panel shows the metrics of the transformed-feature results, the
    right panel those of the base dataset; both panels share the y axis.

    Parameters
    ----------
    age_trans : pandas.DataFrame
        Results for the transformed feature. Must contain the columns
        ``statistical_parity``, ``equal_opportunity`` and ``average_odds``.
    base_age : pandas.DataFrame
        Results for the base dataset, with the same three metric columns.
    feature_name, model_name, transformation : str
        Used only to build the output file name
        ``imgs/{feature_name}_{model_name}_feature_{transformation}.pdf``.

    Returns
    -------
    None
        The figure is written to disk and shown; nothing is returned.
    """
    metric_cols = ['statistical_parity', 'equal_opportunity', 'average_odds']
    tick_labels = ['Statistical Parity', 'Equal Opportunity', 'Average Odds']
    panels = [(age_trans, 'Transformed feature'), (base_age, 'Base Dataset')]

    fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
    for ax, (frame, title) in zip(axes, panels):
        sns.boxplot(data=frame[metric_cols], ax=ax)
        ax.set_xticklabels(tick_labels)
        ax.set_title(title)

    fig.savefig(f'imgs/{feature_name}_{model_name}_feature_{transformation}.pdf', bbox_inches='tight')
    plt.show()
    # Release the figure explicitly: this function is meant to be called once
    # per result file, and figures left open accumulate in memory.
    plt.close(fig)

In [21]:
def wilcoxon_test(base_age, age, feature_name, model_name, transformation):
    """Run pingouin's ``wilcoxon`` on each fairness metric and save the results.

    For every metric the test is run twice on ``base_age[metric]`` versus
    ``age[metric]``: once with ``alternative='two-sided'`` and once with
    ``alternative='less'``. The ``p-val`` and ``CLES`` columns of each result,
    together with the metric name, are written to

    * ``stats_two_sided/{feature_name}_{model_name}_{transformation}_wilcoxon.csv``
    * ``stats_greater/{feature_name}_{model_name}_{transformation}_wilcoxon.csv``

    Parameters
    ----------
    base_age : pandas.DataFrame
        Baseline results; must contain the three metric columns.
    age : pandas.DataFrame
        Results for the transformed feature; same metric columns.
        NOTE(review): the two frames are presumably expected to hold the same
        folds in the same row order, since the values are passed as pairs.
        Confirm against the caller.
    feature_name, model_name, transformation : str
        Used only to build the output file names.

    Returns
    -------
    None
    """
    metrics = ['statistical_parity', 'equal_opportunity', 'average_odds']
    # Output folder -> alternative hypothesis handed to pingouin.
    # NOTE(review): the 'stats_greater' folder stores the alternative='less'
    # runs (first sample = baseline). Presumably "greater" refers to the
    # transformed results being larger; confirm the intended direction.
    alternatives = {'stats_two_sided': 'two-sided', 'stats_greater': 'less'}

    for out_dir in alternatives:
        os.makedirs(out_dir, exist_ok=True)

    # Collect the one-row result frames and concatenate once per folder
    # instead of repeatedly concatenating onto an empty DataFrame.
    collected = {out_dir: [] for out_dir in alternatives}
    for metric in metrics:
        print(f"Wilcoxon test for {metric}:")
        for out_dir, alternative in alternatives.items():
            result = wilcoxon(base_age[metric], age[metric], alternative=alternative)
            result['metric'] = metric
            collected[out_dir].append(result)

    for out_dir, frames in collected.items():
        table = pd.concat(frames, ignore_index=True)
        table[['p-val', 'CLES', 'metric']].to_csv(
            f'{out_dir}/{feature_name}_{model_name}_{transformation}_wilcoxon.csv',
            index=False,
        )

In [22]:
# Baseline result files; every other CSV in results/ is compared against the
# baseline of the same model.
base_results = ['bank_rf.csv', 'bank_xgb.csv']

# Each baseline table is read from disk at most once and reused afterwards.
base_cache = {}

for ris in os.listdir('results'):
    # Skip the baselines themselves and anything that is not a CSV (hidden
    # files, checkpoint folders, ...), mirroring the '.csv' filter used when
    # the results were aggregated.
    if ris in base_results or not ris.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join('results', ris), index_col=0)

    # File names containing '_rf' belong to the random forest; everything
    # else is treated as XGBoost.
    model = 'rf' if '_rf' in ris else 'xgb'
    if model not in base_cache:
        base_cache[model] = pd.read_csv(f'results/bank_{model}.csv', index_col=0)
    base = base_cache[model]

    # File names look like 'transformed_bank_<feature>_<transformation>_<model>.csv'.
    name_parts = ris.split('_')
    feature = name_parts[2]
    # NOTE(review): only the first token after the feature is kept, so
    # 'transformed_bank_default_add_noise_rf.csv' yields 'add' rather than
    # 'add_noise'. Two transformations sharing a first token would write to
    # the same stats file. Confirm this is intended.
    transformation = name_parts[3].replace('.csv', '')
    print(f"Analyzing feature: {feature}")

    # Keep only the rows of the analysed feature; filtering builds new frames,
    # so the cached baseline is never modified.
    df = df[df['variable'] == feature]
    base = base[base['variable'] == feature]
    # make_plot(df, base, feature, model, transformation)
    wilcoxon_test(base, df, feature, model, transformation)


Analyzing feature: default
Wilcoxon test for statistical_parity:
Wilcoxon test for equal_opportunity:
Wilcoxon test for average_odds:
Analyzing feature: poutcome
Wilcoxon test for statistical_parity:
Wilcoxon test for equal_opportunity:
Wilcoxon test for average_odds:
Analyzing feature: month
Wilcoxon test for statistical_parity:
Wilcoxon test for equal_opportunity:
Wilcoxon test for average_odds:
Analyzing feature: default
Wilcoxon test for statistical_parity:
Wilcoxon test for equal_opportunity:
Wilcoxon test for average_odds:
Analyzing feature: previous
Wilcoxon test for statistical_parity:
Wilcoxon test for equal_opportunity:
Wilcoxon test for average_odds:
Analyzing feature: marital
Wilcoxon test for statistical_parity:
Wilcoxon test for equal_opportunity:
Wilcoxon test for average_odds:
Analyzing feature: month
Wilcoxon test for statistical_parity:
Wilcoxon test for equal_opportunity:
Wilcoxon test for average_odds:
Analyzing feature: poutcome
Wilcoxon test for statistical_parity

## Table setup

In [2]:
# One single-row frame of two-sided p-values per random-forest stats file.
rf_rows = []

# os.listdir returns entries in arbitrary order; sorting keeps the table rows
# in the same order on every run and machine.
for stats in sorted(os.listdir('stats_two_sided')):
    if 'rf_' in stats:
        df_stats = pd.read_csv(os.path.join('stats_two_sided', stats))
        # The feature name is the leading token of the stats file name.
        feature = stats.split('_')[0]
        # Reshape the per-metric rows into one row with a column per metric.
        df_stats = df_stats.pivot_table(columns='metric', values='p-val')
        df_stats['feature'] = feature
        rf_rows.append(df_stats)

# Concatenate once instead of growing the table inside the loop; fall back to
# an empty frame when no file matched.
full_table = pd.concat(rf_rows, axis=0) if rf_rows else pd.DataFrame()


In [3]:
# Relabel by name rather than by position, so a change in column order can
# never attach a header to the wrong metric. The columns-axis name left by
# the pivot is dropped to end up with a plain column index.
full_table = full_table.rename(columns={
    'average_odds': 'AOD',
    'equal_opportunity': 'EOD',
    'statistical_parity': 'SP',
    'feature': 'Feature',
}).rename_axis(None, axis=1)

In [4]:
# Put the feature name first, followed by the metrics in presentation order.
column_order = ['Feature', 'SP', 'AOD', 'EOD']
full_table = full_table.loc[:, column_order]

In [5]:
# Export the random-forest p-value table as LaTeX: no index column, floats
# printed with three decimals.
full_table.to_latex(
    buf='table_rf.tex',
    float_format="%.3f",
    index=False,
)

In [6]:
# One single-row frame of two-sided p-values per XGBoost stats file.
xgb_rows = []

# os.listdir returns entries in arbitrary order; sorting keeps the table rows
# in the same order on every run and machine.
for stats in sorted(os.listdir('stats_two_sided')):
    if 'xgb_' in stats:
        df_stats = pd.read_csv(os.path.join('stats_two_sided', stats))
        # The feature name is the leading token of the stats file name.
        feature = stats.split('_')[0]
        # Reshape the per-metric rows into one row with a column per metric.
        df_stats = df_stats.pivot_table(columns='metric', values='p-val')
        df_stats['feature'] = feature
        xgb_rows.append(df_stats)

if not xgb_rows:
    # Fail with a clear message instead of a column-length mismatch further down.
    raise ValueError("no 'xgb_' stats files found in 'stats_two_sided'")

# Concatenate once instead of growing the table inside the loop.
full_table = pd.concat(xgb_rows, axis=0)

# Relabel by name rather than by position so headers cannot end up on the
# wrong metric, and drop the columns-axis name left by the pivot.
full_table = full_table.rename(columns={
    'average_odds': 'AOD',
    'equal_opportunity': 'EOD',
    'statistical_parity': 'SP',
    'feature': 'Feature',
}).rename_axis(None, axis=1)
full_table = full_table[['Feature', 'SP', 'AOD', 'EOD']]
full_table.to_latex('table_xgb.tex', index=False, float_format="%.3f")