In [1]:
import pandas as pd
import os
import numpy as np
from scipy import stats
import math

pd.options.mode.chained_assignment = None

NUM_TTESTS = 15
path = "results/output"

# Read all json files in the output folder
results = {}
for file in os.listdir(path):
    results[file[:-5]]=pd.read_json(os.path.join(path,file), orient="records")
results.keys()

dict_keys(['iclr_adult_antag_ds_rl_di', 'iclr_adult_antag_ds_rl_dp', 'iclr_adult_antag_ds_rl_eodds', 'iclr_adult_antag_ds_rl_eopp', 'iclr_adult_antag_ds_rl_pe', 'iclr_adult_fixed_ds_rl_di', 'iclr_adult_fixed_ds_rl_dp', 'iclr_adult_fixed_ds_rl_eodds', 'iclr_adult_fixed_ds_rl_eopp', 'iclr_adult_fixed_ds_rl_pe', 'iclr_brazil_antag_ds_rl_di', 'iclr_brazil_antag_ds_rl_dp', 'iclr_brazil_antag_ds_rl_eodds', 'iclr_brazil_antag_ds_rl_eopp', 'iclr_brazil_antag_ds_rl_pe', 'iclr_brazil_fixed_ds_rl_dp', 'iclr_brazil_fixed_ds_rl_eodds', 'iclr_brazil_fixed_ds_rl_eopp', 'iclr_brazil_fixed_ds_rl_pe'])

In [2]:
fc_map = {
    "di": "Disparate Impact",
    "dp": "Demographic Parity",
    "eodds": "Equalized Odds",
    "eopp": "Equal Opportunity",
    "pe": "Predictive Equality"
    }

model_map = {
    'SC'              : 'SC',
    'QSC'             : 'Quasi-SC',
    'QSRC'            : 'Shifty',
    'FairlearnSVC'    : 'Fairlearn',
    'FairConst'       : 'FairConst',
    'FairRobust'      : 'RFLearn'
}

def preprocess(df):
    df = df.loc[df["name"] != "SRC"]
    df["name"]=df["name"].map(model_map)
    return df

def aggragate_table(df):
    df_large = df.loc[df["n_train"] == 60000][["name", "original_nsf", "antagonist_acc_mean", "antagonist_failed_mean"]].reset_index(drop=True)
    df_large["Delta Acc"] = df_large["antagonist_acc_mean"].reset_index(drop=True) - df.loc[df["n_train"] == 10000]["antagonist_acc_mean"].reset_index(drop=True)
    df_large.set_index("name", inplace=True)
    df_large.index.name = None
    return df_large

In [11]:
def t_test(x1,x2, s1,s2,n1,n2):
    t = (x1-x2)/np.sqrt(s1**2/n1 + s2**2/n2)
    return t

def p_value(t, df):
    p = 1 - stats.t.cdf(t, df) # one-sided
    return p * NUM_TTESTS

def get_stats(df):
    df_shifty = df.loc[(df["name"] =="Shifty")&(df["n_train"]==60000)]
    df_max = df[(df["name"] !="Shifty")&(df["n_train"]==60000)]
    idmax = df_max.sort_values(by=["antagonist_acc_mean"], ascending=False)["antagonist_acc_mean"].idxmax()
    df_max = df_max.loc[idmax]
    # print(df_max["name"])

    n1 = df_shifty["antagonist_acc_count"].item()
    n2 = df_max["antagonist_acc_count"].item()

    x1 = df_shifty["antagonist_acc_mean"].item()
    x2 = df_max["antagonist_acc_mean"].item()

    s1 = df_shifty["antagonist_acc_std"].item()
    s2 = df_max["antagonist_acc_std"].item()

    print(x1,x2, s1,s2,n1,n2)

    doff = n1+n2-2
    if math.isnan(doff):
        return -1, None, None
    doff = int(doff)

    t = t_test(x1,x2, s1,s2,n1,n2)
    p = 2*(1-stats.t.cdf(abs(t), doff))
    print(p)

    return p, t, doff

def get_footnote(fixed_stats, antag_stats):
    if not fixed_stats[0] and not antag_stats[0]:
        return None
    if fixed_stats[0] > 0.05 and antag_stats[0] > 0.05:
        return None
    footnote = ""
    significant = False
    insufficient = False
    if fixed_stats[0] < 0.05:
        if fixed_stats[0] == -1:
            insufficient = True
        else:
            significant = True
            if fixed_stats[0] < 0.001:
                footnote += " Fixed Bounds: $p<0.001$, $t={:.3f}$, $df={}$.".format(fixed_stats[1], fixed_stats[2])
            else:
                footnote += " Fixed Bounds: $p={:.3f}$, $t={:.3f}$, $df={}$.".format(fixed_stats[0], fixed_stats[1], fixed_stats[2])
    if antag_stats[0] < 0.05:
        if antag_stats[0] == -1:
            insufficient = True
        else:
            significant = True
            if antag_stats[0] < 0.001:
                footnote += " Unknown Bounds: $p<0.001$, $t={:.3f}$, $df={}$.".format(antag_stats[1], antag_stats[2])
            else:
                footnote += " Unknown Bounds: $p={:.3f}$, $t={:.3f}$, $df={}$.".format(antag_stats[0], antag_stats[1], antag_stats[2])

    if significant:
        footnote = "*significantly worse than \\textbf{best model}." + footnote
    if insufficient:
        footnote += "°insufficient number of solutions to perform t-test"

    footnote = "{\\raggedright " + footnote + " \par}"

    return footnote
     

In [12]:
for dataset in ["adult", "brazil"]:
    for fc in ["di", "dp", "eodds", "eopp", "pe"]:
        fixed = results[f"iclr_{dataset}_fixed_ds_rl_{fc}"]
        antag = results[f"iclr_{dataset}_antag_ds_rl_{fc}"]
        fixed = preprocess(fixed)
        antag = preprocess(antag)
        fixed_stats = get_stats(fixed)
        antag_stats = get_stats(antag)
        fixed = aggragate_table(fixed)
        antag = aggragate_table(antag)
        combined = pd.concat({"Fixed Bounds": fixed, "Unknown Bounds": antag}, axis=1)
        combined.rename(columns={"original_nsf": "NSF", "antagonist_acc_mean": "Acc", "antagonist_failed_mean": "FR"}, inplace=True)
        latex = combined.style.format(
            na_rep="-", precision=3
            ).highlight_max(
                props="font-weight:bold", subset=[("Fixed Bounds", "Acc"), ("Unknown Bounds", "Acc")]
                ).highlight_min(
                    props="font-weight:bold", subset=[("Fixed Bounds", "FR"), ("Unknown Bounds", "FR")]
                    ).to_latex(
                        label=f"{fc}_{dataset}", caption=f"{fc_map[fc]} - {dataset} dataset", position="H", hrules=True, convert_css=True, multicol_align="c") # header = ["NSF", "acc orig", "acc depl"]
        latex = latex.replace("\midrule", "\cmidrule(r){2-5} \cmidrule{6-9}")
        footnote = get_footnote(fixed_stats, antag_stats)
        if footnote:
            latex = latex.replace("\end{table}", footnote + "\n\end{table}")
        print(latex)
        print()

nan 0.7857490300000001 nan 0.0014848716 nan 25.0
nan 0.8408337065 nan 0.0103867373 nan 16.0
\begin{table}[H]
\caption{Disparate Impact - adult dataset}
\label{di_adult}
\begin{tabular}{lrrrrrrrr}
\toprule
 & \multicolumn{4}{c}{Fixed Bounds} & \multicolumn{4}{c}{Unknown Bounds} \\
 & NSF & Acc & FR & Delta Acc & NSF & Acc & FR & Delta Acc \\
\cmidrule(r){2-5} \cmidrule{6-9}
FairConst & 0 & 0.781 & \bfseries 1.000 & -0.003 & 0 & 0.791 & \bfseries 1.000 & -0.030 \\
RFLearn & 0 & \bfseries 0.786 & \bfseries 1.000 & 0.000 & 0 & 0.826 & \bfseries 1.000 & -0.002 \\
Fairlearn & 0 & 0.782 & \bfseries 1.000 & 0.000 & 0 & \bfseries 0.841 & \bfseries 1.000 & 0.003 \\
Quasi-SC & 1 & - & - & - & 1 & - & - & - \\
Shifty & 1 & - & - & - & 1 & - & - & - \\
SC & 1 & - & - & - & 1 & - & - & - \\
\bottomrule
\end{tabular}
{\raggedright °insufficient number of solutions to perform t-test \par}
\end{table}


0.7558572755 0.7857243654 0.0074668123 0.0015871435 13.0 25.0
2.220446049250313e-16
0.7798652913 0.7

KeyError: 'iclr_brazil_fixed_ds_rl_di'