In [1]:
import os 
from os.path import join, dirname
import pickle
import pandas as pd
import numpy as np

from tqdm import tqdm 

import sys
sys.path.append(dirname(os.getcwd()))

pd.set_option('max_colwidth', 100) 

In [2]:
'''Utility functions for computing results statistics'''
def ybar0_cnd_y1(cmat):
    '''Return cmat[1][0] as a fraction of row 1 of a 2x2 matrix.

    Going by the function name this is P(y_hat = 0 | y = 1), i.e. the false
    negative rate -- presumably rows index the true label and columns the
    predicted label; confirm against the code that builds the matrix.
    '''
    row1_col0 = cmat[1][0]
    row1_col1 = cmat[1][1]
    return row1_col0 / (row1_col0 + row1_col1)
def ybar1_cnd_y0(cmat):
    '''Return cmat[0][1] as a fraction of row 0 of a 2x2 matrix.

    Going by the function name this is P(y_hat = 1 | y = 0), i.e. the false
    positive rate -- presumably rows index the true label and columns the
    predicted label; confirm against the code that builds the matrix.
    '''
    row0_col1 = cmat[0][1]
    row0_col0 = cmat[0][0]
    return row0_col1 / (row0_col1 + row0_col0)

def ybar1(cmat):
    '''Return the share of all entries of a 2x2 matrix that sit in column 1.

    Going by the function name this is P(y_hat = 1), the positive prediction
    rate -- presumably column 1 holds the positive predictions; confirm
    against the code that builds the matrix.
    '''
    column1_total = cmat[0][1] + cmat[1][1]
    grand_total = cmat[0][0] + cmat[0][1] + cmat[1][0] + cmat[1][1]
    return column1_total / grand_total

def false_neg(raw):
    '''Absolute gap, in percentage points, between the two groups' ybar0_cnd_y1.

    For each of raw['sa_0'] and raw['sa_1'], the first element of the 'ood'
    entry is passed to ybar0_cnd_y1 as the matrix.
    '''
    rate_sa0 = ybar0_cnd_y1(raw['sa_0']['ood'][0])
    rate_sa1 = ybar0_cnd_y1(raw['sa_1']['ood'][0])
    return abs(rate_sa0 - rate_sa1) * 100

def false_pos(raw):
    '''Absolute gap, in percentage points, between the two groups' ybar1_cnd_y0.

    For each of raw['sa_0'] and raw['sa_1'], the first element of the 'ood'
    entry is passed to ybar1_cnd_y0 as the matrix.
    '''
    rate_sa0 = ybar1_cnd_y0(raw['sa_0']['ood'][0])
    rate_sa1 = ybar1_cnd_y0(raw['sa_1']['ood'][0])
    return abs(rate_sa0 - rate_sa1) * 100

def pos_pred(raw):
    '''Absolute gap, in percentage points, between the two groups' ybar1.

    For each of raw['sa_0'] and raw['sa_1'], the first element of the 'ood'
    entry is passed to ybar1 as the matrix.
    '''
    rate_sa0 = ybar1(raw['sa_0']['ood'][0])
    rate_sa1 = ybar1(raw['sa_1']['ood'][0])
    return abs(rate_sa0 - rate_sa1) * 100

# TODO: in the future, replace 'test' with 'ood'

In [3]:
def gen_final_table(df, ind_cols, res_cols, reindex=None):
    '''Build a "mean - std" summary table of res_cols grouped by ind_cols.

       df       = the final dataframe to be analyzed (left unmodified)
       ind_cols = list of columns to group by
       res_cols = list of numeric result columns to summarise
       reindex  = optional dict of {col_name: order}; each listed column is
                  cast to an ordered categorical so groups appear in that order

       Returns a dataframe of strings indexed by the groups, one column per
       entry of res_cols, each cell holding the group mean and the group
       standard deviation (pandas default), both with one decimal place.'''

    # Work on a private copy: casting columns on the caller's frame mutated
    # their data and raised SettingWithCopyWarning when a slice was passed in.
    df = df.copy()

    #Reorder stuff for the groupby
    for c, order in (reindex or {}).items():
        category = pd.api.types.CategoricalDtype(categories=order, ordered=True)
        df[c] = df[c].astype(category)

    grouped = df.groupby(ind_cols)[res_cols]
    df_mean = grouped.mean()
    df_std = grouped.std()

    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    mean_txt = df_mean.apply(lambda col: col.map(lambda x: "{0:01.1f}".format(x)))
    std_txt = df_std.apply(lambda col: col.map(lambda x: '  {}  '.format('- ') + "{0:01.1f}".format(x)))
    df_full = mean_txt + std_txt

    return df_full


# Load Results  

In [14]:
# Loads all the results into a central dataframe: one row per run that has a
# '<id>_baseline.pkl' file, paired with the matching '<id>_irm.pkl' file.

resdir = '/YOUR/RESULTS/DIR/HERE'

# Accuracy columns, in the order they are appended to every row
acc_cols = ['base_train', 'IOD_ERM', 'OOD_ERM', 'irm_train', 'IOD_IRM', 'OOD_IRM']
# Keys read from base_data['id']['params'] and the column names they receive
param_keys = ['seed', 'env_id', 'sens_att', 'word_encoding', 'tox_thresh', 'explicit_sa']
param_cols = ['seed', 'env_splits', 'SA', 'word_encoding', 'tox_thresh', 'explicit_sa']


def _load_pickle(path):
    '''Unpickle `path`, closing the file handle even when loading fails.'''
    # NOTE(security): pickle.load can execute arbitrary code -- only point
    # resdir at results files from a trusted source.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _acc_percentages(data):
    '''Return [mean train, mean val, test] accuracy of one run, times 100.'''
    acc = data['results']['ACC']
    return [np.mean(acc['train']) * 100, np.mean(acc['val']) * 100, acc['test'] * 100]


rows = []
for f in tqdm(os.listdir(resdir)):
    parts = f.split('_')
    # Skip anything that is not '<digits>_baseline.pkl'; the length check
    # keeps names without an underscore from raising IndexError.
    if len(parts) < 2 or not parts[0].isdigit() or parts[1] != 'baseline.pkl':
        continue
    run_id = parts[0]

    # Load results data into memory
    try:
        base_data = _load_pickle(join(resdir, f))
        trt_data = _load_pickle(join(resdir, '{}_{}'.format(run_id, 'irm.pkl')))
    except Exception as err:
        # `except Exception` (not a bare except) lets Ctrl-C through; chaining
        # keeps the underlying error and file path visible in the traceback.
        raise Exception('One or more results files for run is broken') from err

    #Construct entry in dataframe
    params = base_data['id']['params']
    entry = [params[key] for key in param_keys]
    entry += _acc_percentages(base_data)
    entry += _acc_percentages(trt_data)
    entry += [run_id]
    rows.append(entry)

# Format the dataframe (passing columns up front also works when no runs were found)
resdf = pd.DataFrame(rows, columns=param_cols + acc_cols + ['id'])

# Display names; values without an entry are kept unchanged
sa_labels = {'new_LGBTQ': 'LGBTQ', 'black': 'Black', 'muslim': 'Muslim', 'mental': 'NeuroDiv'}
encoding_labels = {'embed_mean': 'EmbedMean', 'embed_sum': 'EmbedSum'}
resdf['SA'] = resdf['SA'].map(lambda x: sa_labels.get(x, x))
resdf['word_encoding'] = resdf['word_encoding'].map(lambda x: encoding_labels.get(x, x))

groupby_cols = ['SA', 'env_splits', 'word_encoding', 'tox_thresh']
# Select the columns with a list: selecting with a bare tuple of names is
# deprecated and rejected by recent pandas.
grouped = resdf.groupby(groupby_cols)[acc_cols]
test_mean = grouped.mean()
test_std = grouped.std()
# The mean and std tables must expose the same columns before being combined
# (the previous check compared each table with itself and could never fail).
assert set(test_mean.columns) ^ set(test_std.columns) == set()
# Column-wise Series.map replaces DataFrame.applymap, deprecated in recent pandas
test_full = test_mean.apply(lambda col: col.map(lambda x: "{0:01.1f}".format(x))) \
    + test_std.apply(lambda col: col.map(lambda x: '  {}  '.format('- ') + "{0:01.1f}".format(x)))

100%|██████████| 487/487 [00:00<00:00, 5925.17it/s]
  test_mean = resdf.groupby(groupby_cols)['base_train', 'IOD_ERM', 'OOD_ERM', 'irm_train', 'IOD_IRM', 'OOD_IRM'].mean()
  test_std = resdf.groupby(groupby_cols)['base_train', 'IOD_ERM', 'OOD_ERM', 'irm_train', 'IOD_IRM', 'OOD_IRM'].std()


# Generate Table 1

In [15]:
# Table 1: IOD / OOD accuracy of ERM vs. IRM per sensitive attribute,
# using only SBERT runs where explicit_sa is 0.
single = resdf.copy()
is_sbert = single['word_encoding'] == 'sbert'
no_explicit_sa = single['explicit_sa'] == 0
single = single[is_sbert & no_explicit_sa]

# Grouping key, result columns and row order of the table
single_groupby_cols = ['SA']
single_res_cols = ['IOD_ERM', 'IOD_IRM', 'OOD_ERM', 'OOD_IRM']
sa_row_order = {'SA': ['Black', 'Muslim', 'LGBTQ', 'NeuroDiv']}
single_full = gen_final_table(single, single_groupby_cols, single_res_cols, sa_row_order)

# Optionally present the four result columns under a two-level header;
# the product order matches the order of single_res_cols.
MULTINDEX = True
if MULTINDEX:
    header_levels = [['IOD', 'OOD'], ['ERM', 'IRM']]
    single_full.columns = pd.MultiIndex.from_product(header_levels)

single_full.head(100)

Unnamed: 0_level_0,IOD,IOD,OOD,OOD
Unnamed: 0_level_1,ERM,IRM,ERM,IRM
SA,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Black,85.5 - 0.5,79.9 - 1.3,48.8 - 0.6,58.5 - 2.7
Muslim,85.0 - 0.6,79.0 - 2.9,48.8 - 0.6,60.9 - 2.1
LGBTQ,84.0 - 0.4,79.9 - 2.0,55.4 - 0.6,61.1 - 3.8
NeuroDiv,82.5 - 1.2,79.9 - 1.8,62.9 - 1.7,60.2 - 1.6


# Generate Table 2

In [16]:
# Table 2: fairness gaps of ERM vs. IRM per sensitive attribute, using only
# SBERT runs where explicit_sa is 0. Filtering first and copying afterwards
# gives `single` its own data, so adding columns never touches `resdf`.
single = resdf[(resdf['word_encoding'] == 'sbert') & (resdf['explicit_sa'] == 0)].copy()

single_groupby_cols = ['SA']
single_res_cols = ['base_dEO', 'irm_dEO', 'base_DP', 'irm_DP']


def _load_raw(run_id, method):
    '''Return the 'raw' entry of '<run_id>_<method>.pkl' in resdir, closing the file afterwards.'''
    # NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
    with open(join(resdir, '{}_{}.pkl'.format(run_id, method)), 'rb') as fh:
        return pickle.load(fh)['raw']


#Generate fairness metrics: one record per run, in the row order of `single`
fairness_records = []
for run_id in single['id']:
    base_raw = _load_raw(run_id, 'baseline')
    irm_raw = _load_raw(run_id, 'irm')

    #Compute vals: dEO averages the false_pos and false_neg gaps, DP is the pos_pred gap
    fairness_records.append({
        'base_dEO': 0.5 * (false_pos(base_raw) + false_neg(base_raw)),
        'irm_dEO': 0.5 * (false_pos(irm_raw) + false_neg(irm_raw)),
        'base_DP': pos_pred(base_raw),
        'irm_DP': pos_pred(irm_raw),
    })

# Attach all metric columns at once, aligned on single's index, instead of
# growing the frame one cell at a time inside the loop.
single = single.join(pd.DataFrame(fairness_records, index=single.index, columns=single_res_cols))

#Generate table
single_full = gen_final_table(single, single_groupby_cols, single_res_cols, {'SA': ['Black', 'Muslim', 'LGBTQ', 'NeuroDiv']})
MULTINDEX = True
if MULTINDEX:
    # Product order matches the order of single_res_cols
    single_full.columns = pd.MultiIndex.from_product([['dEO', 'dDP'], ['ERM', 'IRM'],])

single_full.head(100)

Unnamed: 0_level_0,dEO,dEO,dDP,dDP
Unnamed: 0_level_1,ERM,IRM,ERM,IRM
SA,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Black,49.4 - 1.8,29.2 - 4.5,19.6 - 1.3,5.3 - 3.1
Muslim,47.7 - 0.9,24.3 - 4.9,18.9 - 1.1,8.8 - 5.1
LGBTQ,38.8 - 1.7,25.9 - 7.4,5.4 - 0.6,10.6 - 4.8
NeuroDiv,26.8 - 3.3,27.2 - 1.2,11.0 - 3.8,6.5 - 2.9


# Generate Table 3

In [17]:
# Table 3: OOD accuracy of ERM vs. IRM per sensitive attribute, comparing
# runs where explicit_sa is 0 against runs where it is 1 (SBERT runs only).
single = resdf[(resdf['word_encoding'] == 'sbert')].copy()

# Setup Table
single_groupby_cols = ['SA', 'explicit_sa']
single_res_cols = ['OOD_ERM', 'OOD_IRM']

proc_envs = []
for explicit_flag in (0, 1):
    # Copy each subset: gen_final_table casts columns on the frame it is given,
    # and handing it a live slice of `single` raised SettingWithCopyWarning.
    subset = single[single['explicit_sa'] == explicit_flag].copy()
    table = gen_final_table(subset, single_groupby_cols, single_res_cols, {'SA': ['Black', 'Muslim', 'LGBTQ', 'NeuroDiv']})
    # Tag the columns with the flag, then drop the explicit_sa index level so
    # the two tables can be joined on SA alone.
    table.columns = ['OOD_ERM_ESA{}'.format(explicit_flag), 'OOD_IRM_ESA{}'.format(explicit_flag)]
    table.index = table.index.droplevel(level=1)
    proc_envs.append(table)

single_full = proc_envs[0].join(proc_envs[1], how='outer')
# Reorder so both ERM columns come first, matching the header product below
single_full = single_full[['OOD_ERM_ESA0', 'OOD_ERM_ESA1', 'OOD_IRM_ESA0', 'OOD_IRM_ESA1']]
single_full.columns = pd.MultiIndex.from_product([['ERM', 'IRM'], ['non-explicit', 'explicit'],])

single_full.head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c].astype(category)


Unnamed: 0_level_0,ERM,ERM,IRM,IRM
Unnamed: 0_level_1,non-explicit,explicit,non-explicit,explicit
SA,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Black,48.8 - 0.6,39.6 - 1.2,58.5 - 2.7,57.6 - 2.1
Muslim,48.8 - 0.6,40.4 - 0.7,60.9 - 2.1,59.4 - 1.8
LGBTQ,55.4 - 0.6,40.9 - 0.6,61.1 - 3.8,58.2 - 3.3
NeuroDiv,62.9 - 1.7,46.5 - 1.2,60.2 - 1.6,56.6 - 2.8


# Generate Table 4

In [18]:
# Table 4: IOD / OOD accuracy of ERM vs. IRM for the two word-embedding
# encodings, using only runs where explicit_sa is 1.
single = resdf.copy()
uses_embedding = single['word_encoding'].isin(['EmbedSum', 'EmbedMean'])
has_explicit_sa = single['explicit_sa'] == 1
single = single[uses_embedding & has_explicit_sa]

# Grouping key, result columns and row order of the table
single_groupby_cols = ['word_encoding']
single_res_cols = ['IOD_ERM', 'IOD_IRM', 'OOD_ERM', 'OOD_IRM']
encoding_row_order = {'word_encoding': ['EmbedSum', 'EmbedMean']}
single_full = gen_final_table(single, single_groupby_cols, single_res_cols, encoding_row_order)

# Optionally present the four result columns under a two-level header;
# the product order matches the order of single_res_cols.
MULTINDEX = True
if MULTINDEX:
    header_levels = [['IOD', 'OOD'], ['ERM', 'IRM']]
    single_full.columns = pd.MultiIndex.from_product(header_levels)

single_full.head(100)

Unnamed: 0_level_0,IOD,IOD,OOD,OOD
Unnamed: 0_level_1,ERM,IRM,ERM,IRM
word_encoding,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
EmbedSum,84.3 - 0.9,79.7 - 1.4,56.2 - 2.5,62.0 - 2.4
EmbedMean,85.4 - 1.0,49.7 - 1.3,11.9 - 0.4,50.0 - 0.4
