In [1]:
import numpy as np
import json

folder = '/Users/guha/workspace/schelterlabs/CleanML/result-wagyu_20230719'
cardio_age_threshold = 'age@45'
intersectional_formulation = 'pp_dagg'  # ['pp_dd', 'pagg_dd', 'pp_dagg']

with open(f'{folder}/USCensus_result.json') as f:
    adult_data = json.load(f)

with open(f'{folder}/ACSIncome_result.json') as f:
    folk_data = json.load(f)

with open(f'{folder}/Cardio-{cardio_age_threshold}_result.json') as f:
    heart_data = json.load(f)

with open(f'{folder}/GermanCredit_result.json') as f:
    german_data = json.load(f)

In [2]:
from scipy.stats import ttest_rel

def t_test(dirty, clean):
    """Comparing method"""
    def two_tailed_t_test(dirty, clean):
        n_d = len(dirty)
        n_c = len(clean)
        n = min(n_d, n_c)
        t, p = ttest_rel(clean[:n], dirty[:n])
        if np.isnan(t):
            t, p = 0, 1
        return {"t-stats":t, "p-value":p}

    def one_tailed_t_test(dirty, clean, direction):
        two_tail = two_tailed_t_test(dirty, clean)
        t, p_two = two_tail['t-stats'], two_tail['p-value']
        if direction == 'positive':
            if t > 0 :
                p = p_two * 0.5
            else:
                p = 1 - p_two * 0.5
        else:
            if t < 0:
                p = p_two * 0.5
            else:
                p = 1 - p_two * 0.5
        return {"t-stats":t, "p-value":p}

    result = {}
    result['two_tail'] = two_tailed_t_test(dirty, clean)
    result['one_tail_pos'] = one_tailed_t_test(dirty, clean, 'positive')
    result['one_tail_neg'] = one_tailed_t_test(dirty, clean, 'negative')
    return result

In [3]:
from collections import defaultdict

class EmptyGroupException(Exception):
    """
    Custom exception for when an intersectional group is empty,
    and therefore there are no stats in the stats JSON object
    with the corresponding dictionary key.
    """
    
    def __init__(self, repair, criteria, group):
        self.repair = repair
        self.criteria = "/".join(criteria)
        self.group = group
        super(EmptyGroupException, self).__init__(
            f"For cleaning method {repair} and criteria {criteria}, the group {group} is empty.")

def prefix(cleaning_method, criteria, intersectional_group):
    first, second = intersectional_group.split("_")
    return f'{cleaning_method}__{criteria[0]}_{first}__{criteria[1]}_{second}'

def stat_value(stat_name, stats, cleaning_method, criteria, *groups):
    count = 0
    for g in groups:
        stat_key = f"{prefix(cleaning_method, criteria, g)}__{stat_name}"
        try:
            count += stats[stat_key]
        except KeyError:
            raise EmptyGroupException(cleaning_method, criteria, g)
    return count

# Define priv and dis intersectional groups
# e.g. priv = intersectionally priv, i.e. priv_priv
#      dis = intersectionally dis, i.e. dis_dis
# This should be specified at the top of this notebook.
priv, dis = [], []
if intersectional_formulation == "pp_dd":
    priv = ["priv_priv"]
    dis = ["dis_dis"]
elif intersectional_formulation == "pagg_dd":
    priv = ["priv_priv", "priv_dis", "dis_priv"]
    dis = ["dis_dis"]
elif intersectional_formulation == "pp_dagg":
    priv = ["priv_priv"]
    dis = ["priv_dis", "dis_priv", "dis_dis"]

def compute_eo(stats, cleaning_method, criteria, flipped):
    if not flipped:
        priv_tp = stat_value("tp", stats, cleaning_method, criteria, *priv)
        priv_fn = stat_value("fn", stats, cleaning_method, criteria, *priv)
        dis_tp = stat_value("tp", stats, cleaning_method, criteria, *dis)
        dis_fn = stat_value("fn", stats, cleaning_method, criteria, *dis)

        return (priv_tp / (priv_tp + priv_fn)) - \
             (dis_tp / (dis_tp + dis_fn))
    else:
        priv_tn = stat_value("tn", stats, cleaning_method, criteria, *priv)
        priv_fp = stat_value("fp", stats, cleaning_method, criteria, *priv)
        dis_tn = stat_value("tn", stats, cleaning_method, criteria, *dis)
        dis_fp = stat_value("fp", stats, cleaning_method, criteria, *dis)

        return (priv_tn / (priv_tn + priv_fp)) - \
             (dis_tn / (dis_tn + dis_fp))

def compute_pp(stats, cleaning_method, criteria, flipped):
    if not flipped:
        priv_tp = stat_value("tp", stats, cleaning_method, criteria, *priv)
        priv_fp = stat_value("fp", stats, cleaning_method, criteria, *priv)
        dis_tp = stat_value("tp", stats, cleaning_method, criteria, *dis)
        dis_fp = stat_value("fp", stats, cleaning_method, criteria, *dis)

        return (priv_tp / (priv_tp + priv_fp)) - \
             (dis_tp / (dis_tp + dis_fp))
    else:
        priv_tn = stat_value("tn", stats, cleaning_method, criteria, *priv)
        priv_fn = stat_value("fn", stats, cleaning_method, criteria, *priv)
        dis_tn = stat_value("tn", stats, cleaning_method, criteria, *dis)
        dis_fn = stat_value("fn", stats, cleaning_method, criteria, *dis)

        return (priv_tn / (priv_tn + priv_fn)) - \
             (dis_tn / (dis_tn + dis_fn))

def count(data, dataset_name, target_criteria, error_type, model, metric_name, scoring, log_file, flipped=False):
    dirty_scores = []
    dirty_accs = []

    cleaning_scores = {}
    cleaning_accs = {}

    dirty = 'dirty'
    if error_type == 'missing_values':
        dirty = 'delete'

    for experiment in data:
        if error_type in experiment and model in experiment:
            split_seed = experiment.split("/")[1]
            train_method = experiment.split("/")[3]
            retrain_seed = experiment.split("/")[5]

            if train_method == dirty:
                # Missing values need special treatment, just deleting the corresponding rows from the test set
                # is not applicable in real-world scenarios, so we set a default way to treat the test data
                if error_type == 'missing_values':
                    try:
                        score = scoring(data[experiment], 'impute_mean_dummy', target_criteria, flipped)
                    except EmptyGroupException as e:
                        # leave empty intersectional groups out of the analysis
                        continue
                    dirty_scores.append(score)
                    dirty_accs.append(data[experiment]['impute_mean_dummy_test_acc'])
                else:
                    try:
                        score = scoring(data[experiment], dirty, target_criteria, flipped)
                    except EmptyGroupException as e:
                        # leave empty intersectional groups out of the analysis
                        continue
                    dirty_scores.append(score)
                    dirty_accs.append(data[experiment][dirty + '_test_acc'])

            if train_method != dirty:
                for test_method in [dirty, train_method]:
                    approach = (train_method, test_method)

                    if approach not in cleaning_scores:
                        cleaning_scores[approach] = []

                    if error_type == 'mislabel':
                        try:
                            scores = scoring(data[experiment], 'clean', target_criteria, flipped)
                        except EmptyGroupException as e:
                            # leave empty intersectional groups out of the analysis
                            continue
                    else:
                        try:
                            scores = scoring(data[experiment], test_method, target_criteria, flipped)
                        except EmptyGroupException as e:
                            # leave empty intersectional groups out of the analysis
                            continue
                    cleaning_scores[approach].append(scores)

                    if approach not in cleaning_accs:
                        cleaning_accs[approach] = []

                    if test_method == dirty:
                        cleaning_accs[approach].append(data[experiment][f'{dirty}_test_acc'])
                    else:
                        if error_type != 'mislabel':
                            cleaning_accs[approach].append(data[experiment][f'{train_method}_test_acc'])

    evaluate_scores(dirty_scores, cleaning_scores, dirty_accs, cleaning_accs,
                    dataset_name, target_criteria, metric_name, model, error_type, log_file)

In [4]:
def evaluate_scores(dirty_scores, cleaning_scores, dirty_accs, cleaning_accs,
                    dataset_name, target_criteria, metric_name, model, error_type, log_file):
    if len(cleaning_scores) > 0:
        # bonferroni correction
        alpha = 0.05 / len(cleaning_scores)

        for method, scores in cleaning_scores.items():
            test_results = t_test(dirty_scores, scores)

            repair_train, repair_clean = method
            test_repaired = repair_train == repair_clean

            difference = 'insignificant'

            if test_results['two_tail']['p-value'] < alpha:
                if test_results['one_tail_neg']['p-value'] < alpha:
                    difference = 'positive'
                if test_results['one_tail_pos']['p-value'] < alpha:
                    difference = 'negative'

            acc_test_results = t_test(dirty_accs, cleaning_accs[method])

            acc_difference = 'insignificant'

            if acc_test_results['two_tail']['p-value'] < alpha:
                if acc_test_results['one_tail_neg']['p-value'] < alpha:
                    acc_difference = 'negative'
                if acc_test_results['one_tail_pos']['p-value'] < alpha:
                    acc_difference = 'positive'

            if error_type == 'missing_values':
                repair_method = repair_train
                detection=''
            elif error_type == 'mislabel':
                repair_method = "flip"
                detection = repair_train
            else:
                tokens = repair_train.split("_impute")
                detection = tokens[0].replace('clean_', '')
                repair_method = 'impute_' + tokens[1]

            if not (error_type == 'mislabel' and (test_repaired or detection == 'shapley')):
                line = f'{dataset_name},{"/".join(target_criteria)},{metric_name},{model},{error_type},{detection},{repair_method},{test_repaired},{difference},{acc_difference}'

                if test_repaired or error_type == 'mislabel':
                    print(line)
                    log_file.write(f'{line}\n')

In [5]:
errors = ['outliers', 'missing_values', 'mislabel']
models = ['logistic_regression', 'knn_classification', 'XGBoost']
metrics = [('equal_opportunity', compute_eo), ('predictive_parity', compute_pp)]

results_filename = f'cleanml_intersectional-{cardio_age_threshold}-{intersectional_formulation}.csv'

with open(results_filename, 'w') as log_file:
    log_file.write('dataset,criteria,metric,model,error,detection,repair,test_repaired,fairness_impact,accuracy_impact\n')

    for metric, scoring in metrics:
        for error in errors:
            for model in models:
                count(adult_data, 'adult', ('sex', 'race'), error, model, metric, scoring, log_file)
                count(folk_data, 'folktables', ('sex', 'rac1p'), error, model, metric, scoring, log_file)
                count(heart_data, 'heart', ('gender', 'age'), error, model, metric, scoring, log_file, flipped=False)
                count(german_data, 'german', ('age', 'sex'), error, model, metric, scoring, log_file, flipped=True)
                count(german_data, 'german', ('age', 'foreign_worker'), error, model, metric, scoring, log_file, flipped=True)
                count(german_data, 'german', ('sex', 'foreign_worker'), error, model, metric, scoring, log_file, flipped=True)

adult,sex/race,equal_opportunity,logistic_regression,outliers,SD,impute__mean_dummy,True,insignificant,positive
adult,sex/race,equal_opportunity,logistic_regression,outliers,SD,impute__mode_dummy,True,positive,positive
adult,sex/race,equal_opportunity,logistic_regression,outliers,SD,impute__median_dummy,True,positive,positive
adult,sex/race,equal_opportunity,logistic_regression,outliers,IQR,impute__mean_dummy,True,negative,positive
adult,sex/race,equal_opportunity,logistic_regression,outliers,IQR,impute__mode_dummy,True,negative,insignificant
adult,sex/race,equal_opportunity,logistic_regression,outliers,IQR,impute__median_dummy,True,negative,insignificant
adult,sex/race,equal_opportunity,logistic_regression,outliers,IF,impute__mean_dummy,True,negative,positive
adult,sex/race,equal_opportunity,logistic_regression,outliers,IF,impute__mode_dummy,True,insignificant,positive
adult,sex/race,equal_opportunity,logistic_regression,outliers,IF,impute__median_dummy,True,insignificant,positive
fol

german,age/foreign_worker,equal_opportunity,knn_classification,outliers,SD,impute__mean_dummy,True,negative,insignificant
german,age/foreign_worker,equal_opportunity,knn_classification,outliers,SD,impute__mode_dummy,True,insignificant,insignificant
german,age/foreign_worker,equal_opportunity,knn_classification,outliers,SD,impute__median_dummy,True,insignificant,insignificant
german,age/foreign_worker,equal_opportunity,knn_classification,outliers,IQR,impute__mean_dummy,True,insignificant,insignificant
german,age/foreign_worker,equal_opportunity,knn_classification,outliers,IQR,impute__mode_dummy,True,insignificant,negative
german,age/foreign_worker,equal_opportunity,knn_classification,outliers,IQR,impute__median_dummy,True,insignificant,insignificant
german,age/foreign_worker,equal_opportunity,knn_classification,outliers,IF,impute__mean_dummy,True,insignificant,positive
german,age/foreign_worker,equal_opportunity,knn_classification,outliers,IF,impute__mode_dummy,True,insignificant,insign

german,sex/foreign_worker,equal_opportunity,logistic_regression,missing_values,,impute_mean_mode,True,insignificant,insignificant
german,sex/foreign_worker,equal_opportunity,logistic_regression,missing_values,,impute_mean_dummy,True,insignificant,insignificant
german,sex/foreign_worker,equal_opportunity,logistic_regression,missing_values,,impute_median_mode,True,insignificant,insignificant
german,sex/foreign_worker,equal_opportunity,logistic_regression,missing_values,,impute_median_dummy,True,insignificant,insignificant
german,sex/foreign_worker,equal_opportunity,logistic_regression,missing_values,,impute_mode_mode,True,insignificant,insignificant
german,sex/foreign_worker,equal_opportunity,logistic_regression,missing_values,,impute_mode_dummy,True,insignificant,insignificant
adult,sex/race,equal_opportunity,knn_classification,missing_values,,impute_mean_mode,True,negative,positive
adult,sex/race,equal_opportunity,knn_classification,missing_values,,impute_mean_dummy,True,negative,posit

adult,sex/race,predictive_parity,logistic_regression,outliers,SD,impute__mean_dummy,True,positive,positive
adult,sex/race,predictive_parity,logistic_regression,outliers,SD,impute__mode_dummy,True,positive,positive
adult,sex/race,predictive_parity,logistic_regression,outliers,SD,impute__median_dummy,True,positive,positive
adult,sex/race,predictive_parity,logistic_regression,outliers,IQR,impute__mean_dummy,True,positive,positive
adult,sex/race,predictive_parity,logistic_regression,outliers,IQR,impute__mode_dummy,True,positive,insignificant
adult,sex/race,predictive_parity,logistic_regression,outliers,IQR,impute__median_dummy,True,positive,insignificant
adult,sex/race,predictive_parity,logistic_regression,outliers,IF,impute__mean_dummy,True,positive,positive
adult,sex/race,predictive_parity,logistic_regression,outliers,IF,impute__mode_dummy,True,positive,positive
adult,sex/race,predictive_parity,logistic_regression,outliers,IF,impute__median_dummy,True,positive,positive
folktables,sex/rac

german,age/sex,predictive_parity,knn_classification,outliers,SD,impute__mean_dummy,True,positive,insignificant
german,age/sex,predictive_parity,knn_classification,outliers,SD,impute__mode_dummy,True,positive,insignificant
german,age/sex,predictive_parity,knn_classification,outliers,SD,impute__median_dummy,True,positive,insignificant
german,age/sex,predictive_parity,knn_classification,outliers,IQR,impute__mean_dummy,True,insignificant,insignificant
german,age/sex,predictive_parity,knn_classification,outliers,IQR,impute__mode_dummy,True,insignificant,negative
german,age/sex,predictive_parity,knn_classification,outliers,IQR,impute__median_dummy,True,insignificant,insignificant
german,age/sex,predictive_parity,knn_classification,outliers,IF,impute__mean_dummy,True,positive,positive
german,age/sex,predictive_parity,knn_classification,outliers,IF,impute__mode_dummy,True,positive,insignificant
german,age/sex,predictive_parity,knn_classification,outliers,IF,impute__median_dummy,True,positive,p

german,age/sex,predictive_parity,logistic_regression,missing_values,,impute_mean_mode,True,insignificant,insignificant
german,age/sex,predictive_parity,logistic_regression,missing_values,,impute_mean_dummy,True,negative,insignificant
german,age/sex,predictive_parity,logistic_regression,missing_values,,impute_median_mode,True,negative,insignificant
german,age/sex,predictive_parity,logistic_regression,missing_values,,impute_median_dummy,True,insignificant,insignificant
german,age/sex,predictive_parity,logistic_regression,missing_values,,impute_mode_mode,True,negative,insignificant
german,age/sex,predictive_parity,logistic_regression,missing_values,,impute_mode_dummy,True,insignificant,insignificant
german,age/foreign_worker,predictive_parity,logistic_regression,missing_values,,impute_mean_mode,True,insignificant,insignificant
german,age/foreign_worker,predictive_parity,logistic_regression,missing_values,,impute_mean_dummy,True,insignificant,insignificant
german,age/foreign_worker,predict

# Generate tables

In [6]:
# Import jupysql Jupyter extension to create SQL cells
%load_ext sql

# Configure jupysql to return data as a Pandas dataframe and have less verbose output
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Connect jupysql to in-memory DuckDB database
%sql duckdb:///:memory:

In [7]:
counts_query = f"""
    SELECT error, fairness_impact, accuracy_impact, COUNT(*) as count
    FROM '{results_filename}' 
    GROUP BY error, fairness_impact, accuracy_impact
    ORDER BY error, fairness_impact DESC, accuracy_impact DESC
"""
counts = %sql {{counts_query}}

def single(results, error, fairness_impact, accuracy_impact):
    result_slice = results[(results.error == error) & (results.fairness_impact == fairness_impact) & \
            (results.accuracy_impact == accuracy_impact)]
   
    if len(result_slice) > 0:
        return list(result_slice['count'])[0]                        
    else:
        return 0

def perc(count, total):
    return str(round((count / total) * 100, 1)) + f'\% ({count})'

for error in ['missing_values', 'outliers', 'mislabel']:
    print('%', error)
    cpn = single(counts, error, 'positive', 'negative')
    cpi = single(counts, error, 'positive', 'insignificant')
    cpp = single(counts, error, 'positive', 'positive')

    cin = single(counts, error, 'insignificant', 'negative')
    cii = single(counts, error, 'insignificant', 'insignificant')
    cip = single(counts, error, 'insignificant', 'positive')            

    cnn = single(counts, error, 'negative', 'negative')
    cni = single(counts, error, 'negative', 'insignificant')
    cnp = single(counts, error, 'negative', 'positive')

    total = cpn + cpi + cpp + cin + cii + cip + cnn + cni + cnp        

#     print('& negative & insign. & positive & \\\\')    
#     print('\\hline')
#     print('worse &', perc(cnn, total), '&', perc(cni, total), '&', perc(cnp, total), '&', perc(cnn + cni + cnp, total),'\\\\')
#     print('insign. &', perc(cin, total), '&', perc(cii, total), '&', perc(cip, total), '&', perc(cin + cii + cip, total),'\\\\')
#     print('better &', perc(cpn, total), '&', perc(cpi, total), '&', perc(cpp, total), '&', perc(cpn + cpi + cpp, total),'\\\\')
#     print('\\hline')
#     print(' &', perc(cpn + cin + cnn, total), '&', perc(cpi + cii + cni, total), '&', perc(cpp+cip+cnp, total), '& \\\\')

    tex_string = r"""
\begin{tabular}{cl|ccc|r}
& & \multicolumn{3}{|c|}{\textbf{accuracy}} & \\
& & \textbf{worse} & \textbf{insignificant} & \textbf{better} & \\
\hline
\multirow{3}{*}{\rotatebox{}{\textbf{fair.}}} & \textbf{worse} & """ + " & ".join([
    perc(cnn, total),
    perc(cni, total),
    perc(cnp, total),
    perc(cnn + cni + cnp, total),
]) + r""" \\
& \textbf{insign.} & """ + " & ".join([
    perc(cin, total),
    perc(cii, total),
    perc(cip, total),
    perc(cin + cii + cip, total),
]) + r""" \\
& \textbf{better} & """ + " & ".join([
    perc(cpn, total),
    perc(cpi, total),
    perc(cpp, total),
    perc(cpn + cpi + cpp, total),
]) + r""" \\
\hline
 && """ + " & ".join([
    perc(cpn + cin + cnn, total),
    perc(cpi + cii + cni, total),
    perc(cpp+cip+cnp, total),
]) + r""" & \\
\end{tabular}
"""
    print(tex_string)

    tex_filename = f"tables/int-{cardio_age_threshold}-{intersectional_formulation}/{error}.tex"
    with open(tex_filename, "w") as f:
        print(tex_string, file=f)

% missing_values
% outliers
% mislabel
