In [1]:
import numpy as np
import json

folder='cleanml-results/'

with open(f'{folder}/Credit_result.json') as f:
    credit_data = json.load(f)

with open(f'{folder}/USCensus_result.json') as f:
    adult_data = json.load(f)    
    
with open(f'{folder}/ACSIncome_result.json') as f:
    folk_data = json.load(f)        
    
with open(f'{folder}/Cardio_result.json') as f:
    heart_data = json.load(f)     
    
with open(f'{folder}/GermanCredit_result.json') as f:
    german_data = json.load(f)            

In [2]:
from scipy.stats import ttest_rel

def t_test(dirty, clean):
    """Comparing method"""
    def two_tailed_t_test(dirty, clean):
        n_d = len(dirty)
        n_c = len(clean)
        n = min(n_d, n_c)
        t, p = ttest_rel(clean[:n], dirty[:n])
        if np.isnan(t):
            t, p = 0, 1
        return {"t-stats":t, "p-value":p}

    def one_tailed_t_test(dirty, clean, direction):
        two_tail = two_tailed_t_test(dirty, clean)
        t, p_two = two_tail['t-stats'], two_tail['p-value']
        if direction == 'positive':
            if t > 0 :
                p = p_two * 0.5
            else:
                p = 1 - p_two * 0.5
        else:
            if t < 0:
                p = p_two * 0.5
            else:
                p = 1 - p_two * 0.5
        return {"t-stats":t, "p-value":p}

    result = {}
    result['two_tail'] = two_tailed_t_test(dirty, clean)
    result['one_tail_pos'] = one_tailed_t_test(dirty, clean, 'positive')
    result['one_tail_neg'] = one_tailed_t_test(dirty, clean, 'negative')
    return result

In [3]:
def compute_eo(stats, cleaning_method, criteria, flipped):
    prefix = f'{cleaning_method}__{criteria}_'
    
    if not flipped:                    
        return (stats[f'{prefix}priv__tp'] / (stats[f'{prefix}priv__tp'] + stats[f'{prefix}priv__fn'])) - \
             (stats[f'{prefix}dis__tp'] / (stats[f'{prefix}dis__tp'] + stats[f'{prefix}dis__fn']))
    else:
        return (stats[f'{prefix}priv__tn'] / (stats[f'{prefix}priv__tn'] + stats[f'{prefix}priv__fp'])) - \
             (stats[f'{prefix}dis__tn'] / (stats[f'{prefix}dis__tn'] + stats[f'{prefix}dis__fp']))    

def compute_pp(stats, cleaning_method, criteria, flipped):
    prefix = f'{cleaning_method}__{criteria}_'
    
    if not flipped:                    
        return (stats[f'{prefix}priv__tp'] / (stats[f'{prefix}priv__tp'] + stats[f'{prefix}priv__fp'])) - \
             (stats[f'{prefix}dis__tp'] / (stats[f'{prefix}dis__tp'] + stats[f'{prefix}dis__fp']))
    else:
        return (stats[f'{prefix}priv__tn'] / (stats[f'{prefix}priv__tn'] + stats[f'{prefix}priv__fn'])) - \
             (stats[f'{prefix}dis__tn'] / (stats[f'{prefix}dis__tn'] + stats[f'{prefix}dis__fn']))        
    

def count(data, dataset_name, target_criteria, error_type, model, metric_name, scoring, log_file, flipped=False):

    dirty_scores = []
    dirty_accs = []
    
    cleaning_scores = {}
    cleaning_accs = {}
    
    dirty = 'dirty'
    if error_type == 'missing_values':
        dirty = 'delete'
    
    for experiment in data.keys():
        if error_type in experiment and model in experiment:        
            split_seed = experiment.split("/")[1]
            train_method = experiment.split("/")[3]   
            retrain_seed = experiment.split("/")[5]

            if train_method == dirty:
                # Missing values need special treatment, just deleting the corresponding rows from the test set
                # is not applicable in real-world scenarios, so we set a default way to treat the test data
                if error_type == 'missing_values':
                    score = scoring(data[experiment], 'impute_mean_dummy', target_criteria, flipped)
                    dirty_scores.append(score)                        
                    dirty_accs.append(data[experiment]['impute_mean_dummy_test_acc'])                    
                else:
                    score = scoring(data[experiment], dirty, target_criteria, flipped)
                    dirty_scores.append(score)                        
                    dirty_accs.append(data[experiment][dirty + '_test_acc'])


            if train_method != dirty:

                for test_method in [dirty, train_method]:
                    
                    approach = (train_method, test_method)
                    
                    if approach not in cleaning_scores:
                        cleaning_scores[approach] = []

                    if error_type == 'mislabel':
                        scores = scoring(data[experiment], 'clean', target_criteria, flipped)
                    else:     
                        scores = scoring(data[experiment], test_method, target_criteria, flipped)
                    cleaning_scores[approach].append(scores)

                    if approach not in cleaning_accs:
                        cleaning_accs[approach] = []

                    if test_method == dirty:
                        cleaning_accs[approach].append(data[experiment][f'{dirty}_test_acc'])
                    else:    
                        if error_type != 'mislabel':
                            cleaning_accs[approach].append(data[experiment][f'{train_method}_test_acc'])
                        
    evaluate_scores(dirty_scores, cleaning_scores, dirty_accs, cleaning_accs, 
                    dataset_name, target_criteria, metric_name, model, error_type, log_file)     

In [4]:
def evaluate_scores(dirty_scores, cleaning_scores, dirty_accs, cleaning_accs, 
                    dataset_name, target_criteria, metric_name, model, error_type, log_file):
                                                                          
    if len(cleaning_scores) > 0:        
            
        # bonferroni correction 
        alpha = 0.05 / len(cleaning_scores)

        for method, scores in cleaning_scores.items():
            
            test_results = t_test(dirty_scores, scores)

            repair_train, repair_clean = method
            test_repaired = repair_train == repair_clean

            difference = 'insignificant'

            if test_results['two_tail']['p-value'] < alpha:
                if test_results['one_tail_neg']['p-value'] < alpha:
                    difference = 'positive'
                if test_results['one_tail_pos']['p-value'] < alpha:
                    difference = 'negative'

                    
            acc_test_results = t_test(dirty_accs, cleaning_accs[method])

            acc_difference = 'insignificant'

            if acc_test_results['two_tail']['p-value'] < alpha:
                if acc_test_results['one_tail_neg']['p-value'] < alpha:
                    acc_difference = 'negative'
                if acc_test_results['one_tail_pos']['p-value'] < alpha:
                    acc_difference = 'positive'                                  
                        
            if error_type == 'missing_values':
                repair_method = repair_train
                detection=''
            elif error_type == 'mislabel':
                tokens = repair_train.split('-')
                repair_method = tokens[1]
                detection=tokens[0]
            else:    
                tokens = repair_train.split("_impute")        
                detection = tokens[0].replace('clean_', '')
                repair_method = 'impute_' + tokens[1]
                
            if not (error_type == 'mislabel' and (test_repaired or detection == 'shapley')):        
                          
                line = f'{dataset_name},{target_criteria},{metric_name},{model},{error_type},{detection},{repair_method},{test_repaired},{difference},{acc_difference}'    

                if test_repaired or error_type == 'mislabel':
                    print(line)            
                    log_file.write(f'{line}\n')
            

In [5]:
errors = ['outliers', 'missing_values', 'mislabel']
models = ['logistic_regression', 'knn_classification', 'XGBoost']
metrics = [('equal_opportunity', compute_eo), ('predictive_parity', compute_pp)]


with open('/Users/ssc/projects/demo-dq/cleanml.csv', 'w') as log_file:
    
    log_file.write('dataset,criteria,metric,model,error,detection,repair,test_repaired,fairness_impact,accuracy_impact\n')
    
    for metric, scoring in metrics:
        for error in errors:    
            for model in models:
                count(adult_data, 'adult', 'sex', error, model, metric, scoring, log_file)
                count(adult_data, 'adult', 'race', error, model, metric, scoring, log_file)
                count(folk_data, 'folktables', 'sex', error, model, metric, scoring, log_file)
                count(folk_data, 'folktables', 'rac1p', error, model, metric, scoring, log_file)            
                count(credit_data, 'credit', 'age', error, model, metric, scoring, log_file, flipped=True)
                count(german_data, 'german', 'age', error, model, metric, scoring, log_file, flipped=True)  
                count(heart_data, 'heart', 'gender', error, model, metric, scoring, log_file, flipped=False)              
            

adult,sex,equal_opportunity,logistic_regression,outliers,SD,impute__mean_dummy,True,insignificant,positive
adult,sex,equal_opportunity,logistic_regression,outliers,SD,impute__mode_dummy,True,insignificant,negative
adult,sex,equal_opportunity,logistic_regression,outliers,SD,impute__median_dummy,True,insignificant,insignificant
adult,sex,equal_opportunity,logistic_regression,outliers,IQR,impute__mean_dummy,True,negative,negative
adult,sex,equal_opportunity,logistic_regression,outliers,IQR,impute__mode_dummy,True,negative,negative
adult,sex,equal_opportunity,logistic_regression,outliers,IQR,impute__median_dummy,True,negative,negative
adult,sex,equal_opportunity,logistic_regression,outliers,IF,impute__mean_dummy,True,insignificant,positive
adult,sex,equal_opportunity,logistic_regression,outliers,IF,impute__mode_dummy,True,insignificant,positive
adult,sex,equal_opportunity,logistic_regression,outliers,IF,impute__median_dummy,True,negative,insignificant
adult,race,equal_opportunity,logistic_

folktables,sex,equal_opportunity,XGBoost,outliers,IQR,impute__median_dummy,True,positive,negative
folktables,sex,equal_opportunity,XGBoost,outliers,IF,impute__mean_dummy,True,insignificant,insignificant
folktables,sex,equal_opportunity,XGBoost,outliers,IF,impute__mode_dummy,True,insignificant,insignificant
folktables,sex,equal_opportunity,XGBoost,outliers,IF,impute__median_dummy,True,insignificant,insignificant
folktables,rac1p,equal_opportunity,XGBoost,outliers,SD,impute__mean_dummy,True,insignificant,insignificant
folktables,rac1p,equal_opportunity,XGBoost,outliers,SD,impute__mode_dummy,True,insignificant,insignificant
folktables,rac1p,equal_opportunity,XGBoost,outliers,SD,impute__median_dummy,True,insignificant,insignificant
folktables,rac1p,equal_opportunity,XGBoost,outliers,IQR,impute__mean_dummy,True,insignificant,negative
folktables,rac1p,equal_opportunity,XGBoost,outliers,IQR,impute__mode_dummy,True,insignificant,negative
folktables,rac1p,equal_opportunity,XGBoost,outliers,IQR,