In [1]:
import numpy as np
import json

folder = '/Users/guha/workspace/schelterlabs/CleanML/result-wagyu_20230719'
cardio_age_threshold = 'age@55'

# with open(f'{folder}/Credit_result.json') as f:
#     credit_data = json.load(f)

with open(f'{folder}/USCensus_result.json') as f:
    adult_data = json.load(f)    
    
with open(f'{folder}/ACSIncome_result.json') as f:
    folk_data = json.load(f)        
    
with open(f'{folder}/Cardio-{cardio_age_threshold}_result.json') as f:
    heart_data = json.load(f)     
    
with open(f'{folder}/GermanCredit_result.json') as f:
    german_data = json.load(f)            

In [2]:
from scipy.stats import ttest_rel

def t_test(dirty, clean):
    """Comparing method"""
    def two_tailed_t_test(dirty, clean):
        n_d = len(dirty)
        n_c = len(clean)
        n = min(n_d, n_c)
        t, p = ttest_rel(clean[:n], dirty[:n])
        if np.isnan(t):
            t, p = 0, 1
        return {"t-stats":t, "p-value":p}

    def one_tailed_t_test(dirty, clean, direction):
        two_tail = two_tailed_t_test(dirty, clean)
        t, p_two = two_tail['t-stats'], two_tail['p-value']
        if direction == 'positive':
            if t > 0 :
                p = p_two * 0.5
            else:
                p = 1 - p_two * 0.5
        else:
            if t < 0:
                p = p_two * 0.5
            else:
                p = 1 - p_two * 0.5
        return {"t-stats":t, "p-value":p}

    result = {}
    result['two_tail'] = two_tailed_t_test(dirty, clean)
    result['one_tail_pos'] = one_tailed_t_test(dirty, clean, 'positive')
    result['one_tail_neg'] = one_tailed_t_test(dirty, clean, 'negative')
    return result

In [3]:
def compute_eo(stats, cleaning_method, criteria, flipped):
    prefix = f'{cleaning_method}__{criteria}_'
    
    if not flipped:                    
        return (stats[f'{prefix}priv__tp'] / (stats[f'{prefix}priv__tp'] + stats[f'{prefix}priv__fn'])) - \
             (stats[f'{prefix}dis__tp'] / (stats[f'{prefix}dis__tp'] + stats[f'{prefix}dis__fn']))
    else:
        return (stats[f'{prefix}priv__tn'] / (stats[f'{prefix}priv__tn'] + stats[f'{prefix}priv__fp'])) - \
             (stats[f'{prefix}dis__tn'] / (stats[f'{prefix}dis__tn'] + stats[f'{prefix}dis__fp']))    

def compute_pp(stats, cleaning_method, criteria, flipped):
    prefix = f'{cleaning_method}__{criteria}_'
    
    if not flipped:                    
        return (stats[f'{prefix}priv__tp'] / (stats[f'{prefix}priv__tp'] + stats[f'{prefix}priv__fp'])) - \
             (stats[f'{prefix}dis__tp'] / (stats[f'{prefix}dis__tp'] + stats[f'{prefix}dis__fp']))
    else:
        return (stats[f'{prefix}priv__tn'] / (stats[f'{prefix}priv__tn'] + stats[f'{prefix}priv__fn'])) - \
             (stats[f'{prefix}dis__tn'] / (stats[f'{prefix}dis__tn'] + stats[f'{prefix}dis__fn']))        
    

def count(data, dataset_name, target_criteria, error_type, model, metric_name, scoring, log_file, flipped=False):

    dirty_scores = []
    dirty_accs = []
    
    cleaning_scores = {}
    cleaning_accs = {}
    
    dirty = 'dirty'
    if error_type == 'missing_values':
        dirty = 'delete'
    
    for experiment in data.keys():
        if error_type in experiment and model in experiment:        
            split_seed = experiment.split("/")[1]
            train_method = experiment.split("/")[3]   
            retrain_seed = experiment.split("/")[5]

            if train_method == dirty:
                # Missing values need special treatment, just deleting the corresponding rows from the test set
                # is not applicable in real-world scenarios, so we set a default way to treat the test data
                if error_type == 'missing_values':
                    score = scoring(data[experiment], 'impute_mean_dummy', target_criteria, flipped)
                    dirty_scores.append(score)                        
                    dirty_accs.append(data[experiment]['impute_mean_dummy_test_acc'])                    
                else:
                    score = scoring(data[experiment], dirty, target_criteria, flipped)
                    dirty_scores.append(score)                        
                    dirty_accs.append(data[experiment][dirty + '_test_acc'])


            if train_method != dirty:

                for test_method in [dirty, train_method]:
                    
                    approach = (train_method, test_method)
                    
                    if approach not in cleaning_scores:
                        cleaning_scores[approach] = []

                    if error_type == 'mislabel':
                        scores = scoring(data[experiment], 'clean', target_criteria, flipped)
                    else:     
                        scores = scoring(data[experiment], test_method, target_criteria, flipped)
                    cleaning_scores[approach].append(scores)

                    if approach not in cleaning_accs:
                        cleaning_accs[approach] = []

                    if test_method == dirty:
                        cleaning_accs[approach].append(data[experiment][f'{dirty}_test_acc'])
                    else:    
                        if error_type != 'mislabel':
                            cleaning_accs[approach].append(data[experiment][f'{train_method}_test_acc'])
                        
    evaluate_scores(dirty_scores, cleaning_scores, dirty_accs, cleaning_accs, 
                    dataset_name, target_criteria, metric_name, model, error_type, log_file)     

In [4]:
def evaluate_scores(dirty_scores, cleaning_scores, dirty_accs, cleaning_accs, 
                    dataset_name, target_criteria, metric_name, model, error_type, log_file):
                                                                          
    if len(cleaning_scores) > 0:        
            
        # bonferroni correction 
        alpha = 0.05 / len(cleaning_scores)

        for method, scores in cleaning_scores.items():
            
            test_results = t_test(dirty_scores, scores)

            repair_train, repair_clean = method
            test_repaired = repair_train == repair_clean

            difference = 'insignificant'

            if test_results['two_tail']['p-value'] < alpha:
                if test_results['one_tail_neg']['p-value'] < alpha:
                    difference = 'positive'
                if test_results['one_tail_pos']['p-value'] < alpha:
                    difference = 'negative'

                    
            acc_test_results = t_test(dirty_accs, cleaning_accs[method])

            acc_difference = 'insignificant'

            if acc_test_results['two_tail']['p-value'] < alpha:
                if acc_test_results['one_tail_neg']['p-value'] < alpha:
                    acc_difference = 'negative'
                if acc_test_results['one_tail_pos']['p-value'] < alpha:
                    acc_difference = 'positive'                                  
                        
            if error_type == 'missing_values':
                repair_method = repair_train
                detection=''
            elif error_type == 'mislabel':
                tokens = repair_train.split('-')
                repair_method = tokens[1]
                detection=tokens[0]
            else:    
                tokens = repair_train.split("_impute")        
                detection = tokens[0].replace('clean_', '')
                repair_method = 'impute_' + tokens[1]
                
            if not (error_type == 'mislabel' and (test_repaired or detection == 'shapley')):        
                          
                line = f'{dataset_name},{target_criteria},{metric_name},{model},{error_type},{detection},{repair_method},{test_repaired},{difference},{acc_difference}'    

                if test_repaired or error_type == 'mislabel':
                    print(line)            
                    log_file.write(f'{line}\n')
            

In [5]:
errors = ['outliers', 'missing_values', 'mislabel']
models = ['logistic_regression', 'knn_classification', 'XGBoost']
metrics = [('equal_opportunity', compute_eo), ('predictive_parity', compute_pp)]

results_filename = f'cleanml_{cardio_age_threshold}_20230727.csv'
with open(results_filename, 'w') as log_file:
    
    log_file.write('dataset,criteria,metric,model,error,detection,repair,test_repaired,fairness_impact,accuracy_impact\n')
    
    for metric, scoring in metrics:
        for error in errors:    
            for model in models:
                count(adult_data, 'adult', 'sex', error, model, metric, scoring, log_file)
                count(adult_data, 'adult', 'race', error, model, metric, scoring, log_file)
                count(folk_data, 'folktables', 'sex', error, model, metric, scoring, log_file)
                count(folk_data, 'folktables', 'rac1p', error, model, metric, scoring, log_file)            
                # count(credit_data, 'credit', 'age', error, model, metric, scoring, log_file, flipped=True)
                count(german_data, 'german', 'age', error, model, metric, scoring, log_file, flipped=True)  
                count(heart_data, 'heart', 'gender', error, model, metric, scoring, log_file, flipped=False)              


adult,sex,equal_opportunity,logistic_regression,outliers,SD,impute__mean_dummy,True,insignificant,positive
adult,sex,equal_opportunity,logistic_regression,outliers,SD,impute__mode_dummy,True,insignificant,positive
adult,sex,equal_opportunity,logistic_regression,outliers,SD,impute__median_dummy,True,insignificant,positive
adult,sex,equal_opportunity,logistic_regression,outliers,IQR,impute__mean_dummy,True,negative,positive
adult,sex,equal_opportunity,logistic_regression,outliers,IQR,impute__mode_dummy,True,negative,insignificant
adult,sex,equal_opportunity,logistic_regression,outliers,IQR,impute__median_dummy,True,negative,insignificant
adult,sex,equal_opportunity,logistic_regression,outliers,IF,impute__mean_dummy,True,negative,positive
adult,sex,equal_opportunity,logistic_regression,outliers,IF,impute__mode_dummy,True,insignificant,positive
adult,sex,equal_opportunity,logistic_regression,outliers,IF,impute__median_dummy,True,insignificant,positive
adult,race,equal_opportunity,logistic_

adult,sex,predictive_parity,XGBoost,outliers,SD,impute__mean_dummy,True,positive,positive
adult,sex,predictive_parity,XGBoost,outliers,SD,impute__mode_dummy,True,insignificant,positive
adult,sex,predictive_parity,XGBoost,outliers,SD,impute__median_dummy,True,insignificant,positive
adult,sex,predictive_parity,XGBoost,outliers,IQR,impute__mean_dummy,True,insignificant,positive
adult,sex,predictive_parity,XGBoost,outliers,IQR,impute__mode_dummy,True,insignificant,positive
adult,sex,predictive_parity,XGBoost,outliers,IQR,impute__median_dummy,True,insignificant,positive
adult,sex,predictive_parity,XGBoost,outliers,IF,impute__mean_dummy,True,insignificant,positive
adult,sex,predictive_parity,XGBoost,outliers,IF,impute__mode_dummy,True,insignificant,positive
adult,sex,predictive_parity,XGBoost,outliers,IF,impute__median_dummy,True,insignificant,positive
adult,race,predictive_parity,XGBoost,outliers,SD,impute__mean_dummy,True,positive,positive
adult,race,predictive_parity,XGBoost,outliers,SD,i

# Generate tables

In [6]:
# Import jupysql Jupyter extension to create SQL cells
%load_ext sql

# Configure jupysql to return data as a Pandas dataframe and have less verbose output
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Connect jupysql to in-memory DuckDB database
%sql duckdb:///:memory:

In [7]:
counts_query = f"""
    SELECT error, fairness_impact, accuracy_impact, COUNT(*) as count
    FROM '{results_filename}' 
    GROUP BY error, fairness_impact, accuracy_impact
    ORDER BY error, fairness_impact DESC, accuracy_impact DESC
"""
counts = %sql {{counts_query}}

def single(results, error, fairness_impact, accuracy_impact):
    result_slice = results[(results.error == error) & (results.fairness_impact == fairness_impact) & \
            (results.accuracy_impact == accuracy_impact)]
   
    if len(result_slice) > 0:
        return list(result_slice['count'])[0]                        
    else:
        return 0

def perc(count, total):
    return str(round((count / total) * 100, 1)) + f'\% ({count})'

for error in ['missing_values', 'outliers', 'mislabel']:
    print('%', error)
    cpn = single(counts, error, 'positive', 'negative')
    cpi = single(counts, error, 'positive', 'insignificant')
    cpp = single(counts, error, 'positive', 'positive')

    cin = single(counts, error, 'insignificant', 'negative')
    cii = single(counts, error, 'insignificant', 'insignificant')
    cip = single(counts, error, 'insignificant', 'positive')            

    cnn = single(counts, error, 'negative', 'negative')
    cni = single(counts, error, 'negative', 'insignificant')
    cnp = single(counts, error, 'negative', 'positive')

    total = cpn + cpi + cpp + cin + cii + cip + cnn + cni + cnp        

#     print('& negative & insign. & positive & \\\\')    
#     print('\\hline')
#     print('worse &', perc(cnn, total), '&', perc(cni, total), '&', perc(cnp, total), '&', perc(cnn + cni + cnp, total),'\\\\')
#     print('insign. &', perc(cin, total), '&', perc(cii, total), '&', perc(cip, total), '&', perc(cin + cii + cip, total),'\\\\')
#     print('better &', perc(cpn, total), '&', perc(cpi, total), '&', perc(cpp, total), '&', perc(cpn + cpi + cpp, total),'\\\\')
#     print('\\hline')
#     print(' &', perc(cpn + cin + cnn, total), '&', perc(cpi + cii + cni, total), '&', perc(cpp+cip+cnp, total), '& \\\\')

    tex_string = r"""
\begin{tabular}{cl|ccc|r}
& & \multicolumn{3}{|c|}{\textbf{accuracy}} & \\
& & \textbf{worse} & \textbf{insignificant} & \textbf{better} & \\
\hline
\multirow{3}{*}{\rotatebox{}{\textbf{fair.}}} & \textbf{worse} & """ + " & ".join([
    perc(cnn, total),
    perc(cni, total),
    perc(cnp, total),
    perc(cnn + cni + cnp, total),
]) + r""" \\
& \textbf{insign.} & """ + " & ".join([
    perc(cin, total),
    perc(cii, total),
    perc(cip, total),
    perc(cin + cii + cip, total),
]) + r""" \\
& \textbf{better} & """ + " & ".join([
    perc(cpn, total),
    perc(cpi, total),
    perc(cpp, total),
    perc(cpn + cpi + cpp, total),
]) + r""" \\
\hline
 && """ + " & ".join([
    perc(cpn + cin + cnn, total),
    perc(cpi + cii + cni, total),
    perc(cpp+cip+cnp, total),
]) + r""" & \\
\end{tabular}
"""
    print(tex_string)

    tex_filename = f"tables/{cardio_age_threshold}/{error}.tex"
    with open(tex_filename, "w") as f:
        print(tex_string, file=f)

% missing_values

\begin{tabular}{cl|ccc|r}
& & \multicolumn{3}{|c|}{\textbf{accuracy}} & \\
& & \textbf{worse} & \textbf{insignificant} & \textbf{better} & \\
\hline
\multirow{3}{*}{\rotatebox{}{\textbf{fair.}}} & \textbf{worse} & 5.0\% (9) & 10.0\% (18) & 23.9\% (43) & 38.9\% (70) \\
& \textbf{insign.} & 3.3\% (6) & 6.7\% (12) & 22.8\% (41) & 32.8\% (59) \\
& \textbf{better} & 5.0\% (9) & 3.3\% (6) & 20.0\% (36) & 28.3\% (51) \\
\hline
 && 13.3\% (24) & 20.0\% (36) & 66.7\% (120) & \\
\end{tabular}

% outliers

\begin{tabular}{cl|ccc|r}
& & \multicolumn{3}{|c|}{\textbf{accuracy}} & \\
& & \textbf{worse} & \textbf{insignificant} & \textbf{better} & \\
\hline
\multirow{3}{*}{\rotatebox{}{\textbf{fair.}}} & \textbf{worse} & 1.9\% (6) & 2.2\% (7) & 16.7\% (54) & 20.7\% (67) \\
& \textbf{insign.} & 1.9\% (6) & 12.7\% (41) & 26.9\% (87) & 41.4\% (134) \\
& \textbf{better} & 1.9\% (6) & 3.7\% (12) & 32.4\% (105) & 38.0\% (123) \\
\hline
 && 5.6\% (18) & 18.5\% (60) & 75.9\% (246) & \\
\end{