In [1]:
import duckdb

# Open an in-memory DuckDB database; the queries in this notebook read
# 'cleanml.csv' directly through this connection.
con = duckdb.connect(database=':memory:')

In [2]:
# Count the rows of cleanml.csv for every combination of
# (error, fairness_impact, accuracy_impact) and display the result.
con.execute("""
    SELECT error, fairness_impact, accuracy_impact, COUNT(*) as count
    FROM 'cleanml.csv' 
    GROUP BY error, fairness_impact, accuracy_impact
    ORDER BY error, fairness_impact DESC, accuracy_impact DESC
""").df()

Unnamed: 0,error,fairness_impact,accuracy_impact,count
0,mislabel,positive,positive,10
1,mislabel,negative,positive,13
2,mislabel,negative,insignificant,1
3,mislabel,insignificant,positive,15
4,mislabel,insignificant,insignificant,3
5,missing_values,positive,positive,30
6,missing_values,positive,negative,3
7,missing_values,positive,insignificant,4
8,missing_values,negative,positive,24
9,missing_values,negative,negative,4


In [3]:
# Row counts per (error, fairness_impact, accuracy_impact) combination, kept
# in `counts` so that single() can look up individual combinations below.
counts = con.execute("""
    SELECT error, fairness_impact, accuracy_impact, COUNT(*) as count
    FROM 'cleanml.csv' 
    GROUP BY error, fairness_impact, accuracy_impact
    ORDER BY error, fairness_impact DESC, accuracy_impact DESC
""").df()

def single(results, error, fairness_impact, accuracy_impact):
    """Look up the count for one (error, fairness_impact, accuracy_impact) combination.

    Args:
        results: Table exposing ``error``, ``fairness_impact``,
            ``accuracy_impact`` and ``count`` columns (a pandas DataFrame
            in this notebook).
        error: Value the ``error`` column must equal.
        fairness_impact: Value the ``fairness_impact`` column must equal.
        accuracy_impact: Value the ``accuracy_impact`` column must equal.

    Returns:
        The ``count`` of the first matching row, or 0 when no row matches.
    """
    matches_error = results.error == error
    matches_fairness = results.fairness_impact == fairness_impact
    matches_accuracy = results.accuracy_impact == accuracy_impact
    hits = results[matches_error & matches_fairness & matches_accuracy]

    # A combination that never occurred has no row in the grouped result.
    if len(hits) == 0:
        return 0

    # Only the first matching row is used.
    return next(iter(hits['count']))

def perc(count, total):
    r"""Format ``count`` as a LaTeX table cell: its share of ``total`` and the raw count.

    Args:
        count: Number of items in the cell.
        total: Number of items the percentage is taken of.

    Returns:
        A string such as ``1.9\% (4)``: the percentage rounded to one
        decimal, a LaTeX-escaped percent sign, and the count in parentheses.
        When ``total`` is 0 the percentage is reported as ``0.0``.
    """
    # An empty table has no meaningful share; avoid dividing by zero.
    share = (count / total) * 100 if total else 0.0
    # The backslash is escaped explicitly: '\%' is not a valid Python escape.
    return f'{round(share, 1)}\\% ({count})'

# Print one LaTeX table body per error type. Rows are the fairness impact
# (worse / insign. / better), columns the accuracy impact
# (negative / insign. / positive); the last column and the last row hold the
# row and column totals.
impacts = ('negative', 'insignificant', 'positive')
fairness_rows = (('worse &', 'negative'), ('insign. &', 'insignificant'), ('better &', 'positive'))

for error in ['missing_values', 'outliers', 'mislabel']:

    print('%', error)

    # cell[fairness, accuracy] -> count for that pair of impacts
    cell = {
        (fair, acc): single(counts, error, fair, acc)
        for fair in impacts
        for acc in impacts
    }
    total = sum(cell.values())

    print('& negative & insign. & positive & \\\\')
    print('\\hline')
    for label, fair in fairness_rows:
        row = [cell[fair, acc] for acc in impacts]
        print(label, ' & '.join(perc(c, total) for c in row + [sum(row)]), '\\\\')
    print('\\hline')

    col_totals = [sum(cell[fair, acc] for fair in impacts) for acc in impacts]
    print(' &', ' & '.join(perc(c, total) for c in col_totals), '& \\\\')

             

% missing_values
& negative & insign. & positive & \\
\hline
worse & 1.9\% (4) & 10.6\% (23) & 11.1\% (24) & 23.6\% (51) \\
insign. & 9.7\% (21) & 25.5\% (55) & 24.1\% (52) & 59.3\% (128) \\
better & 1.4\% (3) & 1.9\% (4) & 13.9\% (30) & 17.1\% (37) \\
\hline
 & 13.0\% (28) & 38.0\% (82) & 49.1\% (106) & \\
% outliers
& negative & insign. & positive & \\
\hline
worse & 9.0\% (34) & 4.5\% (17) & 6.1\% (23) & 19.6\% (74) \\
insign. & 11.4\% (43) & 42.9\% (162) & 18.0\% (68) & 72.2\% (273) \\
better & 1.9\% (7) & 2.4\% (9) & 4.0\% (15) & 8.2\% (31) \\
\hline
 & 22.2\% (84) & 49.7\% (188) & 28.0\% (106) & \\
% mislabel
& negative & insign. & positive & \\
\hline
worse & 0.0\% (0) & 2.4\% (1) & 31.0\% (13) & 33.3\% (14) \\
insign. & 0.0\% (0) & 7.1\% (3) & 35.7\% (15) & 42.9\% (18) \\
better & 0.0\% (0) & 0.0\% (0) & 23.8\% (10) & 23.8\% (10) \\
\hline
 & 0.0\% (0) & 9.5\% (4) & 90.5\% (38) & \\


## How does this differ by dataset?

In [6]:
# Per dataset and error type, count the rows whose fairness impact is not
# 'negative' (True) versus those where it is 'negative' (False).
con.execute("""
    SELECT 
        dataset,
        error,    
        fairness_impact!='negative' as non_negative_impact_on_fairness, 
        COUNT(*) as count    
    FROM 'cleanml.csv' 
    GROUP BY dataset, error, non_negative_impact_on_fairness
    ORDER BY dataset, error, non_negative_impact_on_fairness DESC
""").df()

Unnamed: 0,dataset,error,non_negative_impact_on_fairness,count
0,adult,mislabel,True,6
1,adult,mislabel,False,6
2,adult,missing_values,True,53
3,adult,missing_values,False,19
4,adult,outliers,True,79
5,adult,outliers,False,29
6,credit,mislabel,True,4
7,credit,mislabel,False,2
8,credit,missing_values,True,36
9,credit,outliers,True,29


## For which scenarios (dataset, error, criteria and metric) did we find a cleaning approach with non-negative impact on fairness?

In [9]:
# List the distinct (dataset, error, criteria, metric) combinations for which
# at least one row with the same four values has a fairness_impact other than
# 'negative'.
con.execute("""
    SELECT DISTINCT dataset, error, criteria, metric
    FROM 'cleanml.csv' as csv
    WHERE EXISTS(
        SELECT dataset, error, criteria, metric FROM 'cleanml.csv' AS csv2
        WHERE 
            csv2.criteria = csv.criteria AND
            csv2.dataset = csv.dataset AND
            csv2.error = csv.error AND
            csv2.metric = csv.metric AND
            csv2.fairness_impact != 'negative'
    )    
    ORDER BY dataset, error, criteria, metric
""").df()

Unnamed: 0,dataset,error,criteria,metric
0,adult,mislabel,race,predictive_parity
1,adult,mislabel,sex,predictive_parity
2,adult,missing_values,race,equal_opportunity
3,adult,missing_values,race,predictive_parity
4,adult,missing_values,sex,equal_opportunity
5,adult,missing_values,sex,predictive_parity
6,adult,outliers,race,equal_opportunity
7,adult,outliers,race,predictive_parity
8,adult,outliers,sex,equal_opportunity
9,adult,outliers,sex,predictive_parity


## For which scenarios (dataset, error, criteria and metric) did we find a cleaning approach with positive impact on fairness?

In [8]:
# List the distinct (dataset, metric, criteria, error) combinations for which
# at least one row with the same four values has a 'positive' fairness_impact.
# Uses the standard SQL `=` comparison, as the other queries in this notebook do.
con.execute("""
    SELECT DISTINCT dataset, metric, criteria, error
    FROM 'cleanml.csv' AS csv
    WHERE EXISTS(
        SELECT dataset, error, criteria, metric FROM 'cleanml.csv' AS csv2
        WHERE
            csv2.criteria = csv.criteria AND
            csv2.dataset = csv.dataset AND
            csv2.error = csv.error AND
            csv2.metric = csv.metric AND
            csv2.fairness_impact = 'positive'
    )
    ORDER BY dataset, metric, error, criteria
""").df()

Unnamed: 0,dataset,metric,criteria,error
0,adult,equal_opportunity,race,outliers
1,adult,predictive_parity,race,mislabel
2,adult,predictive_parity,sex,mislabel
3,adult,predictive_parity,race,missing_values
4,adult,predictive_parity,race,outliers
5,adult,predictive_parity,sex,outliers
6,credit,predictive_parity,age,outliers
7,folktables,equal_opportunity,rac1p,missing_values
8,folktables,equal_opportunity,sex,missing_values
9,folktables,equal_opportunity,sex,outliers


## For which scenarios (dataset, error, criteria and metric) did we find a cleaning approach to improve fairness and accuracy?

In [7]:
# List the distinct (dataset, metric, criteria, error) combinations for which
# at least one row with the same four values has both fairness_impact and
# accuracy_impact equal to 'positive'.
con.execute("""
    SELECT DISTINCT dataset, metric, criteria, error
    FROM 'cleanml.csv' as csv
    WHERE EXISTS(
        SELECT dataset, error, criteria, metric FROM 'cleanml.csv' AS csv2
        WHERE 
            csv2.criteria = csv.criteria AND    
            csv2.dataset = csv.dataset AND
            csv2.error = csv.error AND
            csv2.metric = csv.metric AND
            csv2.fairness_impact = 'positive' AND csv2.accuracy_impact='positive'
    )   
    ORDER BY dataset, metric, error, criteria
""").df()

Unnamed: 0,dataset,metric,criteria,error
0,adult,predictive_parity,race,mislabel
1,adult,predictive_parity,sex,mislabel
2,adult,predictive_parity,race,missing_values
3,adult,predictive_parity,race,outliers
4,adult,predictive_parity,sex,outliers
5,credit,predictive_parity,age,outliers
6,folktables,equal_opportunity,rac1p,missing_values
7,folktables,equal_opportunity,sex,missing_values
8,folktables,predictive_parity,rac1p,mislabel
9,folktables,predictive_parity,sex,mislabel


In [11]:
# Per model, count the rows whose fairness_impact is 'negative'.
# GROUP BY model already yields one row per model, so no DISTINCT is needed.
con.execute("""
    SELECT model, COUNT(*) AS count
    FROM 'cleanml.csv'
    WHERE fairness_impact = 'negative'
    GROUP BY model
    ORDER BY model
""").df()

Unnamed: 0,model,count
0,XGBoost,45
1,knn_classification,52
2,logistic_regression,42


In [12]:
# Per model, count the rows whose fairness_impact is anything but 'negative'.
# GROUP BY model already yields one row per model, so no DISTINCT is needed.
con.execute("""
    SELECT model, COUNT(*) AS count
    FROM 'cleanml.csv'
    WHERE fairness_impact != 'negative'
    GROUP BY model
    ORDER BY model
""").df()

Unnamed: 0,model,count
0,XGBoost,167
1,knn_classification,160
2,logistic_regression,170


In [13]:
# Per model, count the rows whose fairness_impact is 'positive'.
# GROUP BY model already yields one row per model, so no DISTINCT is needed.
con.execute("""
    SELECT model, COUNT(*) AS count
    FROM 'cleanml.csv'
    WHERE fairness_impact = 'positive'
    GROUP BY model
    ORDER BY model
""").df()

Unnamed: 0,model,count
0,XGBoost,23
1,knn_classification,29
2,logistic_regression,26


In [14]:
# Per model, count the rows where both fairness_impact and accuracy_impact
# are 'positive'.
# GROUP BY model already yields one row per model, so no DISTINCT is needed.
con.execute("""
    SELECT model, COUNT(*) AS count
    FROM 'cleanml.csv'
    WHERE fairness_impact = 'positive' AND accuracy_impact = 'positive'
    GROUP BY model
    ORDER BY model
""").df()

Unnamed: 0,model,count
0,XGBoost,14
1,knn_classification,25
2,logistic_regression,16


In [4]:
# Per model, count the rows where both fairness_impact and accuracy_impact
# are 'positive'.
# GROUP BY model already yields one row per model, so no DISTINCT is needed.
con.execute("""
    SELECT model, COUNT(*) AS count
    FROM 'cleanml.csv'
    WHERE fairness_impact = 'positive' AND accuracy_impact = 'positive'
    GROUP BY model
    ORDER BY model
""").df()

Unnamed: 0,model,count
0,XGBoost,14
1,knn_classification,25
2,logistic_regression,16


In [5]:
# Distribution of fairness impacts per metric: the row count of each
# (metric, fairness_impact) group and its share of that metric's rows.
# The denominator is the per-metric total, computed with a window over the
# grouped counts, so it stays correct if the number of rows per metric changes.
con.execute("""
    SELECT
        metric,
        fairness_impact,
        COUNT(*) AS count,
        CAST(COUNT(*) AS DOUBLE) / SUM(COUNT(*)) OVER (PARTITION BY metric) AS perc
    FROM 'cleanml.csv'
    GROUP BY metric, fairness_impact
    ORDER BY metric, fairness_impact
""").df()

Unnamed: 0,metric,fairness_impact,count,perc
0,equal_opportunity,insignificant,190,0.597484
1,equal_opportunity,negative,104,0.327044
2,equal_opportunity,positive,24,0.075472
3,predictive_parity,insignificant,229,0.720126
4,predictive_parity,negative,35,0.110063
5,predictive_parity,positive,54,0.169811


In [6]:
# For the 'missing_values' error type, count per dataset and metric the rows
# with a 'negative' fairness_impact and an 'insignificant' accuracy_impact.
con.execute("""
    SELECT dataset, metric, count(*) as count
    FROM 'cleanml.csv'   
    WHERE error = 'missing_values' AND fairness_impact='negative' AND accuracy_impact='insignificant'
    GROUP BY dataset, metric
    ORDER BY dataset, metric
""").df()

Unnamed: 0,dataset,metric,count
0,adult,equal_opportunity,14
1,folktables,equal_opportunity,1
2,german,predictive_parity,8


In [31]:
# For the 'missing_values' error type, count per dataset and metric the rows
# where both fairness_impact and accuracy_impact are 'positive'.
con.execute("""
    SELECT dataset, metric, count(*) as count
    FROM 'cleanml.csv'   
    WHERE error = 'missing_values' AND fairness_impact='positive' AND accuracy_impact='positive'
    GROUP BY dataset, metric
    ORDER BY dataset, metric
""").df()

Unnamed: 0,dataset,metric,count
0,adult,predictive_parity,3
1,folktables,equal_opportunity,15
2,folktables,predictive_parity,11
3,german,predictive_parity,1
