In [1]:
import duckdb

con = duckdb.connect(database=':memory:')
con.execute("""
    SELECT model, error, fairness_impact, accuracy_impact, COUNT(*) as count
    FROM 'cleanml.csv' 
    GROUP BY model, error, fairness_impact, accuracy_impact
    ORDER BY model, error, fairness_impact DESC, accuracy_impact DESC
""").df()

Unnamed: 0,model,error,fairness_impact,accuracy_impact,count
0,XGBoost,mislabel,positive,positive,4
1,XGBoost,mislabel,negative,positive,4
2,XGBoost,mislabel,insignificant,positive,4
3,XGBoost,mislabel,insignificant,insignificant,2
4,XGBoost,missing_values,positive,positive,8
5,XGBoost,missing_values,positive,negative,2
6,XGBoost,missing_values,negative,positive,10
7,XGBoost,missing_values,negative,negative,3
8,XGBoost,missing_values,insignificant,positive,40
9,XGBoost,missing_values,insignificant,negative,3


## How often did data cleaning not negatively impacting fairness?

In [2]:
con.execute("""
    SELECT 
        model, error,    
        fairness_impact!='negative' AND accuracy_impact!='negative' as non_negative_impact, 
        COUNT(*) as count    
    FROM 'cleanml.csv' 
    GROUP BY model, error, non_negative_impact
    ORDER BY model, error, non_negative_impact DESC
""").df()

Unnamed: 0,model,error,non_negative_impact,count
0,XGBoost,missing_values,True,134
1,XGBoost,missing_values,False,10
2,XGBoost,outliers,True,147
3,XGBoost,outliers,False,69
4,knn_classification,missing_values,True,66
5,knn_classification,missing_values,False,78
6,knn_classification,outliers,True,187
7,knn_classification,outliers,False,29
8,logistic_regression,missing_values,True,86
9,logistic_regression,missing_values,False,58


## Is there a cleaning approach (per dataset, error and metric) to improve fairness?

In [3]:
con.execute("""
    SELECT DISTINCT model, dataset, error, metric, TRUE as cleaning_found
    FROM 'cleanml.csv' as csv
    WHERE EXISTS(
        SELECT model, dataset, error, metric FROM 'cleanml.csv' AS csv2
        WHERE 
            csv2.model = csv.model AND
            csv2.dataset = csv.dataset AND
            csv2.error = csv.error AND
            csv2.metric = csv.metric AND
            csv2.fairness_impact = 'positive'
    )    
""").df()

Unnamed: 0,model,dataset,error,metric,cleaning_found
0,logistic_regression,adult,outliers,equal_opportunity,True
1,XGBoost,adult,outliers,equal_opportunity,True
2,XGBoost,folktables,outliers,equal_opportunity,True
3,logistic_regression,folktables,missing_values,equal_opportunity,True
4,knn_classification,folktables,missing_values,equal_opportunity,True
5,knn_classification,german,missing_values,equal_opportunity,True
6,XGBoost,folktables,missing_values,equal_opportunity,True
7,logistic_regression,folktables,outliers,predictive_parity,True
8,logistic_regression,credit,outliers,predictive_parity,True
9,knn_classification,adult,outliers,predictive_parity,True


## Is there a cleaning approach (per dataset, error and metric) to improve fairness without decreasing accuracy?

In [4]:
con.execute("""
    SELECT DISTINCT model, dataset, error, metric, TRUE as cleaning_found
    FROM 'cleanml.csv' as csv
    WHERE EXISTS(
        SELECT model, dataset, error, metric FROM 'cleanml.csv' AS csv2
        WHERE 
            csv2.model = csv.model AND
            csv2.dataset = csv.dataset AND
            csv2.error = csv.error AND
            csv2.metric = csv.metric AND
            csv2.fairness_impact = 'positive' AND csv2.accuracy_impact!='negative'
    )   
""").df()

Unnamed: 0,model,dataset,error,metric,cleaning_found
0,XGBoost,adult,outliers,equal_opportunity,True
1,logistic_regression,folktables,missing_values,equal_opportunity,True
2,knn_classification,folktables,missing_values,equal_opportunity,True
3,knn_classification,german,missing_values,equal_opportunity,True
4,XGBoost,folktables,missing_values,equal_opportunity,True
5,logistic_regression,folktables,outliers,predictive_parity,True
6,logistic_regression,credit,outliers,predictive_parity,True
7,knn_classification,adult,outliers,predictive_parity,True
8,knn_classification,credit,outliers,predictive_parity,True
9,knn_classification,german,outliers,predictive_parity,True
