In [14]:
from demodq.datasets import Datasets
from demodq.column_errors import detect_missing_values, detect_outliers_sd, detect_outliers_iqr
from demodq.tuple_errors import detect_mislabeled_via_cleanlab, detect_outliers_via_if, detect_mislabeled_via_shapley
from demodq.analysis import analyse_marked, detect_disparate_errors, is_disparate

import warnings
warnings.filterwarnings('ignore')

In [12]:
def perc(frac):
  """Format a fraction as a percentage string with an escaped percent sign.

  Args:
    frac: Numeric fraction, e.g. 0.065 for 6.5 percent.

  Returns:
    str: ``frac * 100`` rounded to one decimal place, followed by a literal
    backslash and a percent sign (presumably so the value can be pasted into
    LaTeX, where a bare percent sign starts a comment -- confirm with author).
  """
  # The backslash is written as an explicit escape: the bare form is an invalid
  # escape sequence that newer Python versions warn about. The resulting string
  # is unchanged (backslash followed by percent sign).
  return str(round(frac * 100, 1)) + '\\%'

In [20]:
import pandas as pd

# Each entry is (dataset name given to Datasets.load, criteria given to
# dataset.partition_data_by) for one group comparison.
cases = [
    ('adult', 'sex'),
    ('adult', 'race'),
    ('folktables', 'sex'),
    ('folktables', 'race'),
    ('heart', 'sex'),
    ('credit', 'age'),
    ('german', 'age'),
]

In [19]:
# For every (dataset, criteria) case: gather the rows that detect_missing_values
# returns for any categorical or numerical column, then compare the share of
# such rows between the two groups produced by partition_data_by.
for dataset_name, criteria in cases:
    dataset = Datasets.load(dataset_name)

    # Deep copy so the added 'id' column never touches the loaded frame; the id
    # is the row position and is used below to de-duplicate flagged rows.
    data = dataset.as_df().copy(deep=True)
    data['id'] = range(len(data))

    feature_columns = dataset.categorical_columns + dataset.numerical_columns
    dirty_slices = [detect_missing_values(data, column) for column in feature_columns]

    # A row flagged through several columns must be counted only once.
    dirty = pd.concat(dirty_slices).drop_duplicates(subset='id')

    data_priv, data_nonpriv = dataset.partition_data_by(data, criteria)
    dirty_priv, dirty_nonpriv = dataset.partition_data_by(dirty, criteria)

    n_priv, n_dirty_priv = len(data_priv), len(dirty_priv)
    n_nonpriv, n_dirty_nonpriv = len(data_nonpriv), len(dirty_nonpriv)

    disparate = is_disparate(n_priv, n_dirty_priv, n_nonpriv, n_dirty_nonpriv)

    frac_priv = n_dirty_priv / n_priv
    frac_nonpriv = n_dirty_nonpriv / n_nonpriv

    print(dataset_name, criteria, perc(frac_priv), perc(frac_nonpriv), disparate)

adult sex 6.5\% 9.2\% True
adult race 6.8\% 10.7\% True
folktables sex 44.4\% 51.7\% True
folktables race 47.7\% 48.7\% True
heart sex 0.0\% 0.0\% False
credit age 20.1\% 15.8\% True
german age 32.8\% 20.5\% True


In [21]:
# For every (dataset, criteria) case and each outlier detector: gather the rows
# the detector returns for any numerical column, then compare the share of such
# rows between the two groups produced by partition_data_by.
outlier_detectors = [(detect_outliers_sd, 'sd'), (detect_outliers_iqr, 'iqr')]

for dataset_name, criteria in cases:
    dataset = Datasets.load(dataset_name)

    # Deep copy so the added 'id' column never touches the loaded frame; the id
    # is the row position and is used below to de-duplicate flagged rows.
    data = dataset.as_df().copy(deep=True)
    data['id'] = range(len(data))

    for detector, name in outlier_detectors:
        dirty_slices = [detector(data, column) for column in dataset.numerical_columns]

        # A row flagged through several columns must be counted only once.
        dirty = pd.concat(dirty_slices).drop_duplicates(subset='id')

        data_priv, data_nonpriv = dataset.partition_data_by(data, criteria)
        dirty_priv, dirty_nonpriv = dataset.partition_data_by(dirty, criteria)

        n_priv, n_dirty_priv = len(data_priv), len(dirty_priv)
        n_nonpriv, n_dirty_nonpriv = len(data_nonpriv), len(dirty_nonpriv)

        disparate = is_disparate(n_priv, n_dirty_priv, n_nonpriv, n_dirty_nonpriv)

        frac_priv = n_dirty_priv / n_priv
        frac_nonpriv = n_dirty_nonpriv / n_nonpriv

        print(dataset_name, criteria, name, perc(frac_priv), perc(frac_nonpriv), disparate)

adult sex sd 7.9\% 4.7\% True
adult sex iqr 36.6\% 38.6\% True
adult race sd 7.1\% 5.3\% True
adult race iqr 38.2\% 31.8\% True
folktables sex sd 0.7\% 0.3\% True
folktables sex iqr 0.0\% 0.0\% False
folktables race sd 0.5\% 0.5\% False
folktables race iqr 0.0\% 0.0\% False
heart sex sd 10.2\% 10.0\% False
heart sex iqr 24.1\% 22.9\% True
credit age sd 3.7\% 1.9\% True
credit age iqr 39.7\% 39.2\% False
german age sd 5.1\% 3.7\% False
german age iqr 30.2\% 14.2\% True
