In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


## Import a dataset

In [5]:
from demodq.datasets import Datasets

# Available datasets: 'adult', 'folktables', 'heart', 'credit', 'german'


## Detect dirty records

In [6]:
from demodq.column_errors import detect_missing_values, detect_outliers_sd, detect_outliers_iqr
from demodq.tuple_errors import detect_mislabeled_via_shapley, detect_mislabeled_via_cleanlab, detect_outliers_via_if



In [7]:
from demodq.analysis import is_disparate

In [8]:
# (dataset name, attribute passed to partition_data_by) pairs to analyse.
cases = [
    ('adult', 'sex'),
    ('adult', 'race'),
    ('folktables', 'sex'),
    ('folktables', 'race'),
    ('heart', 'sex'),
    ('german', 'age'),
]

def val_or_zero(key, dictionary):
    """Return ``dictionary[key]`` when ``key`` is present, otherwise ``0.0``.

    Args:
        key: The key to look up.
        dictionary: Container supporting ``in`` and ``[]`` lookups.

    Returns:
        The stored value for ``key``, or the float ``0.0`` if it is absent.
    """
    return dictionary[key] if key in dictionary else 0.0

# For every (dataset, attribute) pair, run the mislabel detector and report how
# the returned records split between label True and label False inside each of
# the two groups produced by partition_data_by.
# NOTE(review): 'priv' / 'dis' presumably mean privileged / disadvantaged, and
# `dirty` presumably holds only the records flagged as mislabeled -- confirm
# against the demodq helpers.
for detector in ['cleanlab']:
    for dataset_name, criteria in cases:
        dataset = Datasets.load(dataset_name)
        df = dataset.as_df()

        # Any detector name other than 'shapley' falls back to cleanlab.
        detect_mislabeled = (
            detect_mislabeled_via_shapley
            if detector == 'shapley'
            else detect_mislabeled_via_cleanlab
        )
        dirty = detect_mislabeled(df, dataset, seed=42)
        dirty['label'] = dataset.extract_label_for_prediction_task(dirty)

        dirty_priv, dirty_dis = dataset.partition_data_by(dirty, criteria)
        counts_priv = dict(dirty_priv.label.value_counts())
        counts_dis = dict(dirty_dis.label.value_counts())

        # Per-group counts of records whose label is True / False.
        positives_priv = val_or_zero(True, counts_priv)
        positives_dis = val_or_zero(True, counts_dis)
        negatives_priv = val_or_zero(False, counts_priv)
        negatives_dis = val_or_zero(False, counts_dis)

        disparate = is_disparate(
            len(dirty_priv), positives_priv,
            len(dirty_dis), positives_dis,
        )

        # Share of each group's records carrying a True (mp) / False (mn) label.
        mp_ratio_priv = positives_priv / len(dirty_priv)
        mp_ratio_dis = positives_dis / len(dirty_dis)
        mn_ratio_priv = negatives_priv / len(dirty_priv)
        mn_ratio_dis = negatives_dis / len(dirty_dis)

        # The two ratio lines are printed for every case; only the trailing
        # line depends on which group has the higher 'mislabeled positive' share.
        report_prefix = (detector, dataset_name, criteria)
        print(*report_prefix, 'mislabeled positive', 'priv', mp_ratio_priv, 'dis', mp_ratio_dis)
        print(*report_prefix, 'mislabeled negative', 'priv', mn_ratio_priv, 'dis', mn_ratio_dis)

        if mp_ratio_priv > mp_ratio_dis:
            print('sig?', disparate)
        else:
            print("....Nope", detector, dataset_name, criteria)
        
        

cleanlab adult sex mislabeled positive priv 0.3180487804878049 dis 0.57543391188251
cleanlab adult sex mislabeled negative priv 0.6819512195121952 dis 0.42456608811748997
....Nope cleanlab adult sex
cleanlab adult race mislabeled positive priv 0.34414256500146073 dis 0.4527363184079602
cleanlab adult race mislabeled negative priv 0.6558574349985393 dis 0.5472636815920398
....Nope cleanlab adult race
cleanlab folktables sex mislabeled positive priv 0.24262105794905103 dis 0.38772019839233796
cleanlab folktables sex mislabeled negative priv 0.757378942050949 dis 0.612279801607662
....Nope cleanlab folktables sex
cleanlab folktables race mislabeled positive priv 0.2739285505466877 dis 0.3503381184007593
cleanlab folktables race mislabeled negative priv 0.7260714494533123 dis 0.6496618815992408
....Nope cleanlab folktables race
cleanlab heart sex mislabeled positive priv 0.5771688613477924 dis 0.5222457627118644
cleanlab heart sex mislabeled negative priv 0.4228311386522076 dis 0.477754237