In [1]:
from demodq.datasets import Datasets
from demodq.column_errors import detect_missing_values, detect_outliers_sd, detect_outliers_iqr
from demodq.tuple_errors import detect_mislabeled_via_cleanlab, detect_outliers_via_if, detect_mislabeled_via_shapley
from demodq.analysis import analyse_marked, detect_disparate_errors, is_disparate

import warnings
# NOTE(review): this suppresses every warning for the rest of the session, including
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np

def perc(frac):
    """Format a fraction as a percentage string for the result tables.

    Args:
        frac: Fraction to format (e.g. 0.104), or NaN when no fraction is defined.

    Returns:
        ``frac`` itself (NaN) when ``frac`` is NaN; otherwise the percentage rounded
        to one decimal place, followed by a backslash and a percent sign.
    """
    # NaN never compares equal to anything, itself included, so an equality test
    # against NaN can never succeed; an explicit isnan check is required.
    if np.isnan(frac):
        return frac
    # '\\%' is the same two characters the table has always shown (backslash + percent),
    # spelled without relying on an invalid escape sequence.
    return str(round(frac * 100, 1)) + '\\%'

In [3]:
import pandas as pd

# Each case is (dataset name, first attribute, second attribute); the two attributes
# are the ones later handed to `partition_data_by` to split the data into four groups.
cases = [
    ('adult', 'sex', 'race'),
    ('folktables', 'sex', 'race'),
    ('heart', 'sex', 'age@35'),
    ('heart', 'sex', 'age@45'),
    ('heart', 'sex', 'age@55'),
    ('german', 'age', 'sex'),
    ('german', 'age', 'foreign_worker'),
    ('german', 'sex', 'foreign_worker'),
]

In [4]:
from prettytable import PrettyTable

In [5]:
# Per case: fraction of rows flagged by `detect_missing_values` in at least one column,
# reported separately for each of the four groups returned by `partition_data_by`.
frac_missing = PrettyTable()
frac_missing.field_names = ["dataset", "attr1", "attr2", "frac_priv_priv", "frac_priv_dis",
                            "frac_dis_priv", "frac_dis_dis"]

for dataset_name, attr1, attr2 in cases:
    dataset = Datasets.load(dataset_name)

    # Work on a copy and tag every row with a positional id, so a row flagged in
    # several columns is counted only once after the concat below.
    data = dataset.as_df().copy(deep=True)
    data['id'] = range(len(data))

    dirty_slices = [detect_missing_values(data, column)
                    for column in dataset.categorical_columns + dataset.numerical_columns]
    dirty = pd.concat(dirty_slices).drop_duplicates(subset='id')

    # Both calls yield the groups in the same order, which is also the column order of
    # the table: (priv, priv), (priv, nonpriv), (nonpriv, priv), (nonpriv, nonpriv).
    data_parts = dataset.partition_data_by(data, attr1, attr2)
    dirty_parts = dataset.partition_data_by(dirty, attr1, attr2)

    # Attribute names are cut to 7 characters to keep the table narrow.
    row = [dataset_name, attr1[:7], attr2[:7]]
    for data_part, dirty_part in zip(data_parts, dirty_parts):
        group_size = len(data_part)
        # An empty group has no defined fraction; report NaN instead of dividing by zero.
        frac = len(dirty_part) / group_size if group_size else np.nan
        row.append(f"{perc(frac)} ({group_size})")
    frac_missing.add_row(row)

print(frac_missing)

+------------+-------+---------+-----------------+----------------+-----------------+----------------+
|  dataset   | attr1 |  attr2  |  frac_priv_priv | frac_priv_dis  |  frac_dis_priv  |  frac_dis_dis  |
+------------+-------+---------+-----------------+----------------+-----------------+----------------+
|   adult    |  sex  |   race  |  6.0\% (28735)  | 10.4\% (3915)  |  8.8\% (13027)  | 11.2\% (3165)  |
| folktables |  sex  |   race  | 43.5\% (115136) | 45.9\% (71541) | 51.9\% (117473) | 51.4\% (74667) |
|   heart    |  sex  |  age@35 |  0.0\% (45527)  |   0.0\% (3)    |  0.0\% (24469)  |   0.0\% (1)    |
|   heart    |  sex  |  age@45 |  0.0\% (39163)  |  0.0\% (6367)  |  0.0\% (20565)  |  0.0\% (3905)  |
|   heart    |  sex  |  age@55 |  0.0\% (20579)  | 0.0\% (10674)  |  0.0\% (24951)  | 0.0\% (13796)  |
|   german   |  age  |   sex   |   34.4\% (605)  |  28.3\% (205)  |   21.2\% (85)   |  20.0\% (105)  |
|   german   |  age  | foreign |   23.5\% (34)   |  33.2\% (776)  |    33

In [6]:
# Per case and per outlier detector: fraction of rows flagged as outliers, reported
# separately for each of the four groups returned by `partition_data_by`.
frac_outliers = PrettyTable()
frac_outliers.field_names = ["dataset", "attr1", "attr2", "name",
                             "frac_priv_priv", "frac_priv_dis",
                             "frac_dis_priv", "frac_dis_dis"]

# Detectors that are applied column by column to the numerical columns.
column_detectors = {'sd': detect_outliers_sd, 'iqr': detect_outliers_iqr}

for dataset_name, attr1, attr2 in cases:
    dataset = Datasets.load(dataset_name)

    # Work on a copy and tag every row with a positional id, so a row flagged in
    # several columns is counted only once after the concat below.
    data = dataset.as_df().copy(deep=True)
    data['id'] = range(len(data))

    # The partition of the full data does not depend on the detector, so compute it once
    # per case; every detector below (including Isolation Forest) is compared against it.
    data_parts = dataset.partition_data_by(data, attr1, attr2)

    for name in ['sd', 'iqr', 'if']:
        if name in column_detectors:
            detector = column_detectors[name]
            dirty_slices = [detector(data, column) for column in dataset.numerical_columns]
            dirty = pd.concat(dirty_slices).drop_duplicates(subset='id')
        else:
            # Isolation Forest is called once on the whole frame rather than per column.
            dirty = detect_outliers_via_if(data, dataset, 1234)  # TODO: Try different seeds

        # Same group order as `data_parts`, which is also the column order of the table:
        # (priv, priv), (priv, nonpriv), (nonpriv, priv), (nonpriv, nonpriv).
        dirty_parts = dataset.partition_data_by(dirty, attr1, attr2)

        # Attribute names are cut to 7 characters to keep the table narrow.
        row = [dataset_name, attr1[:7], attr2[:7], name]
        for data_part, dirty_part in zip(data_parts, dirty_parts):
            group_size = len(data_part)
            # An empty group has no defined fraction; report NaN instead of dividing by zero.
            frac = len(dirty_part) / group_size if group_size else np.nan
            row.append(f"{perc(frac)} ({group_size})")
        frac_outliers.add_row(row)

print(frac_outliers)

+------------+-------+---------+------+----------------+----------------+----------------+----------------+
|  dataset   | attr1 |  attr2  | name | frac_priv_priv | frac_priv_dis  | frac_dis_priv  |  frac_dis_dis  |
+------------+-------+---------+------+----------------+----------------+----------------+----------------+
|   adult    |  sex  |   race  |  sd  | 8.1\% (28735)  |  6.5\% (3915)  | 5.0\% (13027)  |  3.8\% (3165)  |
|   adult    |  sex  |   race  | iqr  | 37.2\% (28735) | 32.4\% (3915)  | 40.5\% (13027) | 31.0\% (3165)  |
|   adult    |  sex  |   race  |  if  | 0.1\% (28735)  |  3.5\% (3915)  | 0.4\% (13027)  |  8.9\% (3165)  |
| folktables |  sex  |   race  |  sd  | 0.7\% (115136) | 0.6\% (71541)  | 0.3\% (117473) | 0.4\% (74667)  |
| folktables |  sex  |   race  | iqr  | 0.0\% (115136) | 0.0\% (71541)  | 0.0\% (117473) | 0.0\% (74667)  |
| folktables |  sex  |   race  |  if  | 0.3\% (115136) | 2.1\% (71541)  | 0.3\% (117473) | 2.2\% (74667)  |
|   heart    |  sex  |  age@

In [7]:
# Per case and per label-error detector: fraction of rows flagged as mislabeled, reported
# separately for each of the four groups returned by `partition_data_by`.
# NOTE(review): the table keeps the name `frac_outliers`, so it replaces any earlier
# table bound to that name; a dedicated name would be clearer — confirm nothing relies on it.
frac_outliers = PrettyTable()
frac_outliers.field_names = ["dataset", "attr1", "attr2", "name",
                             "frac_priv_priv", "frac_priv_dis",
                             "frac_dis_priv", "frac_dis_dis"]

for dataset_name, attr1, attr2 in cases:
    dataset = Datasets.load(dataset_name)

    data = dataset.as_df().copy(deep=True)
    data['id'] = range(len(data))

    # The partition of the full data does not depend on the detector, so compute it once
    # per case instead of once per detector.
    data_parts = dataset.partition_data_by(data, attr1, attr2)

    for detector, name in [(detect_mislabeled_via_cleanlab, 'cl'), (detect_mislabeled_via_shapley, 'shap')]:
        dirty = detector(data, dataset, 1234)  # TODO: Try different seeds

        # Same group order as `data_parts`, which is also the column order of the table:
        # (priv, priv), (priv, nonpriv), (nonpriv, priv), (nonpriv, nonpriv).
        dirty_parts = dataset.partition_data_by(dirty, attr1, attr2)

        # Attribute names are cut to 7 characters to keep the table narrow.
        row = [dataset_name, attr1[:7], attr2[:7], name]
        for data_part, dirty_part in zip(data_parts, dirty_parts):
            group_size = len(data_part)
            # An empty group has no defined fraction; report NaN instead of dividing by zero.
            frac = len(dirty_part) / group_size if group_size else np.nan
            row.append(f"{perc(frac)} ({group_size})")
        frac_outliers.add_row(row)

print(frac_outliers)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


+------------+-------+---------+------+-----------------+----------------+----------------+----------------+
|  dataset   | attr1 |  attr2  | name |  frac_priv_priv | frac_priv_dis  | frac_dis_priv  |  frac_dis_dis  |
+------------+-------+---------+------+-----------------+----------------+----------------+----------------+
|   adult    |  sex  |   race  |  cl  |  10.0\% (28735) |  7.7\% (3915)  | 5.0\% (13027)  |  3.6\% (3165)  |
|   adult    |  sex  |   race  | shap |  22.7\% (28735) | 13.4\% (3915)  | 8.5\% (13027)  |  5.0\% (3165)  |
| folktables |  sex  |   race  |  cl  |  7.9\% (115136) | 6.4\% (71541)  | 6.6\% (117473) | 5.0\% (74667)  |
| folktables |  sex  |   race  | shap | 12.0\% (115136) | 6.9\% (71541)  | 9.4\% (117473) | 6.2\% (74667)  |
|   heart    |  sex  |  age@35 |  cl  |  22.7\% (45527) |   0.0\% (3)    | 23.3\% (24469) |   0.0\% (1)    |
|   heart    |  sex  |  age@35 | shap |  26.1\% (45527) |   0.0\% (3)    | 30.9\% (24469) |   0.0\% (1)    |
|   heart    |  sex