In [1]:
# Load the jupysql extension, which provides the %sql / %%sql magics used by
# the query cells below.
%load_ext sql

# Configure jupysql before the first query runs:
#   autopandas -> query results come back as pandas DataFrames (later cells
#                 assign %sql results to variables and filter/iterate them)
#   feedback / displaycon -> turned off to keep cell output less verbose
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Connect jupysql to an in-memory DuckDB database; the queries below read
# their inputs straight from CSV files referenced by name.
%sql duckdb:///:memory:

In [25]:
import duckdb

In [6]:
from demodq.analysis import is_disparate

In [49]:
filtered_errors = %sql SELECT * FROM cleanml_errors.csv WHERE (attr2 != 'age@55') AND NOT (dataset = 'german' AND attr1='sex')

In [50]:
%sql SELECT * from cleanml_groups.csv

Unnamed: 0,dataset,attr1,attr2,pp_count,pd_count,dp_count,dd_count
0,adult,sex,race,28735,3915,13027,3165
1,folktables,sex,race,115136,71541,117473,74667
2,heart,sex,age@45,39163,6367,20565,3905
3,heart,sex,age@55,20579,10674,24951,13796
4,german,age,sex,605,205,85,105
5,german,age,foreign_worker,34,776,3,187
6,german,sex,foreign_worker,30,660,7,303


In [51]:
errors_and_counts = %sql SELECT f.dataset, f.attr1, f.attr2, error_type, pp_dirty, dd_dirty, pp_count, dd_count \
FROM filtered_errors AS f, cleanml_groups.csv AS g \
WHERE f.dataset = g.dataset AND f.attr1=g.attr1 AND f.attr2=g.attr2

In [52]:
# Display the joined frame: one row per (dataset, attr1, attr2, error_type)
# carrying the dirty counts alongside the matching group sizes.
errors_and_counts

Unnamed: 0,dataset,attr1,attr2,error_type,pp_dirty,dd_dirty,pp_count,dd_count
0,adult,sex,race,missing-values,1715,353,28735,3165
1,folktables,sex,race,missing-values,50027,38371,115136,74667
2,heart,sex,age@45,missing-values,0,0,39163,3905
3,german,age,sex,missing-values,208,21,605,105
4,german,age,foreign_worker,missing-values,8,38,34,187
5,adult,sex,race,outliers-sd,2321,119,28735,3165
6,adult,sex,race,outliers-iqr,10694,982,28735,3165
7,adult,sex,race,outliers-if,16,281,28735,3165
8,folktables,sex,race,outliers-sd,801,312,115136,74667
9,folktables,sex,race,outliers-iqr,0,0,115136,74667


In [54]:
# For each error type, walk the matching rows of `errors_and_counts` and print
# paste-ready Python: one '#' comment line per row with the is_disparate()
# verdict, then two list assignments (<error_type>_pp / <error_type>_dd)
# holding the percentage of dirty tuples per row. Rows that are not flagged
# as disparate contribute 0.0 to both lists.
for et in ['missing-values', 'outliers-sd', 'outliers-iqr', 'outliers-if', 'mislabels-cl']:
    counts = errors_and_counts[errors_and_counts.error_type == et]

    ratios_pp, ratios_dd = [], []
    for _, row in counts.iterrows():
        num_pp, dirty_pp = row['pp_count'], row['pp_dirty']
        num_dd, dirty_dd = row['dd_count'], row['dd_dirty']

        # Argument order is (pp total, pp dirty, dd total, dd dirty); an
        # earlier version of this call passed
        # len(data_priv), len(dirty_priv), len(data_nonpriv), len(dirty_nonpriv).
        disparate = is_disparate(num_pp, dirty_pp, num_dd, dirty_dd)

        # Only report a percentage when the row is flagged; otherwise 0.0.
        ratios_pp.append(dirty_pp / num_pp * 100 if disparate else 0.0)
        ratios_dd.append(dirty_dd / num_dd * 100 if disparate else 0.0)

        print('#', row['dataset'], row['attr1'], row['attr2'], row['error_type'], disparate)

    # Dashes are swapped for underscores so the printed names are valid
    # Python identifiers.
    print('#----')
    print(f"{et.replace('-', '_')}_pp", '=', ratios_pp)
    print(f"{et.replace('-', '_')}_dd", '=', ratios_dd)
    print('#----')
    

# adult sex race missing-values True
# folktables sex race missing-values True
# heart sex age@45 missing-values False
# german age sex missing-values True
# german age foreign_worker missing-values False
#----
missing_values_pp = [5.9683313032886725, 43.4503543635353, 0.0, 34.3801652892562, 0.0]
missing_values_dd = [11.153238546603475, 51.38950272543427, 0.0, 20.0, 0.0]
#----
# adult sex race outliers-sd True
# folktables sex race outliers-sd True
# heart sex age@45 outliers-sd True
# german age sex outliers-sd False
# german age foreign_worker outliers-sd False
#----
outliers_sd_pp = [8.077257699669392, 0.6956989994441356, 10.849526338635958, 0.0, 0.0]
outliers_sd_dd = [3.7598736176935232, 0.4178552774317972, 6.837387964148528, 0.0, 0.0]
#----
# adult sex race outliers-iqr True
# folktables sex race outliers-iqr False
# heart sex age@45 outliers-iqr True
# german age sex outliers-iqr True
# german age foreign_worker outliers-iqr True
#----
outliers_iqr_pp = [37.21593875065252, 0.0, 2