In [1]:
import logging
import warnings

# Attach the default handler to the root logger, then lower its threshold to
# INFO. The level is set in a separate step (not via basicConfig(level=...))
# because basicConfig is a no-op when the root logger already has handlers.
root_logger = logging.getLogger()
logging.basicConfig()
root_logger.setLevel(logging.INFO)

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

## Execute a pipeline and set up a view generator

In [2]:
from freamon.adapters.mlinspect.provenance import from_py_file
# Pipeline script handed to freamon; the returned object is the view generator
# that every later cell queries.
PIPELINE_FILE = 'pipelines--mlinspect--credit.py'
view_generator = from_py_file(PIPELINE_FILE)

INFO:root:Patching sys.argv with ['eyes']
INFO:root:Registering source 0 with columns: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income-per-year', 'mlinspect_lineage_0_0']
INFO:root:
                  CREATE OR REPLACE VIEW _freamon_source_0_with_prov_view AS 
                  SELECT 
                  "age" AS "age", "workclass" AS "workclass", "fnlwgt" AS "fnlwgt", "education" AS "education", "education-num" AS "education-num", "marital-status" AS "marital-status", "occupation" AS "occupation", "relationship" AS "relationship", "race" AS "race", "sex" AS "sex", "capital-gain" AS "capital-gain", "capital-loss" AS "capital-loss", "hours-per-week" AS "hours-per-week", "native-country" AS "native-country", "income-per-year" AS "income-per-year", "mlinspect_lineage_0_0" AS "prov_id_source_0"
                  FROM _freamon_source_0
       

Model accuracy on held-out data 0.818609022556391


## Generate and materialize a view for data debugging

In [3]:
# Request the test view with the slicing attributes plus true and predicted
# labels, leaving the feature columns out.
# NOTE: the result must stay bound to the name `df` -- the SQL in the
# following cells refers to `df` in its FROM clauses.
slice_columns = ['race', 'sex']
df = view_generator.test_view(
    sliceable_by=slice_columns,
    with_y_true=True,
    with_y_pred=True,
    with_features=False,
)

df

INFO:root:
SELECT fs0.race, fs0.sex, ftv.y_true, ftv.y_pred
FROM _freamon_test_view ftv
JOIN _freamon_source_0_with_prov_view fs0  ON fs0.prov_id_source_0 = ftv.prov_id_source_0
   
        


RuntimeError: Binder Error: Values list "ftv" does not have a column named "prov_id_source_0"
LINE 4: ....y_true, ftv.y_pred
FROM _freamon_test_view ftv
JOIN _freamon_source_0_with_prov_view fs0  ON fs0.prov_id_source_0 = ftv.prov_id_source_0
                                                  ^

## Compute group-wise confusion matrix (for fairness metrics) via an aggregation query

In [None]:
# Group-wise confusion matrix: one row per value of `privileged`
# (race = 'White' or not), counting each combination of true and predicted
# label. Each boolean condition is cast to INTEGER so SUM acts as a counter.
# The SQL is kept strictly standard (single '=' comparison, no trailing comma
# before FROM) so it does not depend on lenient parser extensions.
view_generator.query(
"""
SELECT 
    race='White' AS privileged, 
    SUM(CAST((y_true=1 AND y_pred=1) AS INTEGER)) AS true_positive,
    SUM(CAST((y_true=1 AND y_pred=0) AS INTEGER)) AS false_negative,
    SUM(CAST((y_true=0 AND y_pred=1) AS INTEGER)) AS false_positive,
    SUM(CAST((y_true=0 AND y_pred=0) AS INTEGER)) AS true_negative
FROM df
GROUP BY privileged
"""
)

## Slicefinder via aggregation queries

In [None]:
# Slicefinder-style statistics: mean and variance of the per-row log loss and
# the row count for each slice defined by (race='White', sex='Male'),
# (race='White') and (sex='Male').
# The hard predictions are mapped to pseudo-probabilities 0.00001 / 0.99999 so
# the logarithm never receives 0.
# ln() is used for the loss: in DuckDB, log() is the base-10 logarithm, which
# would scale the cross-entropy by a constant factor of 1/ln(10).
view_generator.query(
"""
SELECT 
    race='White' AS white,
    sex='Male' AS male,
    AVG(-(y_true * ln(y_pred_proba) + (1 - y_true) * ln(1.0 - y_pred_proba))) AS avg_loss,
    VARIANCE(-(y_true * ln(y_pred_proba) + (1 - y_true) * ln(1.0 - y_pred_proba))) AS var_loss,
    COUNT(*) as size
    
FROM (SELECT race, sex, y_true, IF(y_pred=0, 0.00001, 0.99999) AS y_pred_proba FROM df)
GROUP BY GROUPING SETS ((race='White', sex='Male'), (race='White'), (sex='Male'))
"""
)