In [1]:
import logging
import warnings

# Hide all Python warnings for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Give the root logger a default handler, then lower its threshold so the
# INFO records emitted while the pipeline runs show up in the cell output.
logging.basicConfig()
logging.root.setLevel(logging.INFO)

## Execute a pipeline and set up a view generator

In [2]:
from freamon.adapters.mlinspect.provenance import from_py_file
# Execute the pipeline script under instrumentation and keep the resulting
# view generator for the queries in the cells below.
view_generator = from_py_file(
    'pipelines--mlinspect--amazon-reviews.py'
)

INFO:root:Patching sys.argv with ['eyes']
INFO:root:Executing instrumented user pipeline with mlinspect
INFO:root:Redirecting the pipeline's stdout to pipeline-output.txt
INFO:root:---RUNTIME: Instrumented execution took 11119.127035140991 ms
INFO:root:Registering test source 2 with columns: ['product_id', 'product_parent', 'product_title', 'category_id', 'mlinspect_lineage_2_0']
INFO:root:
                  CREATE OR REPLACE VIEW _freamon_test_source_2_with_prov_view AS 
                  SELECT 
                  "product_id" AS "product_id", "product_parent" AS "product_parent", "product_title" AS "product_title", "category_id" AS "category_id", "mlinspect_lineage_2_0" AS "prov_id_source_2"
                  FROM _freamon_test_source_2
                
INFO:root:Registering test source 3 with columns: ['id', 'category', 'mlinspect_lineage_3_0']
INFO:root:
                  CREATE OR REPLACE VIEW _freamon_test_source_3_with_prov_view AS 
                  SELECT 
                  "i

In [None]:
# Fetch every row of the test view as a DataFrame; the bare expression makes
# the notebook render it as the cell output.
view_generator.db.execute(
    "SELECT * FROM _freamon_test_view"
).df()

In [None]:
# Inspect test source 0 together with its provenance id column.
# The generated test-view query joins `_freamon_test_source_0_with_prov_view`,
# so that is the view name used here.
# NOTE(review): confirm no un-prefixed `_freamon_source_0_...` view is intended.
view_generator.db.execute(
    "SELECT * FROM _freamon_test_source_0_with_prov_view"
).df()

In [None]:
# Inspect test source 1 together with its provenance id column.
# The views registered by the view generator follow the pattern
# `_freamon_test_source_<id>_with_prov_view`.
# NOTE(review): source 1 is not visible in the captured log — confirm it is
# registered as a test source before relying on this cell.
view_generator.db.execute(
    "SELECT * FROM _freamon_test_source_1_with_prov_view"
).df()

In [None]:
# Inspect test source 2 (product_id, product_parent, product_title,
# category_id) together with its provenance id column `prov_id_source_2`.
# The registration log creates `_freamon_test_source_2_with_prov_view`,
# so that is the view name used here.
view_generator.db.execute(
    "SELECT * FROM _freamon_test_source_2_with_prov_view"
).df()

In [None]:
# Inspect test source 3 (the category lookup) together with its provenance
# id column. The registration log creates
# `_freamon_test_source_3_with_prov_view`, so that is the view name used here.
view_generator.db.execute(
    "SELECT * FROM _freamon_test_source_3_with_prov_view"
).df()

## Generate and materialize a view for data debugging

In [5]:
# Materialize the test view with just the slicing columns plus the true
# labels and the predictions; feature columns are left out.
df = view_generator.test_view(
    sliceable_by=['category', 'marketplace'],
    with_features=False,
    with_y_true=True,
    with_y_pred=True,
)

# Bare expression so the notebook renders the frame as the cell output.
df

INFO:root:
SELECT fs3.category, fs0.marketplace, ftv.y_true, ftv.y_pred
FROM _freamon_test_view ftv
JOIN _freamon_test_source_0_with_prov_view fs0  ON fs0.prov_id_source_0 = ftv.prov_id_source_0
JOIN _freamon_test_source_3_with_prov_view fs3  ON fs3.prov_id_source_3 = ftv.prov_id_source_3
   
        


Unnamed: 0,category,marketplace,y_true,y_pred
0,Digital_Video_Games,US,0,0
1,Digital_Video_Games,US,1,1
2,Digital_Video_Games,US,1,1
3,Digital_Video_Games,US,1,1
4,Digital_Video_Games,US,1,1
...,...,...,...,...
25950,Digital_Video_Games,US,1,1
25951,Digital_Video_Games,US,1,1
25952,Digital_Video_Games,US,0,0
25953,Digital_Video_Games,US,1,1


## Compute group-wise confusion matrix (for fairness metrics) via an aggregation query

In [None]:
# Group-wise confusion matrix over the materialized test view `df`.
# `df` contains only category, marketplace, y_true and y_pred (there is no
# `race` column in this pipeline), so the groups are the slicing columns the
# view was built with. Each SUM counts the rows matching one confusion-matrix
# cell by casting the boolean condition to 0/1.
view_generator.query(
"""
SELECT
    category,
    marketplace,
    SUM(CAST((y_true=1 AND y_pred=1) AS INTEGER)) AS true_positive,
    SUM(CAST((y_true=1 AND y_pred=0) AS INTEGER)) AS false_negative,
    SUM(CAST((y_true=0 AND y_pred=1) AS INTEGER)) AS false_positive,
    SUM(CAST((y_true=0 AND y_pred=0) AS INTEGER)) AS true_negative
FROM df
GROUP BY category, marketplace
"""
)

## Slicefinder via aggregation queries

In [None]:
# Slice statistics (mean / variance of the per-row loss and slice size) over
# the materialized test view `df`.
# `df` contains only category, marketplace, y_true and y_pred (no `race` or
# `sex` columns in this pipeline), so the slices are built from the slicing
# columns the view was created with.
# The hard 0/1 prediction is mapped to a pseudo-probability (0.00001 or
# 0.99999) so the logarithms stay finite; the loss is computed once per row
# in the inner query and aggregated per slice by the outer one.
# NOTE(review): on DuckDB `log` is the base-10 logarithm; use `ln` if a
# natural-log loss is wanted — confirm the backend.
view_generator.query(
"""
SELECT
    category,
    marketplace,
    AVG(loss) AS avg_loss,
    VARIANCE(loss) AS var_loss,
    COUNT(*) AS size
FROM (
    SELECT
        category,
        marketplace,
        -(y_true * log(y_pred_proba) + (1 - y_true) * log(1.0 - y_pred_proba)) AS loss
    FROM (
        SELECT category, marketplace, y_true, IF(y_pred=0, 0.00001, 0.99999) AS y_pred_proba
        FROM df
    )
)
GROUP BY GROUPING SETS ((category, marketplace), (category), (marketplace))
"""
)