In [1]:
import logging
import warnings

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

warnings.filterwarnings("ignore")

## Execute a pipeline and set up a view generator

In [2]:
from freamon.adapters.mlinspect.provenance import from_py_file

# Pipeline script handed to freamon; the object returned here is the view
# generator that the following cells query.
pipeline_script = 'pipelines--mlinspect--amazon-reviews.py'
view_generator = from_py_file(pipeline_script)

INFO:root:Patching sys.argv with ['eyes']
INFO:root:Registering source 2 with columns: ['product_id', 'product_parent', 'product_title', 'category_id', 'mlinspect_lineage_2_0']
INFO:root:
                  CREATE OR REPLACE VIEW _freamon_source_2_with_prov_view AS 
                  SELECT 
                  "product_id" AS "product_id", "product_parent" AS "product_parent", "product_title" AS "product_title", "category_id" AS "category_id", "mlinspect_lineage_2_0" AS "prov_id_source_2"
                  FROM _freamon_source_2
                
INFO:root:Registering source 3 with columns: ['id', 'category', 'mlinspect_lineage_3_0']
INFO:root:
                  CREATE OR REPLACE VIEW _freamon_source_3_with_prov_view AS 
                  SELECT 
                  "id" AS "id", "category" AS "category", "mlinspect_lineage_3_0" AS "prov_id_source_3"
                  FROM _freamon_source_3
                
INFO:root:Registering source 1 with columns: ['review_id', 'star_rating', 'helpful_v

Test accuracy 0.8784434598343287


## Internal view over features, labels and predictions with provenance-based FKs to input tables

In [3]:
# Select every row of the internal test view, then convert the result with
# .df() so it is displayed as the cell output.
test_view_result = view_generator.db.execute("SELECT * FROM _freamon_test_view")
test_view_result.df()

Unnamed: 0,features,prov_id_source_0,prov_id_source_1,prov_id_source_2,prov_id_source_3,y_true,y_pred
0,[-1.00789514e-03 -1.15866240e+00 1.00000000e+...,0,46881,0,0,0,0
1,[0.02370451 0.76846785 1. ... 0.123091...,453,74300,0,0,0,0
2,[-0.07514512 0.76846785 1. ... 0. ...,2,51654,2,0,1,1
3,[-1.00789514e-03 -1.15866240e+00 1.00000000e+...,1202,28309,2,0,1,0
4,[-0.07514512 0.76846785 1. ... 0. ...,4853,42667,2,0,1,1
...,...,...,...,...,...,...,...
25950,[-0.0257203 -1.80103915 1. ... 0. ...,63663,61853,3600,0,1,1
25951,[-0.07514512 -1.1586624 1. ... -0.09...,63703,85708,5605,0,1,1
25952,[-0.0010079 0.76846785 1. ... 0. ...,63747,46227,3240,0,0,0
25953,[-0.07514512 -1.1586624 1. ... 0. ...,63775,21482,5608,0,1,1


## Generate and materialize a view for data debugging

In [7]:
# Options for the debugging view: expose the two columns we want to slice by,
# leave out the feature vectors, and keep both true and predicted labels.
debug_view_options = dict(
    sliceable_by=['category', 'star_rating'],
    with_features=False,
    with_y_true=True,
    with_y_pred=True,
)

# The name `df` is referenced by the SQL query in a later cell, so keep it.
df = view_generator.test_view(**debug_view_options)

df

Unnamed: 0,category,star_rating,y_true,y_pred
0,Digital_Video_Games,5,1,1
1,Digital_Video_Games,5,1,1
2,Digital_Video_Games,5,1,1
3,Digital_Video_Games,3,1,1
4,Digital_Video_Games,5,1,1
...,...,...,...,...
25950,Digital_Video_Games,1,1,1
25951,Digital_Video_Games,5,1,1
25952,Digital_Video_Games,5,1,1
25953,Digital_Video_Games,5,1,1


## Data-debugging à la SliceFinder via an aggregation query

In [9]:
# Per-slice loss statistics over `df`: hard predictions are mapped to
# pseudo-probabilities (0.00001 / 0.99999) so the logarithms stay finite, then
# mean loss, loss variance and slice size are reported for each grouping set
# (top_rated x digi_games, top_rated alone, digi_games alone).
# NOTE(review): log() appears to be base 10 here, not the natural log -- the
# overall avg_loss shown below (~0.608) is roughly error-rate x 5. Confirm
# against the SQL engine's documentation if natural-log loss is intended.
slice_loss_query = """
SELECT 
    star_rating > 3 as top_rated,
    category = 'Digital_Video_Games' as digi_games,
    AVG(-(y_true * log(y_pred_proba) + (1 - y_true) * log(1.0 - y_pred_proba))) AS avg_loss,
    VARIANCE(-(y_true * log(y_pred_proba) + (1 - y_true) * log(1.0 - y_pred_proba))) AS var_loss,    
    COUNT(*) as size
    
FROM (SELECT star_rating, category, y_true, IF(y_pred=0, 0.00001, 0.99999) AS y_pred_proba FROM df)
GROUP BY GROUPING SETS ((star_rating > 3, category = 'Digital_Video_Games'), (star_rating > 3), 
    (category = 'Digital_Video_Games'))
"""

view_generator.execute_query(slice_loss_query)

Unnamed: 0,top_rated,digi_games,avg_loss,var_loss,size
0,True,True,0.465961,2.11278,18800
1,False,True,0.980437,3.941461,7155
2,True,,0.465961,2.11278,18800
3,False,,0.980437,3.941461,7155
4,,True,0.607787,2.669612,25955
