In [1]:
"""Evaluate preference button classifier on EU sites."""
# TODO: avoid copying from pref_btn_clf_notebook.py

import pandas as pd

from consent.cmp.prefbtn.pref_btn_clf import PrefBtnClf, predict_dataset
from consent.cmp.prefbtn.pref_btn_featurizer import featurize_attr_df, contain_pref_kw, contain_signature_id
from consent.data.eu_cookie_setting import EuCookieSetting
from consent.util.default_path import get_data_dir
from ooutil.file import file_empty
from consent.cmp.prefbtn.lang.americanize import americanize

# Render frames without truncating cell contents or hiding columns.
for _display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [2]:
# Directory of preference-button CSV files under the data dir selected by '2021-11-19'.
# (The loading cell further down globs '*.csv' from this directory.)
pref_btn_dir = get_data_dir('2021-11-19') / 'pref_btn'
# Fail fast if the directory is missing instead of silently globbing nothing later.
assert pref_btn_dir.exists()

In [3]:
# Set of annotated site names from the EU cookie-setting data, requested with pref_btn_only=True.
# Used further down to verify that every annotated site also appears in the loaded CSV data.
# NOTE(review): nocache=True presumably forces a fresh read of the annotations — confirm in EuCookieSetting.
anno_sites = set(EuCookieSetting.get_cookie_settings(nocache=True, pref_btn_only=True).site)

                    site                              opt_page consent_lib  \
33  insightexpressai.com               https://www.kantar.com/         NaN   
36                   NaN         https://www.metoffice.gov.uk/         NaN   
52                   NaN  https://www.tizianafausti.com/us_en/         NaN   
55                   NaN               https://www.twilio.com/         NaN   
58                   NaN           https://wearesocial.com/uk/         NaN   

   opt_setting no_cookie_setting_on_home_page  
33         NaN                           True  
36         NaN                           True  
52         NaN                           True  
55         NaN                           True  
58         NaN                           True  


In [4]:
# Get pref-btn features: read every non-empty CSV in pref_btn_dir and stack them into one frame.
# The glob order is intentionally left untouched: later cells look rows up by their index label,
# which depends on the concatenation order.
pref_data = []
for afile in pref_btn_dir.glob('*.csv'):
    if file_empty(afile):
        print(f'{afile} is empty')
        continue
    pref_data.append(pd.read_csv(afile, index_col=0))

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list; name the real cause.
assert pref_data, f'No non-empty CSV files found in {pref_btn_dir}'

pref_df = pd.concat(pref_data, axis=0, ignore_index=True)
# Rows without a tag name are dropped; `subset` is given as a list, the form pandas documents.
pref_df = pref_df.dropna(subset=['tag_name'])
# Remaining missing attributes become empty strings.
pref_df = pref_df.fillna('')

# Filter duplicate google sites.
# pref_df = pref_df[~pref_df['site'].str.contains('google')]

print("Number of sites:", pref_df.site.nunique())
print(f"Number of samples: {len(pref_df):,d}")
print("Number of pref buttons", len(pref_df[pref_df.pref_btn]))
pref_df.head()

Number of sites: 57
Number of samples: 10,614
Number of pref buttons 65


Unnamed: 0,id,class,tag_name,text_content,inner_text,aria_label,title,href,onclick,pref_btn,site
0,,,A,COVID-19,COVID-19,,,https://www.verywellhealth.com/coronavirus-news-4845451,,False,verywellhealth.com
1,,,A,End of Life Concerns,End of Life Concerns,,,https://www.verywellhealth.com/end-of-life-overview-4581973,,False,verywellhealth.com
2,,grid-nav-link,A,All Topics,All Topics,,,https://www.verywellhealth.com/health-a-z-4014770,,False,verywellhealth.com
3,,,A,Health Care,Health Care,,,https://www.verywellhealth.com/health-care-4014769,,False,verywellhealth.com
4,,link-list-link__deprecated tags-nav-link,A,Best OTC Flu Meds,Best OTC Flu Meds,,,https://www.verywellhealth.com/over-the-counter-cold-and-flu-medications-770618,,False,verywellhealth.com


In [5]:
# Annotated sites that have no rows at all in the loaded CSV data.
miss_sites = anno_sites.difference(pref_df.site)
print("Missing sites:", len(miss_sites), miss_sites)

Missing sites: 0 set()


In [6]:
# Make sure the 'id' and 'class' attribute columns hold strings.
for col in ['id', 'class']:
    pref_df[col] = pref_df[col].astype(str)

sites = pref_df.site.unique()

# Report EVERY site that has no row labelled as a preference button.
# (Previously the loop stopped at the first such site, hiding any others.)
for site in sites:
    if len(pref_df[(pref_df.site == site) & (pref_df.pref_btn)]) == 0:
        print(f'no pref btn for {site}')

# A bare filter for labelled buttons with inner_text longer than 40 characters used to sit here;
# it was not the cell's last expression, so its result was discarded. It has been removed.
print("Number of sites in pref_df:", pref_df.site.nunique())

Number of sites in pref_df: 57


In [7]:
# Attribute columns kept for featurization and evaluation ('consent_lib' is not included).
cols = [
    'site', 'id', 'class', 'inner_text', 'aria_label',
    'title', 'pref_btn', 'onclick', 'href',
]
pr_df = pref_df[cols]
print("Number of sites in pr_df:", pr_df.site.nunique())

# A copy is handed to the featurizer so pr_df itself is not the object it receives.
feat, feat_group_dims = featurize_attr_df(pr_df.copy(), americanize)
print("Feature group dimensions:", feat_group_dims)

# Running totals of the per-group dimensions, e.g. [12, 3, 2] -> [12, 15, 17].
acc_group_dims = [
    sum(feat_group_dims[:end]) for end in range(1, len(feat_group_dims) + 1)
]
acc_group_dims

Number of sites in pr_df: 57
Feature group dimensions: [12, 3, 2]


[12, 15, 17]

In [8]:
# TODO: move these to a util module.
def get_accuracy_top_n(adf, top_n):
    """Return the fraction of sites for which ``pred_one_correct`` succeeds.

    Sites are visited in order of first appearance in ``adf['site']`` so that any
    side effects of ``pred_one_correct`` (it records failures) happen in a
    reproducible order; iterating a ``set`` of strings would vary between runs.

    Args:
        adf: Frame with a ``site`` column plus the columns ``pred_one_correct`` reads.
        top_n: Number of top-ranked rows per site, passed through to ``pred_one_correct``.

    Returns:
        ``n_correct / n_sites`` as a float.

    Raises:
        ValueError: If ``adf`` contains no sites, since the accuracy is then undefined.
    """
    sites = adf['site'].unique()
    if len(sites) == 0:
        raise ValueError('adf contains no sites; top-n accuracy is undefined')

    n_correct = 0
    for site in sites:
        # Evaluate each site on its own rows only.
        if pred_one_correct(adf[adf['site'] == site], top_n):
            n_correct += 1
    return n_correct / len(sites)

def pred_one_correct(adf: pd.DataFrame, top_n):
    """Tell whether any of the ``top_n`` highest-``proba`` rows is a true pref button.

    Side effect: when none of those rows has ``pref_btn == True``, the selected rows
    are appended to the module-level list ``incorrect_dfs``, which must already exist.

    Args:
        adf: Rows to rank; needs ``proba`` and ``pref_btn`` columns.
        top_n: How many of the highest-probability rows to inspect.

    Returns:
        True if at least one inspected row is labelled as a pref button, else False.
    """
    ranked = adf.sort_values(by=['proba'], ascending=False)
    top_rows = ranked.iloc[:top_n]
    # Explicit comparison with True is kept so non-boolean column values are not treated as hits.
    found = bool((top_rows['pref_btn'] == True).any())
    if not found:
        incorrect_dfs.append(top_rows)
    return found

In [9]:
pd.set_option('display.expand_frame_repr', False)
pd.options.display.max_rows = None

# pred_one_correct() appends every failed site's top-n rows to this module-level list.
# It is reset here so re-running the cell does not pile up old failures. Within one run it
# collects the failures of ALL top-n passes below, not just top-1.
incorrect_dfs = []
apr_df = pr_df.copy() # avoid warning
# Map the exact British label 'Customise' to 'Customize' before prediction.
apr_df['inner_text'] = apr_df['inner_text'].map(lambda val: 'Customize' if val == 'Customise' else val)
# The return value is unused; the 'proba' column ranked on below is expected on apr_df after this call.
predict_dataset(PrefBtnClf.get_clf(), apr_df, feat)

accuracies = []
for top_n in [1,3,5,10]:
    accuracy = get_accuracy_top_n(apr_df, top_n=top_n)
    accuracies.append((top_n, accuracy))
print("Accuracy:")
for i, accu in accuracies:
    print(f"Top-{i}: {accu*100:.2f}%")

# pd.concat raises ValueError on an empty list, which would happen when every site is predicted
# correctly; fall back to an empty frame with the same columns instead.
incorrect_df = pd.concat(incorrect_dfs) if incorrect_dfs else apr_df.iloc[0:0]
incorrect_df

Load classifier from /mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-05-21/pref_btn_clf.joblib
Accuracy:
Top-1: 77.19%
Top-3: 85.96%
Top-5: 85.96%
Top-10: 89.47%


Unnamed: 0,site,id,class,inner_text,aria_label,title,pref_btn,onclick,href,pred,proba
1656,greatist.com,,css-14ktbsh,MANAGE SETTINGS,,,False,,,False,0.276555
6673,n4g.com,,si-title si-title-40,Pumpkin Jack Review - Thumb Culture,,,False,,/news/2453706/pumpkin-jack-review-thumb-culture,False,0.030105
10399,babycenter.com,,,Do not sell my personal information,,,False,,/0_notice-to-california-consumers_40006872.bc,False,0.303162
9020,innovid.com,,link-w-caret text-gray-dark text-lg sm:text-xl flex items-center transition hover:text-orange-medium font-sans font-bold text-lg sm:text-xl pb-0 leading-tight mr-3 mr-12 lg:mr-16 xl:mr-0,Learn more,,,False,,https://www.innovid.com/resource/innovid-to-become-publicly-listed-via-a-merger-with-ion-acquisition-corp-2-ltd/,False,0.030105
662,news.com.au,,storyblock_label g_font-base-s updated,UPDATED,,,False,,,False,0.462668
1109,justpremium.com,,custom-link btn border-width-0 btn-arrow btn-pink btn-default btn-icon-right,Learn More,,,False,,http://demo.justpremium.com/,False,0.030105
4743,buzzsumo.com,,c-simple-link-light db py1 f2,Cookie Preferences,,,False,,https://buzzsumo.com/legal/cookies-policy,True,0.861916
7139,audiojungle.net,,shared-cta_button_component__root shared-cta_button_component__large,View all categories,,,False,,/category/all,False,0.030105
5340,one.com,,,Cookie Policy,,,False,,https://www.one.com/en/info/cookie-policy,False,0.276555
9056,google.com.mx,,pHiOh,Privacy,,,False,,https://policies.google.com/privacy?hl=en-GB&fg=1,False,0.030105


In [None]:
# Earlier spot checks, kept for reference:
# pr_df[(pr_df.site == 'daimler.com') & (pr_df.inner_text == 'Settings')]
# pr_df[(pr_df['id'].str.contains('onetrust'))]

# Show all candidate rows whose site name contains "google".
is_google_site = pr_df['site'].str.contains("google")
pr_df[is_google_site]

Unnamed: 0,site,id,class,inner_text,aria_label,title,pref_btn,onclick,href
4657,google.be,,jyfHyd,Customise,,,True,,
4658,google.be,,pHiOh,Carbon neutral since 2007,,,False,,https://sustainability.google/commitments-europe/?utm_source=googlehpfooter&utm_medium=housepromos&utm_campaign=bottom-footer&utm_content=
4659,google.be,,gb_3 gb_4 gb_3d gb_3c,Sign in,,,False,,https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.be/&ec=GAZAmgQ
4660,google.be,,MV3Tnb,Store,,,False,,https://store.google.com/GB?utm_source=hp_header&utm_medium=google_ooo&utm_campaign=GS100042&hl=en-GB
4661,google.be,,pHiOh,Privacy,,,False,,https://policies.google.com/privacy?hl=en-GB&fg=1
4662,google.be,,gb_f,Images,,,False,,https://www.google.co.uk/imghp?hl=en&ogbl
4663,google.be,,MV3Tnb,About,,,False,,https://about.google/?fg=1&utm_source=google-GB&utm_medium=referral&utm_campaign=hp-header
4664,google.be,,gb_C,,Google apps,,False,,https://www.google.co.uk/intl/en/about/products
4665,google.be,,YacQv gsfi,,,,False,,
4666,google.be,,uU7dJb,United Kingdom,,,False,,


In [11]:
# Spot-check the featurizer helpers.
# This call's result is not shown: only the cell's last expression is displayed.
contain_pref_kw(['customize'])

# Look at one concrete row (index label 3347) and run the signature-id check on it.
sample_label = 3347
row = pr_df.loc[sample_label]
print(row)
contain_signature_id(row)

site                          daimler.com
id                                       
class                                    
inner_text                      Locations
aria_label                               
title                                    
pref_btn                            False
onclick                                  
href          /career/about-us/locations/
Name: 3347, dtype: object


0