In [47]:
"""Analyze cookie preference menu."""

import pandas as pd

from consent.util.default_path import get_data_dir
from ooutil.df_util import read_jsons_in_dir


data_dir = get_data_dir('2021-03-18') / 'pref_menu_scan'
assert data_dir.exists()

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

In [48]:
prefs = read_jsons_in_dir(data_dir)
prefs.head()

Read 2997 files.


Unnamed: 0,domain,pref_menu_elem,failed,pref_menu_type,category,cookie_check,failed_reason
0,home.blog,,True,,,,
1,python.ca,,True,,,,
2,elitedaily.com,,True,,,,
3,iata.org,,True,,,,
4,jagranjosh.com,,True,,,,


In [49]:
category_data = []
for _, row in prefs.dropna(subset=['category']).iterrows():
    for category in row['category']:
        category['domain'] = row['domain']
        category_data.append(category)
categories = pd.DataFrame(category_data)

In [50]:
print("Num successfully extracted: ", categories.domain.nunique())

Num successfully extracted:  184


In [51]:
prefs[~prefs.category.isna()].reset_index().head()

Unnamed: 0,index,domain,pref_menu_elem,failed,pref_menu_type,category,cookie_check,failed_reason
0,9,hindawi.com,onetrust,,tab,"[{'id': 'C0001', 'name': 'Strictly Necessary Cookies', 'cur_status': 'always_active', 'next_status': 'true', 'domain': 'hindawi.com'}, {'id': 'C0002', 'name': 'Performance Cookies', 'cur_status': 'true', 'next_status': 'false', 'domain': 'hindawi.com'}, {'id': 'C0004', 'name': 'Targeting Cookies', 'cur_status': 'true', 'next_status': 'false', 'domain': 'hindawi.com'}]",True,
1,18,nflxvideo.net,onetrust,,tab,"[{'id': 'C0001', 'name': 'Essential Cookies', 'cur_status': 'always_active', 'next_status': 'true', 'domain': 'nflxvideo.net'}, {'id': 'C0002', 'name': 'Performance and Functionality Cookies', 'cur_status': 'always_active', 'next_status': 'true', 'domain': 'nflxvideo.net'}, {'id': 'C0004', 'name': 'Advertising Cookies', 'cur_status': 'true', 'next_status': 'false', 'domain': 'nflxvideo.net'}]",True,
2,26,bitnami.com,onetrust,,tab,"[{'id': 'C0001', 'name': 'Strictly Necessary', 'cur_status': 'always_active', 'next_status': 'true', 'domain': 'bitnami.com'}, {'id': 'C0002', 'name': 'Performance', 'cur_status': 'true', 'next_status': 'false', 'domain': 'bitnami.com'}, {'id': 'C0004', 'name': 'Advertising', 'cur_status': 'true', 'next_status': 'false', 'domain': 'bitnami.com'}, {'id': 'C0005', 'name': 'Social Media', 'cur_status': 'true', 'next_status': 'false', 'domain': 'bitnami.com'}]",True,
3,35,mailchi.mp,onetrust,,tab,"[{'id': '1', 'name': 'Essential Website Cookies', 'cur_status': 'always_active', 'next_status': 'true', 'domain': 'mailchi.mp'}, {'id': '2', 'name': 'Performance and Functionality Cookies', 'cur_status': 'false', 'next_status': 'false', 'domain': 'mailchi.mp'}, {'id': '4', 'name': 'Advertising (Targeting) Cookies', 'cur_status': 'false', 'next_status': 'false', 'domain': 'mailchi.mp'}, {'id': '3', 'name': 'Analytics and Customization Cookies', 'cur_status': 'false', 'next_status': 'false', '...",True,
4,39,illinois.edu,onetrust,,accordion,"[{'id': 'C0001', 'name': 'Strictly Necessary Cookies', 'cur_status': 'always_active', 'next_status': 'true', 'domain': 'illinois.edu'}, {'id': 'C0002', 'name': 'Performance Cookies', 'cur_status': 'always_active', 'next_status': 'true', 'domain': 'illinois.edu'}, {'id': 'C0003', 'name': 'Functional Cookies', 'cur_status': 'always_active', 'next_status': 'true', 'domain': 'illinois.edu'}, {'id': 'C0004', 'name': 'Targeting Cookies', 'cur_status': 'always_active', 'next_status': 'true', 'domai...",True,


In [52]:
categories.head()

Unnamed: 0,id,name,cur_status,next_status,domain
0,C0001,Strictly Necessary Cookies,always_active,True,hindawi.com
1,C0002,Performance Cookies,true,False,hindawi.com
2,C0004,Targeting Cookies,true,False,hindawi.com
3,C0001,Essential Cookies,always_active,True,nflxvideo.net
4,C0002,Performance and Functionality Cookies,always_active,True,nflxvideo.net


In [53]:
n_domains = categories.domain.nunique()
def all_choices_always_active(adf):
    return adf.cur_status.nunique() == 1 and adf.cur_status.unique()[0] == 'always_active'
all_actives = []

def get_all_choices_aa_domains():
    grouped = categories.groupby('domain')
    for name, group in grouped:
        if all_choices_always_active(group):
            yield name
all_choices_aa_domains = list(get_all_choices_aa_domains())
n_all_aa = len(all_choices_aa_domains)
print(f"Num domains that have all-choice-always-active: {n_all_aa} {n_all_aa / n_domains * 100:.2f}%")
all_choices_aa_domains[:3]

Num domains that have all-choice-always-active: 52 28.26%


['al.com', 'allure.com', 'architecturaldigest.com']

In [54]:
pref_vc = prefs.pref_menu_elem.value_counts()
pref_vc

onetrust           257
onetrust_legacy     23
cookiebot           11
quantcast            8
sourcepoint          6
trustarc             2
Name: pref_menu_elem, dtype: int64

In [55]:
n_detects = categories.domain.nunique()
n_sdk = pref_vc['onetrust'] + pref_vc['onetrust_legacy']
print(f"Num extracted over all detected onetrust: {n_detects}/{n_sdk} {n_detects/n_sdk*100:.2f}%", )

Num extracted over all detected onetrust: 184/280 65.71%


In [57]:
prefs[(prefs.category.isna()) & (prefs.pref_menu_elem == 'onetrust')].reset_index().head(50)

Unnamed: 0,index,domain,pref_menu_elem,failed,pref_menu_type,category,cookie_check,failed_reason
0,48,hypebeast.com,onetrust,True,,,,
1,55,nydailynews.com,onetrust,True,,,,
2,78,crunchyroll.com,onetrust,True,,,,
3,90,redbull.com,onetrust,True,,,,
4,95,cc.com,onetrust,True,,,,
5,118,azlyrics.com,onetrust,True,,,,
6,133,codecademy.com,onetrust,True,,,,
7,143,prnewswire.com,onetrust,True,,,,
8,174,sheknows.com,onetrust,True,,,,
9,186,247sports.com,onetrust,True,,,,
