In [1]:
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from consent.consistency.comply_util import site_to_contras, get_comply
from consent.consistency.cookie_pref_match import cookie_pref_match
from consent.consistency.util import get_scan_root_dir, get_scan_dirs # , FIG_DIR uncomment to save fig.
from consent.data.pref_menu_scan.cookie_pref_reader import read_cookie_prefs_in_scans
from consent.data.pref_menu_scan.har_cookie_reader import read_postrej_sent_cookies_in_scans

# --- Run configuration -------------------------------------------------------
# Rank range of the scan to analyse. Other values noted by the author:
# '20k_100k', '60k_80k', '40k_60k', '100k_200k'; done: '20k_40k', '0k_20k'.
cur_set = '0k_20k'
# When True, the cached parquet files below are recomputed and rewritten.
overwrite = True

SCAN_ROOT_DIR = get_scan_root_dir('capetown')
SCAN_DIRS = [SCAN_ROOT_DIR.joinpath(f'pref_menu_scan_{cur_set}')]
# SCAN_DIRS = get_scan_dirs('us')[:]
output_suffix = f'_{cur_set}'

for label, value in (
    ('Scan root dir:', SCAN_ROOT_DIR),
    ('Output suffix:', output_suffix),
    ('Scan dirs:', SCAN_DIRS),
):
    print(label, value)

Scan root dir: /mnt/sdd/ducbui/projects/data2/consent/2024-10-12
Output suffix: _0k_20k
Scan dirs: [PosixPath('/mnt/sdd/ducbui/projects/data2/consent/2024-10-12/pref_menu_scan_0k_20k')]


In [2]:
# Cookie prefs = cookie declarations + the preference settings.
cookie_prefs_file = SCAN_ROOT_DIR / f'cookie_prefs{output_suffix}.parquet'

# Reuse the cached parquet only when it exists and no overwrite was requested.
if cookie_prefs_file.exists() and not overwrite:
    cookie_prefs = pd.read_parquet(cookie_prefs_file)
else:
    save_cookie_decls_file = SCAN_ROOT_DIR / f'cookie_decls{output_suffix}.parquet'
    print(f'To save to {save_cookie_decls_file}')
    # Author's note: took 2.3min for top 50k-site scan.
    cookie_prefs = read_cookie_prefs_in_scans(SCAN_DIRS, save_cookie_decls_file)
    cookie_prefs.to_parquet(cookie_prefs_file)
    print(f'Written to {cookie_prefs_file}')
cookie_prefs.head()
# TODO: add log_file_checker

To save to /mnt/sdd/ducbui/projects/data2/consent/2024-10-12/cookie_decls_0k_20k.parquet


Written to /mnt/sdd/ducbui/projects/data2/consent/2024-10-12/cookie_decls_0k_20k.parquet
Written to /mnt/sdd/ducbui/projects/data2/consent/2024-10-12/cookie_prefs_0k_20k.parquet


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
0,.AxcmXFaxAuth,fax.nextiva.com,0 days,C0001,Strictly Necessary Cookies,always active,nextiva.com,onetrust,en.json,True
1,__cf_bm,nextiva.com,0 days,C0001,Strictly Necessary Cookies,always active,nextiva.com,onetrust,en.json,True
2,_ce.cch,nextiva.com,Session,C0001,Strictly Necessary Cookies,always active,nextiva.com,onetrust,en.json,True
3,ao,sso.nextiva.com,0 days,C0001,Strictly Necessary Cookies,always active,nextiva.com,onetrust,en.json,True
4,autolaunch_triggered,sso.nextiva.com,0 days,C0001,Strictly Necessary Cookies,always active,nextiva.com,onetrust,en.json,True


In [None]:
print("Found cookie libraries:")
# Count each site once per library before tallying libraries.
site_lib_pairs = cookie_prefs[['site', 'lib_name']].drop_duplicates()
site_lib_pairs['lib_name'].value_counts()

Found cookie libraries:


lib_name
onetrust     1266
cookiebot     102
Name: count, dtype: int64

In [None]:
# Alphabetical list of every site that has at least one cookie preference row.
cookielist_sites = sorted(cookie_prefs['site'].unique().tolist())
n_cookielist_sites = len(cookielist_sites)
# Sanity check: the list length agrees with pandas' own distinct count.
assert cookie_prefs['site'].nunique() == n_cookielist_sites
n = 10
print("Sites with cookie prefs:", n_cookielist_sites, f', first {n} sites:', cookielist_sites[:n])

Sites with cookie prefs: 1368 , first 10 sites: ['247sports.com', '2k.com', '2ksports.com', 'aao.org', 'aber.ac.uk', 'abercrombie.com', 'acast.com', 'accedo.tv', 'accenture.com', 'accor.com']


In [5]:
# Map each site to its consent library; if a site has several, the last pair wins.
_site_lib_pairs = cookie_prefs[['site', 'lib_name']].drop_duplicates()
site_to_libname = dict(zip(_site_lib_pairs['site'], _site_lib_pairs['lib_name']))

In [6]:
# Quick check of the cookie declaration naming: share of declared names that
# end with a placeholder-looking suffix.
for placeholder_suffix in ('#', 'xxx', 'XXX'):
    has_suffix = cookie_prefs.name.str.endswith(placeholder_suffix)
    print(len(cookie_prefs[has_suffix]) / len(cookie_prefs))
cookie_prefs[cookie_prefs.name.str.endswith('#')]

0.011726616589983684


0.029156683179232385
0.002781901835981714


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
101,visitor_id#####,go.nextiva.com,364 days,C0004,Targeting Cookies,active,nextiva.com,onetrust,en.json,False
24,_gd#############,j.6sc.co,730 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False
30,_gd#############,www.wework.com,730 days,C0003,Functional Cookies,inactive,wework.com,onetrust,en.json,False
34,_gd#############,cdnjs.cloudflare.com,730 days,C0003,Functional Cookies,inactive,wework.com,onetrust,en.json,False
44,AMP_unsent_#,commercial.cdn.aws.placeit.net,Persistent,statistics,statistics,,placeit.net,cookiebot,cc.js,False
...,...,...,...,...,...,...,...,...,...,...
140,visitor_id#####,click.outreach.io,3649 days,4,Targeting Cookies,active,outreach.io,onetrust,en.json,False
147,visitor_id#####,pi.pardot.com,3649 days,4,Targeting Cookies,active,outreach.io,onetrust,en.json,False
170,visitor_id#####,pardot.com,3649 days,4,Targeting Cookies,active,outreach.io,onetrust,en.json,False
172,_gd#############,j.6sc.co,729 days,4,Targeting Cookies,active,outreach.io,onetrust,en.json,False


In [7]:
# # Distribution of errors
# print("Libs in failed-to-crawl sites")
# errors.consent_lib_name.value_counts()
# from ooutil.df_util import expand_dict_col
# ot_errs = errors[errors.consent_lib_name == 'onetrust']
# errors_dict = pd.DataFrame(ot_errs.error.tolist()).explode('reason')
# # Uncomment to show
# # errors_dict.reason.value_counts()

# Main analysis

In [8]:
# import sys; import importlib; importlib.reload(sys.modules['consent.data.pref_menu_scan.postrej_cookie_reader'])
from consent.data.pref_menu_scan.har_cookie_reader import read_postrej_sent_cookies_in_scans

cookies_cache_file = SCAN_ROOT_DIR / f'scan{output_suffix}.parquet'  # 'raw_postrej_sent_cookies.parquet'

# Read from the cache only when it exists and no overwrite was requested;
# otherwise re-read the scans and refresh the cache.
if cookies_cache_file.exists() and not overwrite:
    sent_cookies = pd.read_parquet(cookies_cache_file)
else:
    sent_cookies = read_postrej_sent_cookies_in_scans(SCAN_DIRS)
    sent_cookies.to_parquet(cookies_cache_file)
    print(f"Written to {cookies_cache_file}")

print(f"Number sent cookies read: {len(sent_cookies):,d}")
sent_cookies.head(3)

Unmatch url and consent cookie domain: page_url='https://www.wework.cn/zh-CN/city/shenzhen/?lang=en-us' consent_cookie.cookie['domain']='.www.wework.com' consent_cookie.cookie['path']='/' wework.com/postrej_2.har.xz
Unmatch url and consent cookie domain: page_url='https://refer.wework.com/' consent_cookie.cookie['domain']='.www.wework.com' consent_cookie.cookie['path']='/' wework.com/postrej_3.har.xz
Unmatch url and consent cookie domain: page_url='https://authn.sd00.worldcat.org/wayf/metaauth-ui/cmnd/protocol/samlpost' consent_cookie.cookie['domain']='.oclc.org' consent_cookie.cookie['path']='/' oclc.org/postrej_4.har.xz
Post rej cookie not contain consent cookie, renfe.com/postrej_0.har.xz not have cookie OptanonConsent of .renfe.com on path /
Unmatch url and consent cookie domain: page_url='http://elle.hearstmobile.com/Sweepstakes/' consent_cookie.cookie['domain']='.elle.com' consent_cookie.cookie['path']='/' elle.com/postrej_4.har.xz
Post rej cookie not contain consent cookie, skri

Unnamed: 0,name,value,domain,path,expires,size,httpOnly,secure,session,sameSite,priority,sameParty,sourceScheme,sourcePort,request_url,site,page_url,partitionKey
0,IDE,AHWqTUnn-Ld2kM2VuoIlt7zKqvb4ejlFoUPgzuqHhNtQrz...,.doubleclick.net,/,1763108000.0,67,True,True,False,,Medium,False,Secure,443,https://td.doubleclick.net/td/rul/862413881?ra...,nextiva.com,https://www.nextiva.com/integrations,
1,_vis_opt_s,1%7C,.nextiva.com,/,1737188000.0,14,False,False,False,,Medium,False,Secure,443,https://www.nextiva.com/wp/wp-content/themes/n...,nextiva.com,https://www.nextiva.com/integrations,
2,lidc,"""b=TGST03:s=T:r=T:a=T:p=T:g=3351:u=1:x=1:i=172...",.linkedin.com,/,1728635000.0,112,False,True,False,,Medium,False,Secure,443,https://px.ads.linkedin.com/wa/,nextiva.com,https://www.nextiva.com/integrations,


In [9]:
# prj = post-rejection
# Old way: consider all flows
# prj_sent_cookies = sent_cookies[['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'value', 'request_url', 'site']].drop_duplicates()
# prj_br_cookies = prj_sent_cookies[['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'site']].drop_duplicates()
# New way: consider cookies only (ignore the request url, which is not used in get_comply cookie matching anyway)
prj_cookie_key_cols = ['name', 'domain', 'path', 'site']
prj_sent_cookies = sent_cookies.loc[:, prj_cookie_key_cols].drop_duplicates()

In [10]:
# print(f"Num captured sent cookies: {len(sent_cookies):,d}")
# print(f"Num unique captured cookies: {len(prj_sent_cookies):,d}")
# n_br_cookies = len(prj_br_cookies)
# print(f"Num unique browser cookies: {n_br_cookies:,d} on {prj_br_cookies.site.nunique():,d} websites") # and {sent_cookies.page_url.nunique():,d} pages")

## Analyze cookie compliance

In [None]:
# Sites that have cookie preferences, and how many distinct sent cookies fall on them.
sites = set(cookie_prefs.site)
n_sent_in_pref_sites = int(prj_sent_cookies.site.isin(sites).sum())
print(f"Num sent cookies in the cookie prefs sites: {n_sent_in_pref_sites:,d}")

Num sent cookies in the cookie prefs sites: 49,018


In [12]:
import importlib
import sys

# Pick up edits to comply_util without restarting the kernel.
importlib.reload(sys.modules['consent.consistency.comply_util'])
# reload() re-executes the module body, so module-level objects are created
# anew, while names imported into this notebook earlier keep pointing at the
# pre-reload objects. Re-import everything the notebook uses from the module,
# including `site_to_contras`, so later cells read the instance that the
# reloaded code works with.
# NOTE(review): presumably get_comply() is what fills site_to_contras - confirm.
from consent.consistency.comply_util import get_comply, site_to_contras

print("faster: for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']]:")
# One row per post-rejection cookie, labelled in the `comply` column.
all_complies = get_comply(cookie_prefs, prj_sent_cookies)
all_complies.head()

faster: for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']]:


  1%|          | 7/1368 [00:00<00:46, 29.34it/s]

 88%|████████▊ | 1210/1368 [03:07<00:07, 21.64it/s]

Error fuzzy name match pref_name='mf_[0cc3b612-a97d-4bd0-b496-3ed0686d4603]' cookie_name='panoramaId' bad character range d-4 at position 20
Error fuzzy name match pref_name='mf_[0cc3b612-a97d-4bd0-b496-3ed0686d4603]' cookie_name='_cc_id' bad character range d-4 at position 20
Error fuzzy name match pref_name='mf_[0cc3b612-a97d-4bd0-b496-3ed0686d4603]' cookie_name='__eoi' bad character range d-4 at position 20
Error fuzzy name match pref_name='mf_[0cc3b612-a97d-4bd0-b496-3ed0686d4603]' cookie_name='pxcts' bad character range d-4 at position 20
Error fuzzy name match pref_name='mf_[0cc3b612-a97d-4bd0-b496-3ed0686d4603]' cookie_name='_px3' bad character range d-4 at position 20
Error fuzzy name match pref_name='mf_[0cc3b612-a97d-4bd0-b496-3ed0686d4603]' cookie_name='_ga_64H4Z6TSBF' bad character range d-4 at position 20
Error fuzzy name match pref_name='mf_[0cc3b612-a97d-4bd0-b496-3ed0686d4603]' cookie_name='_pxvid' bad character range d-4 at position 20
Error fuzzy name match pref_name=

100%|██████████| 1368/1368 [03:28<00:00,  6.55it/s]


Unnamed: 0,name,domain,path,site,comply
0,IDE,.doubleclick.net,/,nextiva.com,incorrect
1,_vis_opt_s,.nextiva.com,/,nextiva.com,incorrect
2,lidc,.linkedin.com,/,nextiva.com,incorrect
3,_an_uid,www.nextiva.com,/,nextiva.com,omit
4,_gcl_au,.nextiva.com,/,nextiva.com,incorrect


In [None]:
# Analyze complies here
complies = all_complies[['name', 'domain', 'path', 'site', 'comply']].drop_duplicates()

# Number of distinct sites per compliance type, built once instead of being
# rebuilt on every loop iteration. Unlike the loop version, this also yields a
# (possibly empty) frame when `complies` has no rows.
comply_sites = (
    complies.groupby('comply')['site']
    .nunique()
    .rename_axis('comply_type')
    .reset_index(name='num_sites')
    .sort_values(by=['num_sites'], ascending=False)
)

nsites = all_complies.site.nunique()
comply_counts = complies.comply.value_counts()
comply_sites['num_sites_percent'] = comply_sites['num_sites'] / nsites * 100
comply_sites['num_cookies'] = comply_sites['comply_type'].map(comply_counts)
# Percentages are relative to all labelled cookies, including the 'comply' rows dropped below.
comply_sites['num_cookies_percent'] = comply_sites['num_cookies'] / comply_sites['num_cookies'].sum() * 100  # need to read scan_*.parquet for n_br_cookies, but may be unnecessary
comply_sites['num_cookies_per_site'] = comply_sites['num_cookies'] / comply_sites['num_sites']

# Do not count comply/correct enforcement: which require detecting all possible
# (author's note is truncated here - TODO complete the rationale).
comply_sites = comply_sites[comply_sites.comply_type != 'comply']
comply_sites

Unnamed: 0,comply_type,num_sites,num_sites_percent,num_cookies,num_cookies_percent,num_cookies_per_site
3,omit,1242,91.256429,22938,46.795055,18.468599
2,incorrect,1062,78.03086,14099,28.762903,13.275895
0,ambiguous,58,4.261572,69,0.140765,1.189655


In [None]:
# Record the analysis results
all_complies_file = SCAN_ROOT_DIR / f'all_complies{output_suffix}.parquet'
all_complies.to_parquet(all_complies_file); print(f"Written {len(all_complies):,d} records of all_complies to {all_complies_file}")

Written 49,018 records of all_complies to /mnt/sdd/ducbui/projects/data2/consent/2024-10-12/all_complies_0k_20k.parquet


In [15]:
class StopExecution(Exception):
    """Exception raised to halt notebook execution at a chosen cell."""

    def _render_traceback_(self):
        """Return nothing, so no traceback text is produced for this exception.

        NOTE(review): relies on IPython consulting this hook when it renders an
        exception - confirm for the IPython version in use.
        """
        return None

# Abort a top-to-bottom run at this cell.
raise StopExecution
# NOTE(review): unreachable - the raise above always fires before this line.
import sys; sys.exit(0)

In [16]:
# Peek at the per-cookie compliance labels.
complies.comply

0        incorrect
1             omit
2             omit
3             omit
4        incorrect
           ...    
50516       comply
50517       comply
50518       comply
50519       comply
50520       comply
Name: comply, Length: 50521, dtype: object

In [17]:
# Number of distinct sites that received a compliance label.
nsites = all_complies['site'].nunique()
nsites
# all_complies.site.unique()

1286

In [18]:
# Sites for which site_to_contras holds at least one contradiction entry.
detected_contra_sites = [
    site_name
    for site_name, site_contras in site_to_contras.items()
    if len(site_contras) > 0
]
len(detected_contra_sites)

0

In [19]:
# Re-derive `complies` without the `path` column, so cookies that differ only
# by path collapse into one row before counting.
complies_cols = ['name', 'domain', 'site', 'comply']
complies = all_complies[complies_cols].drop_duplicates()
comply_counts = complies['comply'].value_counts()
comply_counts

omit         26549
incorrect    12978
comply       10703
ambiguous       93
Name: comply, dtype: int64

In [20]:
# Restrict the raw sent cookies to the sites that received a compliance label.
complies_sites = set(all_complies.site)
sent_cookies_com = sent_cookies[sent_cookies.site.isin(complies_sites)]

# A "captured cookie" keeps the value and request URL; a "browser cookie" drops both.
flow_cols = ['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'value', 'request_url', 'site']
browser_cookie_cols = [col for col in flow_cols if col not in ('value', 'request_url')]
prj_sent_cookies_com = sent_cookies_com[flow_cols].drop_duplicates()
prj_br_cookies_com = prj_sent_cookies_com[browser_cookie_cols].drop_duplicates()
print(f"Num captured sent cookies: {len(sent_cookies_com):,d}")
print(f"Num unique captured cookies: {len(prj_sent_cookies_com):,d}")

n_br_cookies_com = len(prj_br_cookies_com)
n_br_sites_com = prj_br_cookies_com.site.nunique()
print(f"Num unique browser cookies: {n_br_cookies_com:,d} on {n_br_sites_com:,d} websites") # and {sent_cookies.page_url.nunique():,d} pages")

Num captured sent cookies: 5,485,046
Num unique captured cookies: 3,341,107
Num unique browser cookies: 133,395 on 1,286 websites


# Analyze contras

In [21]:
# Way 1: compute contra sites by dynamic analysis: this should be lower than statically analyzing prefs
# because we cannot check all combinations of consent modes.
from consent.consistency.comply_util import get_appr_rej_sets
contra_data = []
for asite in cookie_prefs.site.unique():
    site_prefs = cookie_prefs[cookie_prefs.site == asite]
    # Only the third returned value (the contradictions) is used here.
    _, _, contras = get_appr_rej_sets(site_prefs)
    for contra in contras:
        # Tag each contradiction with its site; this mutates the object returned by the helper.
        contra['site'] = asite
        contra_data.append(contra)
# One row per contradiction across all sites.
contra_sites = pd.DataFrame(contra_data)
contra_sites.head()

Unnamed: 0,domain,name,site
0,www.menshealth.com,location_data,menshealth.com
1,www.elle.com,location_data,elle.com
2,www.hottopic.com,__cqact,hottopic.com
3,aljazeera.com,_dlt,aljazeera.com
4,nr-data.net,JSESSIONID,liverpoolfc.com


In [22]:
# Cross-check: sites flagged through site_to_contras must equal the sites found via get_appr_rej_sets.
# NOTE(review): the recorded output shows this assertion failing; the message text also looks truncated.
assert set(detected_contra_sites) == set(contra_sites.site), "this is the same method with above, so the result"

AssertionError: this is the same method with above, so the result

In [None]:
# NOTE(review): a cell displays only its last expression, so the first
# difference below is computed and then discarded.
set(detected_contra_sites) - set(contra_sites.site)
set(contra_sites.site) - set(detected_contra_sites)
# This may be the sites excluded due to testing cookies in the preferences.

{'247sports.com',
 'abbvie.com',
 'acquia.com',
 'adobe.io',
 'adweek.com',
 'aljazeera.com',
 'allaboutvision.com',
 'allegiantair.com',
 'allergan.com',
 'alteryx.com',
 'americanbar.org',
 'aon.com',
 'ariel.co.uk',
 'asos.com',
 'autosport.com',
 'avalara.com',
 'avast.com',
 'avastbrowser.com',
 'barnesandnoble.com',
 'bazaarvoice.com',
 'bbva.com',
 'bd.com',
 'behance.net',
 'belkin.com',
 'betsson.com',
 'bitdefender.com',
 'bitdefender.net',
 'bkstr.com',
 'blackboard.com',
 'bplans.com',
 'brightside.me',
 'bromium-online.com',
 'callofduty.com',
 'cambridge.org',
 'carbonite.com',
 'cheapoair.com',
 'chrysler.com',
 'cisco.com',
 'citationmachine.net',
 'clickmeeting.com',
 'clicktale.net',
 'cloudbees.com',
 'cnet.com',
 'colt.net',
 'commonsensemedia.org',
 'contentsquare.com',
 'conviva.com',
 'creighton.edu',
 'curalate.com',
 'dc.com',
 'dccomics.com',
 'discogs.com',
 'dynamicyield.com',
 'easybib.com',
 'eidos.com',
 'elderscrollsonline.com',
 'evri.com',
 'ew.com',
 

In [None]:
# Way 2: compute contra sites by DataFrame group by
# A (name, domain, site) cookie counts as contradictory when it is declared
# with at least two distinct consent modes and under more than one category.
contra_cookies_dfs = []
for _, same_cookies in cookie_prefs.groupby(['name', 'domain', 'site']):
    consent_modes = same_cookies.consent_mode.unique()
    if len(consent_modes) >= 2 and same_cookies.category.nunique() > 1:  # and 'always active' in consent_modes:
        contra_cookies_dfs.append(same_cookies)

if contra_cookies_dfs:
    contra_cookies = pd.concat(contra_cookies_dfs).drop_duplicates()
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame with the same columns so the cells below still run.
    contra_cookies = cookie_prefs.iloc[0:0].copy()

In [None]:
n_contra_cookie_sites = contra_cookies.site.nunique()
print(f"Number sites with contra cookies: {n_contra_cookie_sites}")

Number sites with contra cookies: 184


In [None]:
# set(contra_cookies.site) - set(contra_sites.site)

In [None]:
# Compare the two contra detections (group-by vs. get_appr_rej_sets).
groupby_contra_sites = set(contra_cookies.site)
n_groupby_contra_sites = contra_cookies.site.nunique()
n_only_in_groupby = len(groupby_contra_sites - set(contra_sites.site))
print("Num contra cookie settings (double categories):", len(contra_cookies), f"settings  on {n_groupby_contra_sites} sites")
print("Contras with 1 always-active:", contra_sites.site.nunique())
print("Contras with both editable:", n_only_in_groupby)

Num contra cookie settings (double categories): 648 settings  on 184 sites
Contras with 1 always-active: 192
Contras with both editable: 0


In [None]:
# Spot check: contradictory declarations on one site.
contra_cookies.loc[contra_cookies['site'] == 'elsevier.nl']

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent


In [None]:
# assert len(detected_contra_sites) <= contra_sites.site.nunique() < len(detected_contra_sites) + 2

In [None]:
# Cookies whose compliance label is 'ambiguous'.
complies.loc[complies['comply'] == 'ambiguous']

Unnamed: 0,name,domain,site,comply
173,TS013559a7,www.mastercard.us,mastercard.com,ambiguous
1236,_qubitTracker,.www.very.co.uk,very.co.uk,ambiguous
2365,G_ENABLED_IDPS,.www.glassdoor.com,glassdoor.com,ambiguous
2372,trs,www.glassdoor.com,glassdoor.com,ambiguous
2375,_gid,.glassdoor.com,glassdoor.com,ambiguous
...,...,...,...,...
59468,ak_bmsc,.mulesoft.com,mulesoft.com,ambiguous
59481,JSESSIONID,training.mulesoft.com,mulesoft.com,ambiguous
60397,TLTSID,.bkstr.com,bkstr.com,ambiguous
60412,dtSa,.bkstr.com,bkstr.com,ambiguous


In [None]:
# Example: the same cookie name declared under two categories on one site.
is_michaels = cookie_prefs['site'] == 'michaels.com'
is_utag_main = cookie_prefs['name'] == 'utag_main'
cookie_prefs.loc[is_michaels & is_utag_main]

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
88,utag_main,.michaels.com,365 days,C0001,Strictly Necessary Cookies,always active,michaels.com,onetrust,en.json,True
148,utag_main,michaels.com,365 days,C0002,Performance Cookies,active,michaels.com,onetrust,en.json,False


In [None]:
# Example: one (name, domain) cookie declared with two different consent modes.
is_statista = cookie_prefs['site'] == 'statista.com'
is_id_cookie = cookie_prefs['name'] == 'id'
cookie_prefs.loc[is_statista & is_id_cookie]

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
74,id,m6r.eu,90 days,C0001,Necessary Cookies,always active,statista.com,onetrust,en.json,True
131,id,m6r.eu,0 days,C0004,Targeting Cookies,inactive,statista.com,onetrust,en.json,False


In [None]:
# contra_cookies.name.value_counts()[:10]
# contra_cookies[contra_cookies.name == '__cf_bm'].reset_index(drop=True) # .site.nunique()

# Analyze ambiguous flows

In [None]:
ambi_cookies = complies.loc[complies.comply == 'ambiguous']
n_ambi_sites = ambi_cookies.site.nunique()
print("Num ambiguous cookies:", len(ambi_cookies), "on num sites:", n_ambi_sites)
# The rows are expected to be unique already; fail loudly if they are not.
n_unique_ambi = len(ambi_cookies.drop_duplicates())
assert len(ambi_cookies) == n_unique_ambi, "there is some duplicate cookies"


## Analyze no-way-to-opt-out

In [None]:
# No-way-to-opt-out (nwtoo): sites whose cookie preferences use exactly one
# consent mode and that mode is 'always active'.
nwtoo_cookies_dfs = []
# Group by the scalar key (not a one-element list) since the key is unused;
# this also avoids the pandas warning about length-1 list groupers.
for _, site_cookies in cookie_prefs.groupby('site'):
    consent_modes = site_cookies.consent_mode.unique().tolist()
    if len(consent_modes) == 1 and consent_modes[0] == 'always active':
        nwtoo_cookies_dfs.append(site_cookies)

if nwtoo_cookies_dfs:
    nwtoo_cookies = pd.concat(nwtoo_cookies_dfs).drop_duplicates()
else:
    # pd.concat raises ValueError on an empty list; when no site qualifies,
    # use an empty frame with the same columns instead.
    nwtoo_cookies = cookie_prefs.iloc[0:0].copy()
nwtoo_cookies.head()

In [None]:
# Number of sites where every declared cookie is 'always active'.
nwtoo_cookies.site.nunique()

# Analyze inconsistent flows/cookies

In [None]:
def plot_inconsis_types(noncomply_sites=None):
    """Bar-plot the percentage of websites affected by each noncompliance type.

    Args:
        noncomply_sites: DataFrame with `comply_type` and `num_sites_percent`
            columns. Defaults to the notebook-level `comply_sites` table, which
            has the 'comply' row filtered out. Previously the function read a
            global named `noncomply_sites` that no cell defines.
    """
    if noncomply_sites is None:
        noncomply_sites = comply_sites

    sns.set(rc={'figure.figsize':(9,9),"font.size":30,"axes.titlesize":30,"axes.labelsize":30},style="white", font_scale = 2.5)

    ax = sns.barplot(x='comply_type', y='num_sites_percent', data=noncomply_sites)
    ax.tick_params(left=True, bottom=True, direction="in")
    ax.set_ylabel('# Websites (%)')
    ax.set_xlabel('Noncompliance Types')
    # Title-case the category labels for display.
    xticks = [x.get_text().title() for x in ax.get_xticklabels()]
    ax.set(xticklabels=xticks)

    # FIG_DIR is not imported by default (see the commented import at the top),
    # so the output path is built only when saving is re-enabled.
    # out_file = FIG_DIR / 'noncomply_types_per_site.png'
    # plt.savefig(out_file, dpi=200, bbox_inches='tight'); print(f"Written to {out_file}")

In [None]:
## Plot distribution of omitted cookie declarations

In [None]:
# Sent cookies labelled 'omit' by get_comply.
is_omitted = complies.comply == 'omit'
omit_complies = complies[is_omitted]  #  & ~complies.name.isin(['OptanonAlertBoxClosed', 'OptanonConsent']) ]
omit_complies.site.nunique()

In [None]:
# The seven sites with the most omitted cookies.
omit_complies.site.value_counts().iloc[:7]

In [None]:
# Top-5 omitted cookie names and the number of sites each one appears on.
# Name the index and the count column explicitly: the column names produced by
# value_counts().to_frame().reset_index() differ between pandas 1.x
# ('index'/'name') and pandas 2.x ('name'/'count'), so renaming 'index' is not
# portable and leaves `cookie_name` missing on pandas 2.x.
omit_cookies = (
    omit_complies.name.value_counts()
    .iloc[:5]
    .rename_axis('cookie_name')
    .reset_index(name='num_occur')
)
omit_cookies['num_sites'] = omit_cookies['cookie_name'].map(lambda cookie_name: omit_complies[omit_complies.name == cookie_name].site.nunique())
print(omit_cookies.head(5)[['cookie_name', 'num_sites']].rename(columns={'cookie_name': 'Cookie Name', 'num_sites': '# Websites'}).to_latex(index=False))

In [None]:
# set(cookielist_sites) - set(omit_complies.site.unique())
# Omitted cookies per site. Rename the Series before to_frame() so the column
# is `num_omit_cookies` on every pandas version; renaming a 'site' column only
# works on pandas 1.x, because 2.x names the counts column 'count'.
omit_counts = omit_complies.site.value_counts().rename('num_omit_cookies').to_frame()
print("Num omitted sites:", omit_complies.site.nunique())
omit_counts.num_omit_cookies.mean(), omit_counts.num_omit_cookies.std(), omit_counts.num_omit_cookies.min(), omit_counts.num_omit_cookies.max()

In [None]:
# Empirical CDF of the number of omitted cookies per website.
sns.set(rc={'figure.figsize':(12,8),"font.size":35,"axes.titlesize":30,"axes.labelsize":50},style="white", font_scale = 4.5)
sns.set_style("ticks", {"xtick.direction": "in", "ytick.direction": "in"})
plt.rcParams.update({'xtick.major.size': 15, 'ytick.major.size': 15})

ax = sns.ecdfplot(data=omit_counts, x='num_omit_cookies', linewidth=8)
ax.set_xlim(0, 350)

plt.xlabel('# Omitted Cookies/Website')
plt.ylabel('CDF')
# out_file = FIG_DIR / 'eu_omit_cookie_cdf.png'
# plt.savefig(out_file, dpi=200, bbox_inches='tight'); print(f"Written to {out_file}")

In [None]:
# Bar chart of the 50 sites with the most omitted cookies.
plot_df = omit_counts.iloc[:50]

sns.set(rc={'figure.figsize':(24,8),"font.size":35,"axes.titlesize":35,"axes.labelsize":25},style="white", font_scale=1.5)
ax = sns.barplot(x=plot_df.index, y=plot_df['num_omit_cookies'])
ticks = plt.xticks(rotation=45, ha='right')

ax.tick_params(left=True, bottom=False, direction="in")
ax.set_ylabel('# Omitted Cookies')
ax.set_xlabel('Website')

# out_file = FIG_DIR / 'omit_site.png'
# plt.savefig(out_file, dpi=200, bbox_inches='tight'); print(f"Written to {out_file}")

In [None]:
def plot_histogram_omit():
    """Bar-plot how many sites share each omitted-cookie count.

    Reads the notebook-level `omit_complies` frame.
    """
    sns.set(rc={'figure.figsize':(9,6),"font.size":25,"axes.titlesize":25,"axes.labelsize":15},style="white", font_scale = 1)
    # First count omitted cookies per site, then count sites per count value.
    cookies_per_site = omit_complies.site.value_counts()
    omit_counts_histo = cookies_per_site.value_counts()
    sns.barplot(x=omit_counts_histo.index, y=omit_counts_histo.values)
    plt.xticks(rotation=45, ha='right')

## Plot distribution of incorrectly enforced cookies

In [None]:
incor_complies = complies[complies.comply == 'incorrect']
# set(cookielist_sites) - set(omit_complies.site.unique())
# Top-50 sites by number of incorrectly enforced cookies.
# The columns keep the names the plotting cells expect: 'index' holds the site
# name and 'site' holds the count (the pandas 1.x result of
# to_frame().reset_index()). Naming them explicitly gives the same layout on
# pandas 2.x, where the implicit names are 'site'/'count' and sorting by
# 'index' would raise KeyError.
incor_counts = (
    incor_complies.site.value_counts()
    .iloc[:50]
    .rename_axis('index')
    .reset_index(name='site')
    .sort_values(by=['site', 'index'], ascending=[False, True])
)
incor_counts.columns

In [None]:
# Bar chart of incorrectly enforced cookies per site.
# In `incor_counts`, column 'index' is the site name and 'site' is the count.
sns.set(rc={'figure.figsize':(24,8),"font.size":35,"axes.titlesize":35,"axes.labelsize":25},style="white", font_scale=1.5)
ax = sns.barplot(data=incor_counts, x='index', y='site')

ticks = plt.xticks(rotation=45, ha='right')

ax.tick_params(left=True, bottom=True, direction="in")
ax.set_ylabel('# Incorrectly Enforced Cookies')
ax.set_xlabel('Website')

# out_file = FIG_DIR / 'incor_site.png'
# plt.savefig(out_file, dpi=200, bbox_inches='tight'); print(f"Written to {out_file}")

In [None]:
# len(incor_counts)

In [None]:
# Rows whose `site` column equals 33 (a hard-coded value picked by the author).
incor_counts.loc[incor_counts['site'] == 33].reset_index(drop=True)

### Top incorrect cookies

In [None]:
# Cookies labelled 'incorrect'.
incor_cookies = complies.loc[complies['comply'] == 'incorrect']

In [None]:
# site = 'onetrust.com'
# complies[complies.site == site].reset_index(drop=True)
# cookie_prefs[(cookie_prefs.site == site) & (cookie_prefs.name == '_gid')]

In [None]:
# Incorrectly enforced cookie names ranked by number of occurrences.
# Name the index and the count column explicitly rather than renaming the
# implicit 'index'/'name' columns: on pandas 2.x those are 'name'/'count', so
# the old rename labelled the cookie-name column 'num_occur' and never created
# 'cookie_name'.
top_incor_cookies = (
    incor_cookies.name.value_counts()
    .sort_values(ascending=False)
    .rename_axis('cookie_name')
    .reset_index(name='num_occur')
)

In [None]:
def _num_incorrect_sites(cookie_name):
    """Count the distinct sites on which `cookie_name` is labelled 'incorrect'."""
    return incor_cookies[incor_cookies.name == cookie_name].site.nunique()


top_incor_cookies['num_sites'] = top_incor_cookies.cookie_name.map(_num_incorrect_sites)

In [None]:
# Emit the top-5 incorrectly enforced cookies as a LaTeX table.
latex_headers = {'cookie_name': 'Cookie Name', 'num_sites': '# Websites'}
top5_incor = top_incor_cookies.head(5)[['cookie_name', 'num_sites']].rename(columns=latex_headers)
print(top5_incor.to_latex(index=False))

# Narrow cookie consent preferences

In [None]:
# Consent-record cookies, one row per (domain, name, site).
CONSENT_RECORD_COOKIE_NAMES = ['OptanonConsent', 'CookieConsent']
is_consent_record = sent_cookies.name.isin(CONSENT_RECORD_COOKIE_NAMES)
consent_cookies = sent_cookies.loc[is_consent_record, ['domain', 'name', 'site']].drop_duplicates()

# "Narrow" = the cookie domain has no leading dot.
# NOTE(review): presumably this means host-only cookies - confirm.
has_leading_dot = consent_cookies.domain.str.startswith('.')
narrow_consent_cookies = consent_cookies[~has_leading_dot].reset_index(drop=True)

In [None]:
# Sites that have both contradictory declarations and a narrow consent cookie.
set(contra_cookies.site) & set(narrow_consent_cookies.site)

In [None]:
# Show the consent cookies whose domain has no leading dot.
narrow_consent_cookies

# Case studies

In [None]:
# Sites with the most incorrectly enforced cookies.
incor_df = complies.loc[complies['comply'] == 'incorrect']
incor_site_counts = incor_df.site.value_counts().sort_values(ascending=False)
incor_site_counts.head()

In [None]:
# Case study: all incorrectly enforced cookies of one site.
case_site = 'blabbermouth.net'
incor_df.loc[incor_df['site'] == case_site]

In [None]:
# cookie_prefs[(cookie_prefs.site == 'scientificamerican.com') & (cookie_prefs.name == 'MXP_TRACKINGID')]
# How the case-study site declares the `IDE` cookie.
is_case_site = cookie_prefs['site'] == case_site
is_ide_cookie = cookie_prefs['name'] == 'IDE'
cookie_prefs.loc[is_case_site & is_ide_cookie]

In [None]:
# sent_cookies[(sent_cookies.site == 'scientificamerican.com') & (sent_cookies.name == 'MXP_TRACKINGID')]

In [None]:
# Sites ranked 21st-30th by number of omitted cookies.
omit_df = complies.loc[complies['comply'] == 'omit']
omit_site_counts = omit_df.site.value_counts().sort_values(ascending=False)
omit_site_counts.iloc[20:30]

In [None]:
# Names of all omitted cookies on the case-study site.
sciam_omit_names = omit_df.loc[omit_df['site'] == 'scientificamerican.com', 'name'].tolist()
print(sciam_omit_names)

In [None]:
# Is the `uids` cookie among the omitted cookies of the case-study site?
'uids' in omit_df.loc[omit_df['site'] == 'scientificamerican.com', 'name'].tolist()

In [None]:
# Omitted `cftoken` rows on the case-study site.
is_sciam_omit = omit_df['site'] == 'scientificamerican.com'
is_cftoken = omit_df['name'] == 'cftoken'
omit_df.loc[is_sciam_omit & is_cftoken]

In [None]:
# How the case-study site declares the `ADGRX_UID` cookie.
is_sciam_pref = cookie_prefs['site'] == 'scientificamerican.com'
is_adgrx_uid = cookie_prefs['name'] == 'ADGRX_UID'
cookie_prefs.loc[is_sciam_pref & is_adgrx_uid]

In [None]:
# Sites on which not a single sent cookie was labelled 'comply'.
all_sites = set(complies.site)
sites_with_comply = set(complies.loc[complies.comply == 'comply', 'site'])
nothing_comply_sites = all_sites.difference(sites_with_comply)
print("Sites with no compliant cookies:", nothing_comply_sites)

In [None]:
# Pick the site to inspect; the loop only prints it and leaves `site` bound to the last element.
for site in ['liveperson.com']: # nothing_comply_sites:
    print(site)
# NOTE(review): `postrej_br_cookies` is not defined in any visible cell, so this line
# raises NameError on a fresh kernel. `prj_br_cookies_com` or `prj_sent_cookies` may be
# the intended frame - confirm before changing.
site_cookies = postrej_br_cookies[postrej_br_cookies.site == site]

In [None]:
# First `OptanonAlertBoxClosed` cookie captured on the inspected site.
is_alert_box_closed = site_cookies['name'] == 'OptanonAlertBoxClosed'
acookie = site_cookies.loc[is_alert_box_closed].iloc[0]

In [None]:
# Preferences of the inspected site, and the sent cookies whose names match
# declarations with a truthy `consent` flag.
# Note: `consent_cookies` is rebound here from the consent-record cookie frame
# to this site's consented preference rows.
site_prefs = cookie_prefs.loc[cookie_prefs['site'] == site]
consent_cookies = site_prefs.loc[site_prefs['consent']]
consent_cookie_names = consent_cookies['name']
site_cookies.loc[site_cookies['name'].isin(consent_cookie_names)]
# cookie_name = 'OptanonAlertBoxClosed'
# cookie_pref = site_prefs[site_prefs.name == cookie_name].iloc[0]
# cookie_pref

In [None]:
# Show the current `consent_cookies` frame.
consent_cookies

In [None]:
# Reload the matcher so edits to it take effect without restarting the kernel, then re-import it.
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.cookie_pref_match'])
from consent.consistency.cookie_pref_match import cookie_pref_match
# NOTE(review): `cookie_pref` is only assigned in a commented-out line of an earlier cell,
# so this call raises NameError unless that line is re-enabled.
cookie_pref_match(acookie, cookie_pref, site, verbose=2)

In [None]:
# Analyze Contras