In [1]:
from collections import defaultdict, Counter
from multiprocessing import Pool
from pathlib import Path
from typing import Dict
import json
import re

from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from consent.consistency.util import SCAN_DIRS, FIG_DIR, SCAN_ROOT_DIR
from consent.data.site_pref import read_site_prefs
from consent.data.pref_menu_scan.prerej_cookie_reader import read_prerej_sent_cookies_in_scans
from consent.cmp.consentlib.onetrust import OptanonConsentCookie
from consent.cmp.consentlib.cookiebot import CookiebotConsentCookie
from consent.util.default_path import get_data_dir
from ooutil.cookie_util import get_brower_cookies
from ooutil.type_util import hashabledict
from ooutil.url_util import get_suffixed_domain


In [2]:
prej_sent_cookies = read_prerej_sent_cookies_in_scans(SCAN_DIRS)
prej_sent_cookies

Num prerej cookie files: 1366
Error loading cookie for site tevapharm.com/prerej_cookies.json: Expecting value: line 1 column 1 (char 0)
Error loading cookie for site rockwellautomation.com/prerej_cookies.json: Expecting value: line 1 column 1 (char 0)
Error loading cookie for site jimmychoo.com/prerej_cookies.json: Extra data: line 531 column 2 (char 74606)
Error loading cookie for site fortinet.com/prerej_cookies.json: 'utf-8' codec can't decode byte 0xac in position 106069: invalid start byte
Error loading cookie for site exoclick.com/prerej_cookies.json: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte
Error loading cookie for site returnpath.com/prerej_cookies.json: Extra data: line 409 column 2 (char 26396)
Error loading cookie for site asda.com/prerej_cookies.json: 'utf-8' codec can't decode byte 0xc2 in position 16654: invalid continuation byte
Error loading cookie for site nasdaq.com/prerej_cookies.json: Extra data: line 641 column 2 (char 52420)
95 sites

Unnamed: 0,domain,expires,httpOnly,name,path,sameSite,secure,value,request_url,site,load_start_time,load_end_time,page_url
0,.accor.com,1.644379e+09,False,OptanonConsent,/,Lax,False,isGpcEnabled=0&datestamp=Fri+Aug+13+2021+04%3A...,https://edg.accor.com/cdx/r20.gif?rnd=1-1-1009...,sofitel.com,1.628827e+09,1.628827e+09,https://sofitel.accor.com/northamerica/index.e...
1,.accor.com,-1.000000e+00,False,incap_ses_8072_2545443,/,,False,ZsRoKdzUIyNdziAdf4EFcNbuFWEAAAAAosZBwXR4jO7FFT...,https://cdn.accor.com/cdx/r20.gif?rnd=1-1-1009...,sofitel.com,1.628827e+09,1.628827e+09,https://sofitel.accor.com/northamerica/index.e...
2,.accor.com,1.660290e+09,True,visid_incap_2545443,/,,False,sMAsu37JTTK9mqn+KtdNOtbuFWEAAAAAQUIPAAAAAADVdq...,https://all.accor.com/rb_80be963f-a859-4808-9b...,sofitel.com,1.628827e+09,1.628827e+09,https://sofitel.accor.com/northamerica/index.e...
3,.accor.com,-1.000000e+00,False,CSESSIONID,/,,True,HZ5AD83FBEC4B14A4D81EB6580528231CF,https://cdn.accor.com/cdx/r20.gif?rnd=1-1-1009...,sofitel.com,1.628827e+09,1.628827e+09,https://sofitel.accor.com/northamerica/index.e...
4,.accor.com,-1.000000e+00,False,CSESSIONID,/,,True,HZ5AD83FBEC4B14A4D81EB6580528231CF,https://cdn.accor.com/cdx/r20-100KB.png?rnd=14...,sofitel.com,1.628827e+09,1.628827e+09,https://sofitel.accor.com/northamerica/index.e...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
266035,.leica-geosystems.com,1.636750e+09,False,_gcl_au,/,,False,1.1.1689705220.1628974156,https://leica-geosystems.com/Themes/Leica/Leic...,leica-geosystems.com,1.628974e+09,1.628974e+09,https://leica-geosystems.com/en-us
266036,leica-geosystems.com,-1.000000e+00,True,ASP.NET_SessionId,/,Lax,False,rfhjajhbw0r1vc54qk5ioryx,https://leica-geosystems.com/Themes/Leica/Leic...,leica-geosystems.com,1.628974e+09,1.628974e+09,https://leica-geosystems.com/en-us
266037,leica-geosystems.com,-1.000000e+00,False,hexagonleica#lang,/,,False,en,https://leica-geosystems.com/-/media/images/gl...,leica-geosystems.com,1.628974e+09,1.628974e+09,https://leica-geosystems.com/en-us
266038,leica-geosystems.com,-1.000000e+00,False,TS01127c47,/,,False,014edcb93343ab3ec57ca1f8d8297819314408c0138658...,https://leica-geosystems.com/Coveo/Hive/js/Cov...,leica-geosystems.com,1.628974e+09,1.628974e+09,https://leica-geosystems.com/en-us


In [3]:
pre_consent_cookies = prej_sent_cookies[prej_sent_cookies.name.isin(['OptanonConsent', 'CookieConsent'])][['domain', 'name', 'value', 'site']].drop_duplicates()
pre_consent_cookies.name.value_counts()

OptanonConsent    675
CookieConsent       9
Name: name, dtype: int64

In [4]:
pre_consent_cookies.head()

Unnamed: 0,domain,name,value,site
0,.accor.com,OptanonConsent,isGpcEnabled=0&datestamp=Fri+Aug+13+2021+04%3A...,sofitel.com
1316,.acquia.com,OptanonConsent,isGpcEnabled=0&datestamp=Thu+Aug+19+2021+10%3A...,agilone.com
2143,.jackinthebox.com,OptanonConsent,isIABGlobal=false&datestamp=Thu+Aug+19+2021+10...,jackinthebox.com
2287,.www.conservation.org,OptanonConsent,isIABGlobal=false&datestamp=Fri+Aug+13+2021+03...,conservation.org
2435,.dialpad.com,OptanonConsent,isIABGlobal=false&datestamp=Thu+Aug+19+2021+14...,uberconference.com


In [5]:
len(pre_consent_cookies)

684

In [6]:
def get_consent(row):
    if row['name'] == 'OptanonConsent':
        cookie_cls = OptanonConsentCookie
    else:
        cookie_cls = CookiebotConsentCookie
    try:
        cookie = cookie_cls(row.to_dict())
        return cookie.get_cat_to_pref()
    except Exception:
        pass
    return None
pre_consent_cookies['consent'] = pre_consent_cookies.apply(get_consent, axis=1)

In [9]:
pre_cookies = pre_consent_cookies[~pre_consent_cookies.consent.isna()]
assert len(pre_cookies) == pre_cookies.site.nunique(), 'some website has 2 consent cookies?'
print("Number of websites with consent cookies set before submission", pre_cookies.site.nunique())

Number of websites with consent cookies set before submission 672


In [None]:
# TODO: improve detection by implementing below

In [41]:
pre_cookies

Unnamed: 0,domain,name,value,site,consent
1,.ratebeer.com,OptanonConsent,isIABGlobal=false&datestamp=Sat+Aug+14+2021+20...,ratebeer.com,"{'1': True, '3': True, 'SPD_BG': False, '4': F..."
55,.adobe.com,OptanonConsent,isIABGlobal=false&datestamp=Fri+Aug+13+2021+13...,adobe.ly,"{'C0001': True, 'C0002': True, 'C0003': True, ..."
390,.bethesda.net,OptanonConsent,isIABGlobal=false&datestamp=Fri+Aug+13+2021+12...,bethsoft.com,"{'C0001': True, 'BG54': True, 'C0003': True, '..."
512,.intrepidtravel.com,OptanonConsent,isIABGlobal=false&datestamp=Fri+Aug+13+2021+13...,intrepidtravel.com,"{'C0003': True, 'C0004': True, 'C0001': True, ..."
629,.perimeterx.com,OptanonConsent,isIABGlobal=false&datestamp=Thu+Aug+12+2021+23...,perimeterx.net,"{'C0004': True, 'C0003': True, 'BG4': True, 'C..."
...,...,...,...,...,...
182319,.ajpmonline.org,OptanonConsent,isIABGlobal=false&datestamp=Fri+Aug+13+2021+03...,ajpmonline.org,"{'1': True, '2': True, '3': True, '4': False}"
182491,.saatchiart.com,OptanonConsent,isIABGlobal=false&datestamp=Thu+Aug+12+2021+23...,saatchiart.com,"{'C0001': True, 'C0003': True, 'SPD_BG': True,..."
182508,.www.visitnorway.com,OptanonConsent,isIABGlobal=false&datestamp=Fri+Aug+13+2021+04...,visitnorway.com,"{'C0001': True, 'C0003': False, 'C0002': False..."
182859,.webengage.com,OptanonConsent,isIABGlobal=false&datestamp=Fri+Aug+13+2021+03...,webengage.com,"{'C0001': True, 'C0002': True, 'C0003': True, ..."


In [38]:
# pre_consent_cookies[~pre_consent_cookies.domain.str.startswith('.')]

In [2]:
scanned_sites = [site_dir.name for site_dir in data_dir.glob('*') if site_dir.is_dir()]
site_dirs = [data_dir / site for site in scanned_sites]
print("Number of scanned sites:", len(scanned_sites)) # , scanned_sites)
site_prefs = read_site_prefs(site_dirs)
site_prefs

Number of scanned sites: 227
Contain duplicates: Before 12937, after drop duplicates: 12927


Unnamed: 0,duration,host,name,group_id,site,consent
0,Session,healthgrades.com,_vapi,C0001,healthgrades.com,True
1,365 days,partners.healthgrades.com,OptanonConsent,C0001,healthgrades.com,True
2,a few seconds,healthgrades.com,_dc_gtm_UA-xxxxxxxx,C0001,healthgrades.com,True
3,2914169 days,partners.healthgrades.com,eupubconsent,C0001,healthgrades.com,True
4,365 days,healthgrades.com,OptanonAlertBoxClosed,C0001,healthgrades.com,True
...,...,...,...,...,...,...
12932,390 days,yieldoptimizer.com,ph,SPD_BG,bestwestern.com,False
12933,390 days,yieldoptimizer.com,dph,SPD_BG,bestwestern.com,False
12934,390 days,yieldoptimizer.com,gcma,SPD_BG,bestwestern.com,False
12935,390 days,yieldoptimizer.com,ckid,SPD_BG,bestwestern.com,False


In [3]:
cookielist_sites = sorted(site_prefs.site.unique().tolist())
n_cookielist_sites = len(cookielist_sites)
assert site_prefs.site.nunique() == n_cookielist_sites
print("Sites with cookie list:", n_cookielist_sites, cookielist_sites)

Sites with cookie list: 83 ['accorhotels.com', 'adtelligent.com', 'apachefriends.org', 'behance.net', 'bestwestern.com', 'biomedcentral.com', 'bitnami.com', 'bodybuilding.com', 'britishcouncil.org', 'callofduty.com', 'canon.com', 'cell.com', 'chanel.com', 'christies.com', 'cloudflare.com', 'cnet.com', 'commonsensemedia.org', 'corel.com', 'digicert.com', 'digitalspy.com', 'documentforce.com', 'download.com', 'elsevier.com', 'elsevierhealth.com', 'exacttarget.com', 'exoclick.com', 'fairmont.com', 'fendi.com', 'force.com', 'gamespot.com', 'glassdoor.co.in', 'glassdoor.com', 'goal.com', 'healthgrades.com', 'home.kpmg', 'hotjar.com', 'irishtimes.com', 'kpmg.com', 'magento.com', 'mango.com', 'marketo.com', 'mcdonalds.com', 'media.net', 'mendeley.com', 'metacritic.com', 'myportfolio.com', 'nature.com', 'netflix.com', 'news24.com', 'nflxvideo.net', 'oclc.org', 'onetrust.com', 'pardot.com', 'payoneer.com', 'pendo.io', 'rapidssl.com', 'redbull.com', 'rte.ie', 'salesforce.com', 'scientificamerica

In [4]:
raw_prerej_sent_cookies_file = data_dir / 'raw_prerej_sent_cookies.parquet'
if raw_prerej_sent_cookies_file.exists():
    raw_prerej_sent_cookies = pd.read_parquet(raw_prerej_sent_cookies_file)
else:
    raw_postrej_sent_cookies = parallel_read_prerej_sent_cookies(data_dir)
    raw_postrej_sent_cookies.to_parquet(raw_prerej_sent_cookies_file); print(f"Written to {raw_prerej_sent_cookies_file}")

raw_prerej_sent_cookies.head(3)

Unnamed: 0,domain,expires,httpOnly,name,path,sameSite,secure,value,request_url,site,load_start_time,load_end_time
0,.siemens.com,1651627000.0,False,OptanonConsent,/,,True,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,https://www.siemens.com/js/chunk-4d7832bc.7be6...,siemens.com,1620091000.0,1620091000.0
1,.siemens.com,1651627000.0,False,OptanonConsent,/,,True,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,https://www.siemens.com/img/arrow_right_black....,siemens.com,1620091000.0,1620091000.0
2,.siemens.com,1651627000.0,False,OptanonConsent,/,,True,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,https://www.siemens.com/locales/en-newhome.json,siemens.com,1620091000.0,1620091000.0


In [5]:
prerej_sent_cookies = raw_prerej_sent_cookies[['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'value', 'request_url', 'site']].drop_duplicates()
prerej_br_cookies = prerej_sent_cookies[['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'site', 'value']].drop_duplicates()

In [6]:
print(f"Num raw captured cookies: {len(raw_prerej_sent_cookies)}")
print(f"Num unique captured cookies: {len(prerej_sent_cookies)}")
print(f"Num unique browser cookies: {len(prerej_br_cookies)}")

Num raw captured cookies: 44861
Num unique captured cookies: 44861
Num unique browser cookies: 6707


## Analyze cookies compliance    

In [7]:
optanons = prerej_br_cookies[prerej_br_cookies.name == 'OptanonConsent'].copy()

In [8]:
def get_cookie_consent_groups(consent_cookie_val):
    try:
        return OptanonConsentCookie(consent_cookie_val).get_group_consent()
    except Exception as e:
        print(f"Error decoding {consent_cookie_val} {e}")
    return {}
optanons['consent_groups'] = optanons.value.map(get_cookie_consent_groups)
optanons

Error decoding isIABGlobal=false&datestamp=Mon+May+03+2021+21%3A28%3A04+GMT-0400+(Eastern+Daylight+Time)&version=6.7.0&hosts=&consentId=aaadbed7-a6e8-4561-8e50-4d50b53ac35b&interactionCount=0&landingPath=https%3A%2F%2Fwww.icims.com%2F argument of type 'NoneType' is not iterable


Unnamed: 0,domain,expires,name,path,sameSite,secure,site,value,consent_groups
0,.siemens.com,1.651627e+09,OptanonConsent,/,,True,siemens.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0002': False, 'C0003': False..."
404,.bonappetit.com,1.651627e+09,OptanonConsent,/,,True,bonappetit.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0005': True, 'C0003': True, ..."
609,.callofduty.com,1.651628e+09,OptanonConsent,/,Lax,False,callofduty.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'1': True, '2': True, '3': True, '4': True}"
1069,.www.akamai.com,1.651628e+09,OptanonConsent,/,Lax,False,akamai.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0002': True, 'C0003': True, ..."
1479,.britishcouncil.org,1.651628e+09,OptanonConsent,/,Lax,False,britishcouncil.org,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0002': True, 'C0003': False,..."
...,...,...,...,...,...,...,...,...,...
43151,.self.com,1.651628e+09,OptanonConsent,/,,True,self.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0005': True, 'C0003': True, ..."
43224,.bodybuilding.com,1.651628e+09,OptanonConsent,/,Lax,False,bodybuilding.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0003': False, 'C0002': False..."
43765,.www.instructure.com,1.651628e+09,OptanonConsent,/,Lax,False,instructure.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0002': True, 'C0004': True, 'C0003': True, ..."
44023,.broadcom.com,1.651628e+09,OptanonConsent,/,Lax,False,broadcom.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'1': True, '3': True, '2': True, '4': True}"


In [9]:
prestore_sites = set(optanons.site.unique())
assert len(optanons) == len(prestore_sites), "There is some sites with duplicate OptanonConsent cookies"
print(f"Num sites with pre-stored pref cookie {len(prestore_sites)}")

Num sites with pre-stored pref cookie 95


In [10]:
def read_scan_result(site_dir):
    site_name = site_dir.name
    scan_file = site_dir / 'scan.json'
    if not scan_file.exists():
        return {}
    return json.loads(scan_file.read_text())

def read_scan_results(site_dirs):
    scans = []
    for site_dir in site_dirs:
        scans.append(read_scan_result(site_dir))
    return pd.DataFrame(scans)

raw_scans = read_scan_results(site_dirs)
scans = raw_scans[ ~raw_scans.failed.isna() & ~raw_scans['category'].isna() ]
scans = scans[ scans.failed == False ]
scans = scans.explode('category').reset_index(drop=True)
print("Number of domains:", scans.domain.nunique())
scans = scans.join(pd.DataFrame(scans['category'].tolist()))
scans

Number of domains: 206


Unnamed: 0,domain,failed,failed_reason,pref_menu_type,category,id,name,prev_status,cur_status
0,siemens.com,False,,accordion,"{'id': 'C0001', 'name': 'Strictly necessary co...",C0001,Strictly necessary cookies,always_active,always_active
1,siemens.com,False,,accordion,"{'id': 'C0002', 'name': 'Performance cookies',...",C0002,Performance cookies,false,false
2,siemens.com,False,,accordion,"{'id': 'C0003', 'name': 'Functionality cookies...",C0003,Functionality cookies,false,false
3,siemens.com,False,,accordion,"{'id': 'C0004', 'name': 'Marketing cookies', '...",C0004,Marketing cookies,false,false
4,healthgrades.com,False,,tab,"{'id': 'C0001', 'name': 'Strictly Necessary Co...",C0001,Strictly Necessary Cookies,unavailable,unavailable
...,...,...,...,...,...,...,...,...,...
790,bestwestern.com,False,,tab,"{'id': 'SPD_BG', 'name': 'Other Third Party Co...",SPD_BG,Other Third Party Cookies,true,false
791,sendinblue.com,False,,tab,"{'id': 'C0001', 'name': 'Strictly Necessary Co...",C0001,Strictly Necessary Cookies,unavailable,unavailable
792,sendinblue.com,False,,tab,"{'id': 'C0003', 'name': 'Functional Cookies', ...",C0003,Functional Cookies,true,false
793,sendinblue.com,False,,tab,"{'id': 'C0002', 'name': 'Performance Cookies',...",C0002,Performance Cookies,true,false


In [11]:
# scans[scans.domain == 'eonline.com']

In [12]:
no_choice_domains = []
for domain, group in scans.groupby('domain'):
    nvalues = group['prev_status'].nunique()
    if nvalues == 1 and group['prev_status'].unique()[0] in ['unavailable', 'always_active']: # , f"{group['prev_status'].unique()=} {domain=}"
        no_choice_domains.append(domain)
n_no_choices = len(no_choice_domains)
domains = set(scans.domain.unique())
n_domains = len(domains)
print(f"Number extracted domains: {n_domains}")
print(f"Number of no choice domains: {n_no_choices} ({n_no_choices / n_domains * 100:.2f}%)")

Number extracted domains: 206
Number of no choice domains: 48 (23.30%)


In [13]:
# assert prestore_sites.issubset(domains)
sites_with_prestore = prestore_sites.intersection(domains)
print(f"Num sites with prestore cookies:", len(sites_with_prestore))

Num sites with prestore cookies: 92


In [14]:
# find domains prestore all accept:
def are_all_accept(consent_groups):
    return all(val == True for val in consent_groups.values())
optanons['all_accept'] = optanons.consent_groups.map(are_all_accept)
optanons

Unnamed: 0,domain,expires,name,path,sameSite,secure,site,value,consent_groups,all_accept
0,.siemens.com,1.651627e+09,OptanonConsent,/,,True,siemens.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0002': False, 'C0003': False...",False
404,.bonappetit.com,1.651627e+09,OptanonConsent,/,,True,bonappetit.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0005': True, 'C0003': True, ...",True
609,.callofduty.com,1.651628e+09,OptanonConsent,/,Lax,False,callofduty.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'1': True, '2': True, '3': True, '4': True}",True
1069,.www.akamai.com,1.651628e+09,OptanonConsent,/,Lax,False,akamai.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0002': True, 'C0003': True, ...",True
1479,.britishcouncil.org,1.651628e+09,OptanonConsent,/,Lax,False,britishcouncil.org,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0002': True, 'C0003': False,...",False
...,...,...,...,...,...,...,...,...,...,...
43151,.self.com,1.651628e+09,OptanonConsent,/,,True,self.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0005': True, 'C0003': True, ...",True
43224,.bodybuilding.com,1.651628e+09,OptanonConsent,/,Lax,False,bodybuilding.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0001': True, 'C0003': False, 'C0002': False...",False
43765,.www.instructure.com,1.651628e+09,OptanonConsent,/,Lax,False,instructure.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'C0002': True, 'C0004': True, 'C0003': True, ...",True
44023,.broadcom.com,1.651628e+09,OptanonConsent,/,Lax,False,broadcom.com,isIABGlobal=false&datestamp=Mon+May+03+2021+21...,"{'1': True, '3': True, '2': True, '4': True}",True


In [15]:
all_accept_domains = set(optanons[optanons.all_accept].site.unique())
print("Num all accept domains:", len(all_accept_domains))

Num all accept domains: 77


In [16]:
choice_domains = set(scans.domain.unique()) - set(no_choice_domains)
print(f"Num domains with choices: {len(choice_domains)}")

sites_with_choice_preaccept = choice_domains.intersection(all_accept_domains)
print(f"Num domains with choices but prestore all accept: {len(sites_with_choice_preaccept)}")

Num domains with choices: 158
Num domains with choices but prestore all accept: 53
