In [1]:
from collections import defaultdict

from tqdm import tqdm
import pandas as pd
import seaborn as sns

from consent.consistency.util import get_scan_root_dir, get_scan_dirs # , FIG_DIR uncomment to save fig.
from ooutil.df_util import read_data_files

SCAN_DIRS = get_scan_dirs('eu')
SCAN_ROOT_DIR = get_scan_root_dir('eu')

In [2]:
cookie_prefs_files = list(SCAN_ROOT_DIR.glob('cookie_prefs_*.parquet'))
print(sorted(f.name for f in cookie_prefs_files))
cookie_prefs = read_data_files(cookie_prefs_files) # pd.read_parquet(cookie_prefs_file)
cookie_prefs.head()

['cookie_prefs_0k_20k.parquet', 'cookie_prefs_100k_200k.parquet', 'cookie_prefs_20k_40k.parquet', 'cookie_prefs_40k_60k.parquet', 'cookie_prefs_60k_80k.parquet', 'cookie_prefs_80k_100k.parquet']


100%|██████████| 6/6 [00:00<00:00,  9.85it/s]


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
0,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
1,OptanonAlertBoxClosed,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
2,OptanonConsent,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
3,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
4,we_referring_domain,www.wework.com,14 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True


In [3]:
all_complies_files = list(SCAN_ROOT_DIR.glob('all_complies_*.parquet'))
print(sorted(f.name for f in all_complies_files))
all_complies = read_data_files(all_complies_files)
all_complies.head()

['all_complies_0k_20k.parquet', 'all_complies_100k_200k.parquet', 'all_complies_20k_40k.parquet', 'all_complies_40k_60k.parquet', 'all_complies_60k_80k.parquet', 'all_complies_80k_100k.parquet']


100%|██████████| 6/6 [00:03<00:00,  1.54it/s]


Unnamed: 0,name,domain,path,site,comply,expires,sameSite,secure,value,request_url
0,OptanonAlertBoxClosed,.peoplecert.org,/,peoplecert.org,comply,,,,,
1,nlbi_1958344,.peoplecert.org,/,peoplecert.org,comply,,,,,
2,_hjAbsoluteSessionInProgress,.peoplecert.org,/,peoplecert.org,omit,,,,,
3,_gat_gtag_UA_111398711_1,.peoplecert.org,/,peoplecert.org,incorrect,,,,,
4,_hjIncludedInPageviewSample,www.peoplecert.org,/,peoplecert.org,omit,,,,,


In [4]:
site_to_libname = {row['site']: row['lib_name'] for row in cookie_prefs[['site', 'lib_name']].drop_duplicates().to_dict('records')}
all_complies['lib_name'] = all_complies.site.map(site_to_libname)
all_complies_libs = all_complies.drop_duplicates(subset='site').lib_name.value_counts()
print(all_complies_libs.sum())
all_complies_libs

6050


onetrust     5219
cookiebot     778
termly         53
Name: lib_name, dtype: int64

In [5]:
vc = all_complies[['site', 'lib_name']].drop_duplicates().lib_name.value_counts()
total = vc.sum()
cmp_dist = vc.to_frame().reset_index().rename(columns={'index': 'lib_name', 'lib_name': 'count'})
cmp_dist['percent'] = cmp_dist['count'] / total * 100
cmp_dist

Unnamed: 0,lib_name,count,percent
0,onetrust,5219,86.264463
1,cookiebot,778,12.859504
2,termly,53,0.876033


In [6]:
# Sample 30 sites, stratified on cookie libs
inconsis_sites = all_complies[all_complies.comply == 'incorrect'][['site', 'lib_name']].drop_duplicates()
print(len(inconsis_sites))
inconsis_sites.head()

4976


Unnamed: 0,site,lib_name
3,peoplecert.org,onetrust
60,opentechalliance.com,onetrust
103,forgerock.com,onetrust
109,airship.com,onetrust
119,berluti.com,onetrust


In [7]:
# Distribution of incorrect CMPs 
vc = inconsis_sites.lib_name.value_counts()
total = vc.sum()
inconsis_dist = vc.to_frame().reset_index().rename(columns={'index': 'lib_name', 'lib_name': 'count'})
inconsis_dist['percent'] = inconsis_dist['count'] / total * 100
inconsis_dist

Unnamed: 0,lib_name,count,percent
0,onetrust,4380,88.022508
1,cookiebot,546,10.972669
2,termly,50,1.004823


In [22]:
N = 60
cmp_samples = inconsis_dist.copy()
cmp_samples['samples'] = cmp_samples['percent'].map(lambda percent: max(int(percent / 100 * N), 1))
cmp_samples

Unnamed: 0,lib_name,count,percent,samples
0,onetrust,4380,88.022508,52
1,cookiebot,546,10.972669,6
2,termly,50,1.004823,1


In [23]:
cmp_to_samples = {row['lib_name']: row['samples'] for row in cmp_samples.to_dict('records')}

In [24]:
dfs = []
for lib_name, sites in inconsis_sites.groupby('lib_name'):
    n_samples = cmp_to_samples[lib_name]
    print('Sample', lib_name, n_samples)
    dfs.append(sites.sample(n_samples, random_state=3))
sel = pd.concat(dfs).sort_values(by=['lib_name', 'site'])
sel

Sample cookiebot 6
Sample onetrust 52
Sample termly 1


Unnamed: 0,site,lib_name
3297606,interrail.eu,cookiebot
148114,kitzbueheler-alpen.com,cookiebot
3467938,leadid.com,cookiebot
1444129,medway.gov.uk,cookiebot
3796244,monsterhunter.com,cookiebot
3797797,scandichotels.com,cookiebot
2735369,academyart.edu,onetrust
17820,aspentech.com,onetrust
794013,avetta.com,onetrust
865470,axonify.com,onetrust


In [13]:
print(sel.site.to_list())

['interrail.eu', 'medway.gov.uk', 'monsterhunter.com', 'aspentech.com', 'avetta.com', 'bethpagefcu.com', 'bongino.com', 'colocrossing.com', 'contentsquare.com', 'eppendorf.com', 'fentybeauty.com', 'gameinformer.com', 'glassdoor.ie', 'grooveapp.com', 'hpinc.com', 'iguzzini.com', 'lithium.com', 'longtailvideo.com', 'makeup.com', 'nic.blue', 'nic.gold', 'nic.travel', 'psychiatryadvisor.com', 'remezcla.com', 'samaritans.org', 'snapfish.co.uk', 'southernliving.com', 'stroke.org.uk', 'uwe.ac.uk', 'eskill.com']
