In [1]:
from collections import defaultdict, Counter
from multiprocessing import Pool
from pathlib import Path
from typing import Dict
import json
import re

from tqdm import tqdm
from titlecase import titlecase
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


from consent.consistency.cookie_pref_match import cookie_pref_match
from consent.consistency.util import FIG_DIR, get_scan_dirs, get_scan_root_dir
from consent.data.pref_menu_scan.cookie_pref_reader import read_cookie_prefs_in_dirs
from consent.data.pref_menu_scan.cookie_decl_reader import read_cookie_decls_in_scans
from consent.util.default_path import get_data_dir
from ooutil.df_util import read_data_files
from ooutil.type_util import hashabledict
from ooutil.url_util import get_suffixed_domain

location = 'ie'
SCAN_DIRS = get_scan_dirs(location)
SCAN_ROOT_DIR = get_scan_root_dir(location)

output_suffix = '_0k_20k' # '0k_200k'  # '20k_100k' #'60k_80k' # '40k_60k' # # '100k_200k'; done: '20k_40k' '0k_20k'

In [2]:
cookie_decls_files = sorted(list(SCAN_ROOT_DIR.glob('cookie_prefs_*.parquet')))
print(f"Cookie declaration files: {[file.name for file in cookie_decls_files]}")
raw_cookie_decls = pd.concat([pd.read_parquet(cookie_decls_file) for cookie_decls_file in cookie_decls_files])
# cookie_decls_file = SCAN_ROOT_DIR / f'cookie_decls{output_suffix}.parquet'
# raw_cookie_decls = pd.read_parquet(cookie_decls_file)
# raw_cookie_decls = read_cookie_decls_in_scans(SCAN_DIRS)
print(f"Num cookie declarations: {len(raw_cookie_decls):,d}")
raw_cookie_decls.head()

Cookie declaration files: ['cookie_prefs_0k_20k.parquet']
Num cookie declarations: 279,350


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
0,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
1,s_xxxx,wework.com,730 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
2,ajs_anonymous_id,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
3,OptanonConsent,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
4,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True


In [3]:
# Number of always-on un-Necessary cookies.
def isUnnecessary(category: str):
    necessary_key_words = ["necessary", "require", "mandatory", "essential", "always active", "system"]
    return all(kw not in category.lower() for kw in necessary_key_words)

print(f'Possible consent modes: {raw_cookie_decls.consent_mode.unique().tolist()}')
tmp_df = raw_cookie_decls[raw_cookie_decls.category.map(isUnnecessary) & (raw_cookie_decls.consent_mode == "always active")]
print(f'Number of websites containing always-on unnecessary cookie categories: {tmp_df.site.nunique():,d}')

Possible consent modes: ['always active', 'inactive', None, 'active', 'inactive landingpage']


Number of websites containing always-on unnecessary cookie categories: 143


# Read in the consistency result

In [4]:
all_complies_files = list(SCAN_ROOT_DIR.glob('all_complies_*.parquet'))
print(sorted(f.name for f in all_complies_files))
all_complies = read_data_files(all_complies_files)
all_complies.head()
# all_complies_file = SCAN_ROOT_DIR / f'all_complies{output_suffix}.parquet'
# all_complies = pd.read_parquet(all_complies_file)
# all_complies

['all_complies_0k_20k.parquet']


100%|██████████| 1/1 [00:00<00:00, 41.26it/s]


Unnamed: 0,name,domain,path,site,comply
0,ajs_anonymous_id,.wework.com,/,wework.com,comply
1,sa-user-id-v2,www.wework.com,/,wework.com,omit
2,ajs_user_id,.wework.com,/,wework.com,comply
3,OptanonAlertBoxClosed,.www.wework.com,/,wework.com,comply
4,user,www.wework.com,/,wework.com,comply


In [5]:
all_complies.comply.value_counts()

omit         14568
incorrect    11034
comply        7916
ambiguous      124
Name: comply, dtype: int64

In [6]:
# Extract Incorrect Disclosure cookies
incorr = all_complies[all_complies.comply == 'incorrect']
incorr

Unnamed: 0,name,domain,path,site,comply
14,kndctr_AA683BC75245B3880A490D4D_AdobeOrg_cluster,.lumen.com,/,savvis.net,incorrect
19,s_ecid,.lumen.com,/,savvis.net,incorrect
20,kndctr_AA683BC75245B3880A490D4D_AdobeOrg_identity,.lumen.com,/,savvis.net,incorrect
21,kndctr_AA683BC75245B3880A490D4D_AdobeOrg_consent,.lumen.com,/,savvis.net,incorrect
23,gpv_pn,.lumen.com,/,savvis.net,incorrect
...,...,...,...,...,...
33609,_gid,.outreach.io,/,outreach.io,incorrect
33618,_ga_FM692J5MFZ,.outreach.io,/,outreach.io,incorrect
33627,_ga,.outreach.io,/,outreach.io,incorrect
33634,_gid,.pixabay.com,/,pixabay.com,incorrect


# How many cookies per category of detected violations?

In [7]:
# cookie_decls = raw_cookie_decls.copy()
# cookie_decls.groupby([])

In [8]:
# With some processing of cookie names
from consent.consistency.util import normalize_cookie_category_name
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.util'])
from consent.consistency.util import normalize_cookie_category_name

cookie_decls = raw_cookie_decls.copy()
cookie_decls['norm_name'] = cookie_decls['category'].map(normalize_cookie_category_name)
cookie_decls = cookie_decls.drop_duplicates()
print(f"Num cookie declarations: {len(cookie_decls):,d}")
cookie_decls.head()

Num cookie declarations: 255,007


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent,norm_name
0,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True,Necessary
1,s_xxxx,wework.com,730 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True,Necessary
2,ajs_anonymous_id,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True,Necessary
3,OptanonConsent,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True,Necessary
4,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True,Necessary


In [9]:
cookie_decls.groupby(['site', 'category_id']).size().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
site,category_id,Unnamed: 2_level_1
2k.com,1,32
2k.com,2,28
2k.com,3,19
2k.com,4,108
2ksports.com,1,32
...,...,...
zumper.com,C0004,82
zynga.com,C0001,12
zynga.com,C0002,3
zynga.com,C0003,3


In [10]:
for (site, cat), site_cat_decls in cookie_decls.groupby(['site', 'category_id']):
    print(site, cat, site_cat_decls)
    break

2k.com 1                                name                 domain      duration  \
0                           __cf_bm         support.2k.com        0 days   
1                          __cfduid                 2k.com       30 days   
2                          __cfruid         support.2k.com       Session   
3                         _cq_check             nba.2k.com       Session   
4                          _cq_duid                 2k.com       91 days   
5                          _cq_suid                 2k.com       Session   
6               _dc_gtm_UA-xxxxxxxx                 2k.com        0 days   
7                 ASP.NET_SessionId        research.2k.com       Session   
8              ASPSESSIONIDXXXXXXXX        research.2k.com       Session   
9                            AWSELB           merch.2k.com       Session   
10  cf_chl_cc_xxxxxxxxxxxxxxxxxxxxx         support.2k.com       Session   
11                        csrftoken        accounts.2k.com      364 days   
12 

In [11]:
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.cookie_pref_match'])
from consent.consistency.cookie_pref_match import cookie_pref_match

def check_in_set(site, acookie, cookie_pref_set, verbose=0):
    # check_url_host_match = relax_check_url_host_match # strict_check_url_host_match
    for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']].to_dict('records'):
        if verbose >= 3:
            print(f'{cookie_pref=} {acookie=}')
        elif verbose >= 2:
            if cookie_pref['name'] == acookie['name']:
                print(f'{cookie_pref=} {acookie=}')

        if cookie_pref_match(acookie, cookie_pref, site):
            return True
    return False

def does_contain_cookie(site, cookies, decls):
    return any(check_in_set(site, cookie, decls) for cookie in cookies.to_dict('records'))

# Find cookie cateogires with detected incorrect enforcement
site_incor_cookie_decls = cookie_decls[cookie_decls.site.isin(incorr.site)]
assert site_incor_cookie_decls.site.nunique() == incorr.site.nunique(), f'Some sites with detected incorr missing'
n1 = len(cookie_decls.groupby(['site', 'category_id', 'category']))
n2 = len(cookie_decls.groupby(['site', 'category_id']))
if n1 != n2: print(f'WARNING: there is some mismatch (not 1:1 mapping) between cat & cat_id: {n1=} != {n2=}')

print(f"Number of sites with incorrect-enforcement and cookie decls: {site_incor_cookie_decls.site.nunique():,d}")
incor_site_cat_decl_data = []
groups = site_incor_cookie_decls.groupby(['site', 'category'])  # no need to group by category because of 1:1 mapping btw category_name & category_id
for (site, cat), site_cat_decls in tqdm(groups, total=len(groups)):
    site_incor_cookies = incorr[incorr.site == site]
    contain_inconsis = does_contain_cookie(site, site_incor_cookies, site_cat_decls)
    if contain_inconsis:
        # print(site, cat, len(site_cat_decls))
        incor_site_cat_decl_data.append({'site': site, 'category': cat, 'ncookies': len(site_cat_decls) })
    # if len(incor_site_cat_decl_data) > 100: break # test small set

incor_site_cat_decls = pd.DataFrame(incor_site_cat_decl_data)
incor_site_cat_decls.head() # Took 34 min

Number of sites with incorrect-enforcement and cookie decls: 1,204


100%|██████████| 5084/5084 [00:33<00:00, 151.28it/s]


Unnamed: 0,site,category,ncookies
0,2k.com,Advertising Cookies,108
1,2ksports.com,Advertising Cookies,108
2,aao.org,Performance Cookies,5
3,aao.org,Targeting Cookies,16
4,acast.com,Performance Cookies,24


In [12]:
incor_site_cat_decls_file = SCAN_ROOT_DIR / f'incor_site_cat_decls{output_suffix}.parquet'
# Uncomment to overwrite.
incor_site_cat_decls.to_parquet(incor_site_cat_decls_file); print(f'Written to {incor_site_cat_decls_file}')

Written to /mnt/sdd/ducbui/projects/data2/consent/2023-11-21/incor_site_cat_decls_0k_20k.parquet


In [13]:
from ooutil.url_util import get_suffixed_domain

# Get number of trackers
site_incor_cookie_decls_with_trackers = site_incor_cookie_decls.copy()
site_incor_cookie_decls_with_trackers['tracker_domain'] = site_incor_cookie_decls_with_trackers['domain'].map(get_suffixed_domain)
groups = site_incor_cookie_decls_with_trackers.groupby(['site', 'category'])
site_cat_tracker_data = []
for (site, cat), site_cat_decls in tqdm(groups, total=len(groups)):
    ntrackers = site_cat_decls.tracker_domain.nunique()
    site_cat_tracker_data.append({'site': site, 'category': cat, 'ntrackers': ntrackers})
site_cat_trackers = pd.DataFrame(site_cat_tracker_data)
site_cat_trackers.head(8)


100%|██████████| 5084/5084 [00:00<00:00, 6275.84it/s]


Unnamed: 0,site,category,ntrackers
0,2k.com,Advertising Cookies,38
1,2k.com,Functional Cookies,6
2,2k.com,Necessary Cookies,4
3,2k.com,Performance Cookies,3
4,2ksports.com,Advertising Cookies,38
5,2ksports.com,Functional Cookies,6
6,2ksports.com,Necessary Cookies,4
7,2ksports.com,Performance Cookies,3


In [14]:
incor_site_cat_decls = incor_site_cat_decls.merge(site_cat_trackers, on=['site', 'category'], how='left')
incor_site_cat_decls

Unnamed: 0,site,category,ncookies,ntrackers
0,2k.com,Advertising Cookies,108,38
1,2ksports.com,Advertising Cookies,108,38
2,aao.org,Performance Cookies,5,1
3,aao.org,Targeting Cookies,16,6
4,acast.com,Performance Cookies,24,1
...,...,...,...,...
2452,zopim.com,Functional Cookies,24,9
2453,zopim.com,Targeting Cookies,48,21
2454,zscaler.com,Performance Cookies,41,12
2455,zscaler.com,Targeting Cookies,24,7


In [15]:
from consent.consistency.util import normalize_cookie_category_name
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.util'])
from consent.consistency.util import normalize_cookie_category_name

incor_site_cat_decls['norm_cat_name'] = incor_site_cat_decls['category'].map(normalize_cookie_category_name)
incor_site_cat_decls

Unnamed: 0,site,category,ncookies,ntrackers,norm_cat_name
0,2k.com,Advertising Cookies,108,38,Advertising
1,2ksports.com,Advertising Cookies,108,38,Advertising
2,aao.org,Performance Cookies,5,1,Performance
3,aao.org,Targeting Cookies,16,6,Advertising
4,acast.com,Performance Cookies,24,1,Performance
...,...,...,...,...,...
2452,zopim.com,Functional Cookies,24,9,Functional
2453,zopim.com,Targeting Cookies,48,21,Advertising
2454,zscaler.com,Performance Cookies,41,12,Performance
2455,zscaler.com,Targeting Cookies,24,7,Advertising


In [16]:
# Manual checking how this site contains that many cookies.
incor_site_cat_decls.sort_values(by=['ncookies']).iloc[-1]

site                   paylocity.com
category         Performance Cookies
ncookies                        1474
ntrackers                         25
norm_cat_name            Performance
Name: 1570, dtype: object

In [17]:
cookie_decls[cookie_decls.site == 'deere.com'].groupby('category_id').size()
cookie_decls[cookie_decls.site == 'deere.com'].name.nunique()

370

In [18]:
stats_cookies = incor_site_cat_decls.groupby(['norm_cat_name']).ncookies.describe()
s_cookies = stats_cookies.sort_values(by='count', ascending=False)[:4]

In [19]:
stats_trackers = incor_site_cat_decls.groupby(['norm_cat_name']).ntrackers.describe()
s_trackers = stats_trackers.sort_values(by='count', ascending=False)[:4]

In [20]:
# Overall
o_cookies = incor_site_cat_decls.ncookies.describe()
o_trackers = incor_site_cat_decls.ntrackers.describe()
o_cookies


count    2457.000000
mean       57.051282
std        95.458429
min         1.000000
25%        14.000000
50%        30.000000
75%        63.000000
max      1474.000000
Name: ncookies, dtype: float64

In [21]:
def print_stats(stats, overall_row):
    stats = stats.append(overall_row.rename('Overall'))[['mean', 'std', 'min', '25%', '75%', 'max']].reset_index()
    for col in ['min', 'max', '25%', '75%']:
        if col in stats:
            stats[col] = stats[col].astype(int)
    stats = stats.rename(columns={'mean': 'Mean', 'std': 'Std.', 'min': 'Min', 'max': 'Max', 'norm_cat_name': 'Category'})
        # split lines into a list
    latex = stats.to_latex(float_format="%.2f", index=False)
    latex_list = latex.splitlines()
    # insert a `\midrule` at third last position in list (which will be the fourth last line in latex output)
    latex_list.insert(len(latex_list)-3, '\midrule')
    # join split lines to get the modified latex output string
    print('\n'.join(latex_list))

print("Cookie stats")
print_stats(s_cookies, o_cookies)
print("Trackers stats")
print_stats(s_trackers, o_trackers)

Cookie stats
\begin{tabular}{lrrrrrr}
\toprule
   Category &  Mean &   Std. &  Min &  25\% &  75\% &  Max \\
\midrule
Advertising & 96.40 & 129.84 &    1 &   30 &  113 & 1379 \\
Performance & 36.29 &  69.58 &    2 &   11 &   39 & 1474 \\
 Functional & 34.59 &  53.05 &    1 &    9 &   39 &  724 \\
  Analytics & 29.23 &  28.95 &    3 &   13 &   38 &  208 \\
\midrule
    Overall & 57.05 &  95.46 &    1 &   14 &   63 & 1474 \\
\bottomrule
\end{tabular}
Trackers stats
\begin{tabular}{lrrrrrr}
\toprule
   Category &  Mean &  Std. &  Min &  25\% &  75\% &  Max \\
\midrule
Advertising & 29.40 & 29.59 &    1 &   10 &   38 &  272 \\
Performance &  5.32 &  7.64 &    1 &    2 &    6 &  110 \\
 Functional &  7.43 &  8.51 &    1 &    2 &    9 &   86 \\
  Analytics &  6.11 &  6.28 &    1 &    2 &    7 &   30 \\
\midrule
    Overall & 14.63 & 21.76 &    1 &    3 &   16 &  272 \\
\bottomrule
\end{tabular}


  stats = stats.append(overall_row.rename('Overall'))[['mean', 'std', 'min', '25%', '75%', 'max']].reset_index()
  latex = stats.to_latex(float_format="%.2f", index=False)
  stats = stats.append(overall_row.rename('Overall'))[['mean', 'std', 'min', '25%', '75%', 'max']].reset_index()
  latex = stats.to_latex(float_format="%.2f", index=False)


In [22]:
editable_decls = cookie_decls[((cookie_decls.lib_name == 'onetrust') & (cookie_decls.consent_mode != 'always active')) | ((cookie_decls.lib_name == 'cookiebot') & ~(cookie_decls.category.isin(['necessary', 'unclassified'])))]
editable_decls

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent,norm_name
12,_hjFirstSeen,wework.com,0 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False,Performance
13,_uetvid,wework.com,17 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False,Performance
14,_ga_xxxxxxx,wework.com,729 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False,Performance
15,_gid,wework.com,1 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False,Performance
16,_gat_UA-,wework.com,0 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False,Performance
...,...,...,...,...,...,...,...,...,...,...,...
40,player,vimeo.com,364 days,C0003,Functional Cookies,inactive,pixabay.com,onetrust,en.json,False,Functional
41,vuid,vimeo.com,729 days,C0003,Functional Cookies,inactive,pixabay.com,onetrust,en.json,False,Functional
42,_GRECAPTCHA,www.recaptcha.net,179 days,C0003,Functional Cookies,inactive,pixabay.com,onetrust,en.json,False,Functional
43,IDE,doubleclick.net,729 days,C0004,Targeting Cookies,inactive,pixabay.com,onetrust,en.json,False,Advertising


In [23]:
editable_decls

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent,norm_name
12,_hjFirstSeen,wework.com,0 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False,Performance
13,_uetvid,wework.com,17 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False,Performance
14,_ga_xxxxxxx,wework.com,729 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False,Performance
15,_gid,wework.com,1 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False,Performance
16,_gat_UA-,wework.com,0 days,C0002,Performance Cookies,inactive,wework.com,onetrust,en.json,False,Performance
...,...,...,...,...,...,...,...,...,...,...,...
40,player,vimeo.com,364 days,C0003,Functional Cookies,inactive,pixabay.com,onetrust,en.json,False,Functional
41,vuid,vimeo.com,729 days,C0003,Functional Cookies,inactive,pixabay.com,onetrust,en.json,False,Functional
42,_GRECAPTCHA,www.recaptcha.net,179 days,C0003,Functional Cookies,inactive,pixabay.com,onetrust,en.json,False,Functional
43,IDE,doubleclick.net,729 days,C0004,Targeting Cookies,inactive,pixabay.com,onetrust,en.json,False,Advertising


In [24]:
# Double check
assert all(editable_decls.consent_mode.unique() == ['inactive', 'active', 'leveloptin', 'inactive landingpage','inlineoptin'])
editable_decls[editable_decls.lib_name == 'cookiebot'].category.unique()

  assert all(editable_decls.consent_mode.unique() == ['inactive', 'active', 'leveloptin', 'inactive landingpage','inlineoptin'])


TypeError: 'bool' object is not iterable

# Which consent modes are there for each library?

In [None]:
cookie_decls.lib_name.value_counts()

onetrust     175904
cookiebot     20818
Name: lib_name, dtype: int64

In [None]:
cookie_decls[cookie_decls.lib_name == 'onetrust'].consent_mode.value_counts()

active                  78896
always active           47886
inactive                43397
inactive landingpage     5725
Name: consent_mode, dtype: int64

In [None]:
cookie_decls[cookie_decls.lib_name == 'cookiebot'].consent_mode.value_counts()

leveloptin     12222
inlineoptin     8596
Name: consent_mode, dtype: int64

In [None]:

cookie_decls[cookie_decls.lib_name == 'cookiebot'].category.value_counts()

marketing       8784
statistics      4546
necessary       3958
unclassified    2466
preferences     1064
Name: category, dtype: int64

In [None]:
cookie_decls[(cookie_decls.lib_name == 'onetrust') & (cookie_decls.consent_mode == 'always active')].category.value_counts()

Strictly Necessary Cookies          21511
Functional Cookies                   7880
Targeting Cookies                    4721
Performance Cookies                  2444
Strictly Necessary                   1965
                                    ...  
Audience insights                       1
Facebook Connect                        1
Storage and Access - General            1
Social Media Cookies (CCPA Sale)        1
Functional Cookie                       1
Name: category, Length: 100, dtype: int64