In [1]:
from collections import defaultdict, Counter
from multiprocessing import Pool
from pathlib import Path
from typing import Dict
import json
import re

from tqdm import tqdm
from titlecase import titlecase
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


from consent.consistency.cookie_pref_match import cookie_pref_match
from consent.consistency.util import FIG_DIR, get_scan_dirs, get_scan_root_dir
from consent.data.pref_menu_scan.cookie_pref_reader import read_cookie_prefs_in_dirs
from consent.data.pref_menu_scan.cookie_decl_reader import read_cookie_decls_in_scans
from consent.util.default_path import get_data_dir
from ooutil.df_util import read_data_files
from ooutil.type_util import hashabledict
from ooutil.url_util import get_suffixed_domain

location = 'de'
SCAN_DIRS = get_scan_dirs(location)
SCAN_ROOT_DIR = get_scan_root_dir(location)

output_suffix = '_0k_100k' # '0k_200k'  # '20k_100k' #'60k_80k' # '40k_60k' # # '100k_200k'; done: '20k_40k' '0k_20k' 

In [2]:
cookie_decls_files = sorted(list(SCAN_ROOT_DIR.glob('cookie_prefs_*.parquet')))
print(f"Cookie declaration files: {[file.name for file in cookie_decls_files]}")
raw_cookie_decls = pd.concat([pd.read_parquet(cookie_decls_file) for cookie_decls_file in cookie_decls_files])
# cookie_decls_file = SCAN_ROOT_DIR / f'cookie_decls{output_suffix}.parquet'
# raw_cookie_decls = pd.read_parquet(cookie_decls_file)
# raw_cookie_decls = read_cookie_decls_in_scans(SCAN_DIRS)
print(f"Num cookie declarations: {len(raw_cookie_decls):,d}")
raw_cookie_decls.head()

Cookie declaration files: ['cookie_prefs_0k_20k.parquet', 'cookie_prefs_20k_100k.parquet']
Num cookie declarations: 761,943


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
0,user,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
1,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
2,ajs_anonymous_id,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
3,_gclxxxx,wework.com,90 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
4,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True


In [22]:
# Number of always-on un-Necessary cookies. 
def isUnnecessary(category: str):
    necessary_key_words = ["necessary", "require", "mandatory", "essential", "always active", "system"]
    return all(kw not in category.lower() for kw in necessary_key_words)

print(f'Possible consent modes: {raw_cookie_decls.consent_mode.unique().tolist()}')
tmp_df = raw_cookie_decls[raw_cookie_decls.category.map(isUnnecessary) & (raw_cookie_decls.consent_mode == "always active")]
print(f'Number of websites containing always-on unnecessary cookie categories: {tmp_df.site.nunique():,d}')

Possible consent modes: ['always active', 'inactive', None, 'active', 'inactive landingpage', 'do not track']
Number of websites containing always-on unnecessary cookie categories: 467


# Read in the consistency result

In [3]:
all_complies_files = list(SCAN_ROOT_DIR.glob('all_complies_*.parquet'))
print(sorted(f.name for f in all_complies_files))
all_complies = read_data_files(all_complies_files)
all_complies.head()
# all_complies_file = SCAN_ROOT_DIR / f'all_complies{output_suffix}.parquet'
# all_complies = pd.read_parquet(all_complies_file)
# all_complies

['all_complies_0k_20k.parquet', 'all_complies_20k_100k.parquet']


100%|██████████| 2/2 [00:00<00:00, 26.31it/s]


Unnamed: 0,name,domain,path,site,comply
0,JSESSIONID,.accor.com,/,ibis.com,comply
1,_Hw2h_,.accor.com,/,ibis.com,comply
2,xtvrn,.accor.com,/,ibis.com,incorrect
3,userLocalizationInitial,.accor.com,/,ibis.com,incorrect
4,kameleoonVisitorCode,.accor.com,/,ibis.com,omit


In [4]:
all_complies.comply.value_counts()

omit         42917
incorrect    34604
comply       23987
ambiguous      350
Name: comply, dtype: int64

In [5]:
# Extract Incorrect Disclosure cookies
incorr = all_complies[all_complies.comply == 'incorrect']
incorr

Unnamed: 0,name,domain,path,site,comply
2,xtvrn,.accor.com,/,ibis.com,incorrect
3,userLocalizationInitial,.accor.com,/,ibis.com,incorrect
6,dtLatC,.accor.com,/,ibis.com,incorrect
8,xtant577671,.accor.com,/,ibis.com,incorrect
9,rxVisitor,.accor.com,/,ibis.com,incorrect
...,...,...,...,...,...
101849,CMPRO,.casalemedia.com,/,outreach.io,incorrect
101850,_ga,.paymentus.com,/,paymentus.com,incorrect
101853,_gid,.paymentus.com,/,paymentus.com,incorrect
101856,YSC,.youtube.com,/,paymentus.com,incorrect


# How many cookies per category of detected violations?

In [6]:
# cookie_decls = raw_cookie_decls.copy()
# cookie_decls.groupby([])

In [7]:
# With some processing of cookie names
from consent.consistency.util import normalize_cookie_category_name
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.util'])
from consent.consistency.util import normalize_cookie_category_name

cookie_decls = raw_cookie_decls.copy()
cookie_decls['norm_name'] = cookie_decls['category'].map(normalize_cookie_category_name)
cookie_decls = cookie_decls.drop_duplicates()
print(f"Num cookie declarations: {len(cookie_decls):,d}")
cookie_decls.head()

Num cookie declarations: 678,040


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent,norm_name
0,user,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True,Necessary
1,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True,Necessary
2,ajs_anonymous_id,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True,Necessary
3,_gclxxxx,wework.com,90 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True,Necessary
4,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True,Necessary


In [8]:
cookie_decls.groupby(['site', 'category_id']).size().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
site,category_id,Unnamed: 2_level_1
100percentpure.com,C0001,14
100percentpure.com,C0002,14
100percentpure.com,C0003,533
100percentpure.com,C0004,35
101greatgoals.com,C0001,2
...,...,...
zyngapoker.com,C0004,1
zzounds.com,1,12
zzounds.com,2,11
zzounds.com,3,13


In [9]:
for (site, cat), site_cat_decls in cookie_decls.groupby(['site', 'category_id']):
    print(site, cat, site_cat_decls)
    break

100percentpure.com C0001                       name                      domain   duration category_id  \
0           OptanonConsent         .100percentpure.com   365 days       C0001   
1                 cart_sig      www.100percentpure.com    14 days       C0001   
2      secure_customer_sig      wca.100percentpure.com  7305 days       C0001   
3           _orig_referrer      wca.100percentpure.com    14 days       C0001   
4                 __cfduid  support.100percentpure.com    30 days       C0001   
5            discount_code      www.100percentpure.com    Session       C0001   
6            cart_currency      www.100percentpure.com    14 days       C0001   
7                  cart_ts      www.100percentpure.com    14 days       C0001   
8    OptanonAlertBoxClosed         .100percentpure.com   365 days       C0001   
9                     cart      www.100percentpure.com    14 days       C0001   
10        _shopify_country      www.100percentpure.com    Session       C0001   
11 

In [17]:
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.cookie_pref_match'])
from consent.consistency.cookie_pref_match import cookie_pref_match

def check_in_set(site, acookie, cookie_pref_set, verbose=0):
    # check_url_host_match = relax_check_url_host_match # strict_check_url_host_match
    for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']].to_dict('records'):
        if verbose >= 3:
            print(f'{cookie_pref=} {acookie=}')
        elif verbose >= 2:
            if cookie_pref['name'] == acookie['name']:
                print(f'{cookie_pref=} {acookie=}')
        
        if cookie_pref_match(acookie, cookie_pref, site):
            return True
    return False

def does_contain_cookie(site, cookies, decls):
    return any(check_in_set(site, cookie, decls) for cookie in cookies.to_dict('records'))

# Find cookie cateogires with detected incorrect enforcement
site_incor_cookie_decls = cookie_decls[cookie_decls.site.isin(incorr.site)]
assert site_incor_cookie_decls.site.nunique() == incorr.site.nunique(), f'Some sites with detected incorr missing'
n1 = len(cookie_decls.groupby(['site', 'category_id', 'category']))
n2 = len(cookie_decls.groupby(['site', 'category_id']))
if n1 != n2: print(f'WARNING: there is some mismatch (not 1:1 mapping) between cat & cat_id: {n1=} != {n2=}')

print(f"Number of sites with incorrect-enforcement and cookie decls: {site_incor_cookie_decls.site.nunique():,d}")
incor_site_cat_decl_data = []
groups = site_incor_cookie_decls.groupby(['site', 'category'])  # no need to group by category because of 1:1 mapping btw category_name & category_id
for (site, cat), site_cat_decls in tqdm(groups, total=len(groups)):
    site_incor_cookies = incorr[incorr.site == site]
    contain_inconsis = does_contain_cookie(site, site_incor_cookies, site_cat_decls)
    if contain_inconsis:
        # print(site, cat, len(site_cat_decls))
        incor_site_cat_decl_data.append({'site': site, 'category': cat, 'ncookies': len(site_cat_decls) })
    # if len(incor_site_cat_decl_data) > 100: break # test small set

incor_site_cat_decls = pd.DataFrame(incor_site_cat_decl_data)
incor_site_cat_decls.head() # Took 34 min

Number of sites with incorrect-enforcement and cookie decls: 3,936


100%|██████████| 16698/16698 [02:09<00:00, 129.20it/s]


Unnamed: 0,site,category,ncookies
0,100percentpure.com,Functional Cookies,533
1,100percentpure.com,Performance Cookies,14
2,100percentpure.com,Targeting Cookies,35
3,101greatgoals.com,Functional Cookies,7
4,101greatgoals.com,Targeting&Marketing 3rd Cookies,39


In [18]:
incor_site_cat_decls_file = SCAN_ROOT_DIR / f'incor_site_cat_decls{output_suffix}.parquet'
# Uncomment to overwrite.
incor_site_cat_decls.to_parquet(incor_site_cat_decls_file); print(f'Written to {incor_site_cat_decls_file}')

Written to /mnt/sdd/ducbui/projects/data2/consent/2023-04-14/incor_site_cat_decls_0k_100k.parquet


In [19]:
from ooutil.url_util import get_suffixed_domain

# Get number of trackers
site_incor_cookie_decls_with_trackers = site_incor_cookie_decls.copy()
site_incor_cookie_decls_with_trackers['tracker_domain'] = site_incor_cookie_decls_with_trackers['domain'].map(get_suffixed_domain)
groups = site_incor_cookie_decls_with_trackers.groupby(['site', 'category'])
site_cat_tracker_data = []
for (site, cat), site_cat_decls in tqdm(groups, total=len(groups)):
    ntrackers = site_cat_decls.tracker_domain.nunique()
    site_cat_tracker_data.append({'site': site, 'category': cat, 'ntrackers': ntrackers})
site_cat_trackers = pd.DataFrame(site_cat_tracker_data)
site_cat_trackers.head(8)


100%|██████████| 16698/16698 [00:02<00:00, 6430.20it/s]


Unnamed: 0,site,category,ntrackers
0,100percentpure.com,Functional Cookies,24
1,100percentpure.com,Performance Cookies,3
2,100percentpure.com,Strictly Necessary Cookies,3
3,100percentpure.com,Targeting Cookies,15
4,101greatgoals.com,Functional Cookies,1
5,101greatgoals.com,Performance Cookies,1
6,101greatgoals.com,Strictly Necessary Cookies,1
7,101greatgoals.com,Targeting Cookies,4


In [20]:
incor_site_cat_decls = incor_site_cat_decls.merge(site_cat_trackers, on=['site', 'category'], how='left')
incor_site_cat_decls 

Unnamed: 0,site,category,ncookies,ntrackers
0,100percentpure.com,Functional Cookies,533,24
1,100percentpure.com,Performance Cookies,14,3
2,100percentpure.com,Targeting Cookies,35,15
3,101greatgoals.com,Functional Cookies,7,1
4,101greatgoals.com,Targeting&Marketing 3rd Cookies,39,14
...,...,...,...,...
7945,zsl.org,Targeting Cookies,22,7
7946,zynga.com,Performance Cookies,4,1
7947,zzounds.com,Functional Cookies,13,5
7948,zzounds.com,Performance Cookies,11,2


In [21]:
from consent.consistency.util import normalize_cookie_category_name
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.util'])
from consent.consistency.util import normalize_cookie_category_name

incor_site_cat_decls['norm_cat_name'] = incor_site_cat_decls['category'].map(normalize_cookie_category_name)
incor_site_cat_decls

Unnamed: 0,site,category,ncookies,ntrackers,norm_cat_name
0,100percentpure.com,Functional Cookies,533,24,Functional
1,100percentpure.com,Performance Cookies,14,3,Performance
2,100percentpure.com,Targeting Cookies,35,15,Advertising
3,101greatgoals.com,Functional Cookies,7,1,Functional
4,101greatgoals.com,Targeting&Marketing 3rd Cookies,39,14,Advertising
...,...,...,...,...,...
7945,zsl.org,Targeting Cookies,22,7,Advertising
7946,zynga.com,Performance Cookies,4,1,Performance
7947,zzounds.com,Functional Cookies,13,5,Functional
7948,zzounds.com,Performance Cookies,11,2,Performance


In [22]:
# Manual checking how this site contains that many cookies.
incor_site_cat_decls.sort_values(by=['ncookies']).iloc[-1]

site              roche-bobois.com
category         Targeting Cookies
ncookies                      1534
ntrackers                       34
norm_cat_name          Advertising
Name: 5848, dtype: object

In [23]:
cookie_decls[cookie_decls.site == 'deere.com'].groupby('category_id').size()
cookie_decls[cookie_decls.site == 'deere.com'].name.nunique()

0

In [24]:
stats_cookies = incor_site_cat_decls.groupby(['norm_cat_name']).ncookies.describe()
s_cookies = stats_cookies.sort_values(by='count', ascending=False)[:4]

In [25]:
stats_trackers = incor_site_cat_decls.groupby(['norm_cat_name']).ntrackers.describe()
s_trackers = stats_trackers.sort_values(by='count', ascending=False)[:4]

In [26]:
# Overall
o_cookies = incor_site_cat_decls.ncookies.describe()
o_trackers = incor_site_cat_decls.ntrackers.describe()
o_cookies


count    7950.000000
mean       41.721006
std        66.972352
min         1.000000
25%        11.000000
50%        22.000000
75%        47.000000
max      1534.000000
Name: ncookies, dtype: float64

In [27]:
def print_stats(stats, overall_row):
    stats = stats.append(overall_row.rename('Overall'))[['mean', 'std', 'min', '25%', '75%', 'max']].reset_index()
    for col in ['min', 'max', '25%', '75%']:
        if col in stats:
            stats[col] = stats[col].astype(int)
    stats = stats.rename(columns={'mean': 'Mean', 'std': 'Std.', 'min': 'Min', 'max': 'Max', 'norm_cat_name': 'Category'})
        # split lines into a list
    latex = stats.to_latex(float_format="%.2f", index=False)
    latex_list = latex.splitlines()
    # insert a `\midrule` at third last position in list (which will be the fourth last line in latex output)
    latex_list.insert(len(latex_list)-3, '\midrule')
    # join split lines to get the modified latex output string
    print('\n'.join(latex_list))

print("Cookie stats")
print_stats(s_cookies, o_cookies)
print("Trackers stats")
print_stats(s_trackers, o_trackers)

Cookie stats
\begin{tabular}{lrrrrrr}
\toprule
   Category &  Mean &  Std. &  Min &  25\% &  75\% &  Max \\
\midrule
Advertising & 67.62 & 86.34 &    1 &   22 &   83 & 1534 \\
Performance & 26.95 & 51.06 &    1 &    9 &   30 & 1459 \\
 Functional & 26.75 & 48.26 &    1 &    7 &   28 &  970 \\
 Statistics & 49.59 & 48.44 &    3 &   22 &   57 &  267 \\
\midrule
    Overall & 41.72 & 66.97 &    1 &   11 &   47 & 1534 \\
\bottomrule
\end{tabular}
Trackers stats
\begin{tabular}{lrrrrrr}
\toprule
   Category &  Mean &  Std. &  Min &  25\% &  75\% &  Max \\
\midrule
Advertising & 21.86 & 22.53 &    1 &    8 &   27 &  161 \\
Performance &  4.50 &  6.53 &    1 &    2 &    5 &  111 \\
 Functional &  6.05 &  6.13 &    1 &    2 &    7 &   69 \\
 Statistics &  9.63 &  6.43 &    1 &    5 &   13 &   44 \\
\midrule
    Overall & 11.22 & 16.42 &    1 &    3 &   12 &  161 \\
\bottomrule
\end{tabular}


  stats = stats.append(overall_row.rename('Overall'))[['mean', 'std', 'min', '25%', '75%', 'max']].reset_index()
  latex = stats.to_latex(float_format="%.2f", index=False)
  stats = stats.append(overall_row.rename('Overall'))[['mean', 'std', 'min', '25%', '75%', 'max']].reset_index()
  latex = stats.to_latex(float_format="%.2f", index=False)


In [None]:
editable_decls = cookie_decls[((cookie_decls.lib_name == 'onetrust') & (cookie_decls.consent_mode != 'always active')) | ((cookie_decls.lib_name == 'cookiebot') & ~(cookie_decls.category.isin(['necessary', 'unclassified'])))]
editable_decls

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,norm_name
0,s-9da4,go.helpshift.com,7 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,Performance
1,_gid,helpshift.com,1 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,Performance
2,blog_glossary_page,helpshift.com,7 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,Performance
3,__utma,helpshift.com,730 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,Performance
4,_ga,helpshift.com,730 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,Performance
...,...,...,...,...,...,...,...,...,...,...
138,sp.pl,sp.analytics.yahoo.com,Session,marketing,marketing,inlineoptin,vespa.com,cookiebot,cc.js,Marketing
139,_testrideadv,vespa.com,2 years,marketing,marketing,inlineoptin,vespa.com,cookiebot,cc.js,Marketing
140,_testrideadv,www.google-analytics.com,2 years,marketing,marketing,inlineoptin,vespa.com,cookiebot,cc.js,Marketing
141,_testrideadv_gid,vespa.com,1 day,marketing,marketing,inlineoptin,vespa.com,cookiebot,cc.js,Marketing


In [None]:
editable_decls

In [None]:
# Double check
assert all(editable_decls.consent_mode.unique() == ['inactive', 'active', 'leveloptin', 'inactive landingpage','inlineoptin'])
editable_decls[editable_decls.lib_name == 'cookiebot'].category.unique()

array(['preferences', 'statistics', 'marketing'], dtype=object)

# Which consent modes are there for each library?

In [None]:
cookie_decls.lib_name.value_counts()

onetrust     175904
cookiebot     20818
Name: lib_name, dtype: int64

In [None]:
cookie_decls[cookie_decls.lib_name == 'onetrust'].consent_mode.value_counts()

active                  78896
always active           47886
inactive                43397
inactive landingpage     5725
Name: consent_mode, dtype: int64

In [None]:
cookie_decls[cookie_decls.lib_name == 'cookiebot'].consent_mode.value_counts()

leveloptin     12222
inlineoptin     8596
Name: consent_mode, dtype: int64

In [None]:

cookie_decls[cookie_decls.lib_name == 'cookiebot'].category.value_counts()

marketing       8784
statistics      4546
necessary       3958
unclassified    2466
preferences     1064
Name: category, dtype: int64

In [None]:
cookie_decls[(cookie_decls.lib_name == 'onetrust') & (cookie_decls.consent_mode == 'always active')].category.value_counts()

Strictly Necessary Cookies          21511
Functional Cookies                   7880
Targeting Cookies                    4721
Performance Cookies                  2444
Strictly Necessary                   1965
                                    ...  
Audience insights                       1
Facebook Connect                        1
Storage and Access - General            1
Social Media Cookies (CCPA Sale)        1
Functional Cookie                       1
Name: category, Length: 100, dtype: int64