In [3]:
from collections import defaultdict, Counter
from multiprocessing import Pool
from pathlib import Path
from typing import Dict
import json
import re

from tqdm import tqdm
from titlecase import titlecase
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


from consent.consistency.cookie_pref_match import cookie_pref_match
from consent.consistency.util import FIG_DIR, get_scan_dirs, get_scan_root_dir
from consent.data.pref_menu_scan.cookie_pref_reader import read_cookie_prefs_in_dirs
from consent.data.pref_menu_scan.cookie_decl_reader import read_cookie_decls_in_scans
from consent.util.default_path import get_data_dir
from ooutil.type_util import hashabledict
from ooutil.url_util import get_suffixed_domain

SCAN_DIRS = get_scan_dirs('eu')
SCAN_ROOT_DIR = get_scan_root_dir('eu')

output_suffix = '_' + '0k_200k'  # '20k_100k' #'60k_80k' # '40k_60k' # # '100k_200k'; done: '20k_40k' '0k_20k' 

In [4]:
cookie_decls_file = SCAN_ROOT_DIR / f'cookie_decls{output_suffix}.parquet'
raw_cookie_decls = pd.read_parquet(cookie_decls_file)
# raw_cookie_decls = read_cookie_decls_in_scans(SCAN_DIRS)
print(f"Num cookie declarations: {len(raw_cookie_decls):,d}")
raw_cookie_decls.head()

Num cookie declarations: 827,875


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name
0,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json
1,OptanonAlertBoxClosed,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json
2,OptanonConsent,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json
3,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json
4,we_referring_domain,www.wework.com,14 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json


# Read in the consistency result

In [5]:
all_complies_file = SCAN_ROOT_DIR / f'all_complies{output_suffix}.parquet'
all_complies = pd.read_parquet(all_complies_file)
all_complies

Unnamed: 0,name,domain,path,site,comply
0,OptanonAlertBoxClosed,.www.wework.com,/,wework.com,comply
1,ajs_user_id,.wework.com,/,wework.com,incorrect
2,we_referring_domain,www.wework.com,/,wework.com,comply
3,OptanonConsent,.www.wework.com,/,wework.com,comply
4,__we_bucket_id,www.wework.com,/,wework.com,comply
...,...,...,...,...,...
124825,dateIsUS,www.advent.com,/,advent.com,omit
124826,__cf_bm,.info.advent.com,/,advent.com,comply
124827,BIGipServerab17web-nginx-app_https,info.advent.com,/,advent.com,comply
124828,IDE,.doubleclick.net,/,advent.com,incorrect


In [6]:
all_complies.comply.value_counts()

omit         52804
incorrect    43697
comply       27931
ambiguous      398
Name: comply, dtype: int64

In [7]:
# Extract Incorrect Disclosure cookies
incorr = all_complies[all_complies.comply == 'incorrect']
incorr

Unnamed: 0,name,domain,path,site,comply
1,ajs_user_id,.wework.com,/,wework.com,incorrect
5,ajs_anonymous_id,.wework.com,/,wework.com,incorrect
14,_ga,.southernliving.com,/,southernliving.com,incorrect
16,muuid_date,www.southernliving.com,/,southernliving.com,incorrect
17,request_id,www.southernliving.com,/,southernliving.com,incorrect
...,...,...,...,...,...
124807,c,c212.net,/c,spinmasterstudios.com,incorrect
124816,_mkto_trk,.advent.com,/,advent.com,incorrect
124821,_ga,.advent.com,/,advent.com,incorrect
124828,IDE,.doubleclick.net,/,advent.com,incorrect


# How many cookies per category of detected violations?

In [8]:
# cookie_decls = raw_cookie_decls.copy()
# cookie_decls.groupby([])

In [9]:
# With some processing of cookie names
from consent.consistency.util import normalize_cookie_category_name
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.util'])
from consent.consistency.util import normalize_cookie_category_name

cookie_decls = raw_cookie_decls.copy()
cookie_decls['norm_name'] = cookie_decls['category'].map(normalize_cookie_category_name)
cookie_decls = cookie_decls.drop_duplicates()
print(f"Num cookie declarations: {len(cookie_decls):,d}")
cookie_decls.head()

Num cookie declarations: 741,690


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,norm_name
0,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,Necessary
1,OptanonAlertBoxClosed,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,Necessary
2,OptanonConsent,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,Necessary
3,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,Necessary
4,we_referring_domain,www.wework.com,14 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,Necessary


In [10]:
cookie_decls.groupby(['site', 'category_id']).size().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
site,category_id,Unnamed: 2_level_1
100percentpure.com,C0001,14
100percentpure.com,C0002,14
100percentpure.com,C0003,533
100percentpure.com,C0004,35
101blockchains.com,advertising,18
...,...,...
zyte.com,C0004,6
zzounds.com,1,14
zzounds.com,2,11
zzounds.com,3,7


In [11]:
for (site, cat), site_cat_decls in cookie_decls.groupby(['site', 'category_id']):
    print(site, cat, site_cat_decls)
    break

100percentpure.com C0001                       name                      domain   duration category_id  \
0           OptanonConsent         .100percentpure.com   365 days       C0001   
1                 cart_sig      www.100percentpure.com    14 days       C0001   
2      secure_customer_sig      wca.100percentpure.com  7305 days       C0001   
3           _orig_referrer      wca.100percentpure.com    14 days       C0001   
4                 __cfduid  support.100percentpure.com    30 days       C0001   
5            discount_code      www.100percentpure.com    Session       C0001   
6            cart_currency      www.100percentpure.com    14 days       C0001   
7                  cart_ts      www.100percentpure.com    14 days       C0001   
8    OptanonAlertBoxClosed         .100percentpure.com   365 days       C0001   
9                     cart      www.100percentpure.com    14 days       C0001   
10        _shopify_country      www.100percentpure.com    Session       C0001   
11 

In [12]:
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.cookie_pref_match'])
from consent.consistency.cookie_pref_match import cookie_pref_match

def check_in_set(site, acookie, cookie_pref_set, verbose=0):
    # check_url_host_match = relax_check_url_host_match # strict_check_url_host_match
    for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']].to_dict('records'):
        if verbose >= 3:
            print(f'{cookie_pref=} {acookie=}')
        elif verbose >= 2:
            if cookie_pref['name'] == acookie['name']:
                print(f'{cookie_pref=} {acookie=}')
        
        if cookie_pref_match(acookie, cookie_pref, site):
            return True
    return False

def does_contain_cookie(site, cookies, decls):
    return any(check_in_set(site, cookie, decls) for cookie in cookies.to_dict('records'))

# Find cookie cateogires with detected incorrect enforcement
site_incor_cookie_decls = cookie_decls[cookie_decls.site.isin(incorr.site)]
assert site_incor_cookie_decls.site.nunique() == incorr.site.nunique(), f'Some sites with detected incorr missing'
assert len(cookie_decls.groupby(['site', 'category_id', 'category'])) == len(cookie_decls.groupby(['site', 'category_id'])), 'there is some mismatch (not 1:1 mapping) between cat & cat_id'

print(f"Number of sites with incorrect-enforcement and cookie decls: {site_incor_cookie_decls.site.nunique():,d}")
incor_site_cat_decl_data = []
groups = site_incor_cookie_decls.groupby(['site', 'category'])  # no need to group by category because of 1:1 mapping btw category_name & category_id
for (site, cat), site_cat_decls in tqdm(groups, total=len(groups)):
    site_incor_cookies = incorr[incorr.site == site]
    contain_inconsis = does_contain_cookie(site, site_incor_cookies, site_cat_decls)
    if contain_inconsis:
        # print(site, cat, len(site_cat_decls))
        incor_site_cat_decl_data.append({'site': site, 'category': cat, 'ncookies': len(site_cat_decls) })
    # if len(incor_site_cat_decl_data) > 100: break # test small set

incor_site_cat_decls = pd.DataFrame(incor_site_cat_decl_data)
incor_site_cat_decls.head() # Took 34 min

Number of sites with incorrect-enforcement and cookie decls: 4,973


100%|██████████| 20605/20605 [02:40<00:00, 128.52it/s]


Unnamed: 0,site,category,ncookies
0,100percentpure.com,Functional Cookies,533
1,100percentpure.com,Performance Cookies,14
2,100percentpure.com,Targeting Cookies,35
3,101blockchains.com,advertising,18
4,101blockchains.com,analytics,15


In [14]:
incor_site_cat_decls_file = SCAN_ROOT_DIR / f'incor_site_cat_decls{output_suffix}.parquet'
# Uncomment to overwrite.
incor_site_cat_decls.to_parquet(incor_site_cat_decls_file); print(f'Written to {incor_site_cat_decls_file}')

Written to /mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-05-30/incor_site_cat_decls_0k_200k.parquet


In [15]:
from ooutil.url_util import get_suffixed_domain

# Get number of trackers
site_incor_cookie_decls_with_trackers = site_incor_cookie_decls.copy()
site_incor_cookie_decls_with_trackers['tracker_domain'] = site_incor_cookie_decls_with_trackers['domain'].map(get_suffixed_domain)
groups = site_incor_cookie_decls_with_trackers.groupby(['site', 'category'])
site_cat_tracker_data = []
for (site, cat), site_cat_decls in tqdm(groups, total=len(groups)):
    ntrackers = site_cat_decls.tracker_domain.nunique()
    site_cat_tracker_data.append({'site': site, 'category': cat, 'ntrackers': ntrackers})
site_cat_trackers = pd.DataFrame(site_cat_tracker_data)
site_cat_trackers.head(8)


100%|██████████| 20605/20605 [00:03<00:00, 6241.44it/s]


Unnamed: 0,site,category,ntrackers
0,100percentpure.com,Functional Cookies,24
1,100percentpure.com,Performance Cookies,3
2,100percentpure.com,Strictly Necessary Cookies,3
3,100percentpure.com,Targeting Cookies,15
4,101blockchains.com,advertising,6
5,101blockchains.com,analytics,3
6,101blockchains.com,essential,5
7,101blockchains.com,performance,7


In [16]:
incor_site_cat_decls = incor_site_cat_decls.merge(site_cat_trackers, on=['site', 'category'], how='left')
incor_site_cat_decls 

Unnamed: 0,site,category,ncookies,ntrackers
0,100percentpure.com,Functional Cookies,533,24
1,100percentpure.com,Performance Cookies,14,3
2,100percentpure.com,Targeting Cookies,35,15
3,101blockchains.com,advertising,18,6
4,101blockchains.com,analytics,15,3
...,...,...,...,...
9762,zyte.com,Functional Cookies,9,6
9763,zyte.com,Performance Cookies,24,3
9764,zyte.com,Targeting Cookies,6,4
9765,zzounds.com,Performance Cookies,11,2


In [17]:
from consent.consistency.util import normalize_cookie_category_name
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.util'])
from consent.consistency.util import normalize_cookie_category_name

incor_site_cat_decls['norm_cat_name'] = incor_site_cat_decls['category'].map(normalize_cookie_category_name)
incor_site_cat_decls

Unnamed: 0,site,category,ncookies,ntrackers,norm_cat_name
0,100percentpure.com,Functional Cookies,533,24,Functional
1,100percentpure.com,Performance Cookies,14,3,Performance
2,100percentpure.com,Targeting Cookies,35,15,Advertising
3,101blockchains.com,advertising,18,6,Advertising
4,101blockchains.com,analytics,15,3,Analytics
...,...,...,...,...,...
9762,zyte.com,Functional Cookies,9,6,Functional
9763,zyte.com,Performance Cookies,24,3,Performance
9764,zyte.com,Targeting Cookies,6,4,Advertising
9765,zzounds.com,Performance Cookies,11,2,Performance


In [18]:
# Manual checking how this site contains that many cookies.
incor_site_cat_decls.sort_values(by=['ncookies']).iloc[-1]

site                chamberlain.edu
category         Functional Cookies
ncookies                       2409
ntrackers                        17
norm_cat_name            Functional
Name: 1645, dtype: object

In [19]:
cookie_decls[cookie_decls.site == 'deere.com'].groupby('category_id').size()
cookie_decls[cookie_decls.site == 'deere.com'].name.nunique()

103

In [20]:
stats_cookies = incor_site_cat_decls.groupby(['norm_cat_name']).ncookies.describe()
s_cookies = stats_cookies.sort_values(by='count', ascending=False)[:4]

In [21]:
stats_trackers = incor_site_cat_decls.groupby(['norm_cat_name']).ntrackers.describe()
s_trackers = stats_trackers.sort_values(by='count', ascending=False)[:4]

In [22]:
# Overall
o_cookies = incor_site_cat_decls.ncookies.describe()
o_trackers = incor_site_cat_decls.ntrackers.describe()
o_cookies


count    9767.000000
mean       36.710454
std        64.287804
min         1.000000
25%         9.000000
50%        19.000000
75%        39.000000
max      2409.000000
Name: ncookies, dtype: float64

In [23]:
def print_stats(stats, overall_row):
    stats = stats.append(overall_row.rename('Overall'))[['mean', 'std', 'min', '25%', '75%', 'max']].reset_index()
    for col in ['min', 'max', '25%', '75%']:
        if col in stats:
            stats[col] = stats[col].astype(int)
    stats = stats.rename(columns={'mean': 'Mean', 'std': 'Std.', 'min': 'Min', 'max': 'Max', 'norm_cat_name': 'Category'})
        # split lines into a list
    latex = stats.to_latex(float_format="%.2f", index=False)
    latex_list = latex.splitlines()
    # insert a `\midrule` at third last position in list (which will be the fourth last line in latex output)
    latex_list.insert(len(latex_list)-3, '\midrule')
    # join split lines to get the modified latex output string
    print('\n'.join(latex_list))

print("Cookie stats")
print_stats(s_cookies, o_cookies)
print("Trackers stats")
print_stats(s_trackers, o_trackers)

Cookie stats
\begin{tabular}{lrrrrrr}
\toprule
   Category &  Mean &  Std. &  Min &  25\% &  75\% &  Max \\
\midrule
Advertising & 60.18 & 80.69 &    1 &   19 &   73 & 1278 \\
Performance & 22.42 & 37.04 &    1 &    8 &   25 &  987 \\
 Functional & 23.83 & 67.23 &    1 &    6 &   24 & 2409 \\
 Statistics & 41.96 & 39.57 &    1 &   18 &   50 &  240 \\
\midrule
    Overall & 36.71 & 64.29 &    1 &    9 &   39 & 2409 \\
\bottomrule
\end{tabular}
Trackers stats
\begin{tabular}{lrrrrrr}
\toprule
   Category &  Mean &  Std. &  Min &  25\% &  75\% &  Max \\
\midrule
Advertising & 19.49 & 19.77 &    1 &    7 &   25 &  160 \\
Performance &  3.99 &  6.02 &    1 &    2 &    5 &  102 \\
 Functional &  5.35 &  5.04 &    1 &    2 &    7 &   39 \\
 Statistics &  8.00 &  5.23 &    1 &    4 &   11 &   50 \\
\midrule
    Overall &  9.90 & 14.46 &    1 &    2 &   11 &  160 \\
\bottomrule
\end{tabular}


  stats = stats.append(overall_row.rename('Overall'))[['mean', 'std', 'min', '25%', '75%', 'max']].reset_index()
  latex = stats.to_latex(float_format="%.2f", index=False)
  stats = stats.append(overall_row.rename('Overall'))[['mean', 'std', 'min', '25%', '75%', 'max']].reset_index()
  latex = stats.to_latex(float_format="%.2f", index=False)


In [24]:
incor_describe.columns = ['_'.join(col) for col in incor_describe.columns]
incor_describe.reset_index(inplace=True)
incor_describe

NameError: name 'incor_describe' is not defined

In [None]:
incor_describe.sort_values(by=['ncookies_count'], ascending=False)

Unnamed: 0,norm_cat_name,ncookies_count,ncookies_mean,ncookies_std,ncookies_min,ncookies_25%,ncookies_50%,ncookies_75%,ncookies_max,ntracker_count,ntracker_mean,ntracker_std,ntracker_min,ntracker_25%,ntracker_50%,ntracker_75%,ntracker_max
2,Advertising,807.0,79.239157,113.707980,1.0,24.0,50.0,101.0,2129.0,807.0,25.567534,23.574076,1.0,9.0,17.0,37.5,149.0
42,Performance,711.0,28.827004,35.870576,1.0,10.0,19.0,33.0,451.0,711.0,4.883263,6.818090,1.0,2.0,3.0,6.0,102.0
27,Functional,402.0,27.248756,39.112955,1.0,7.0,16.0,29.0,325.0,402.0,5.952736,5.281621,1.0,2.0,4.0,8.0,27.0
35,Marketing,115.0,68.669565,53.263121,4.0,32.5,57.0,88.0,290.0,115.0,17.939130,14.888054,1.0,10.0,15.0,19.5,83.0
66,Statistics,87.0,37.068966,29.773093,4.0,16.0,28.0,46.5,120.0,87.0,7.678161,5.001149,1.0,4.0,7.0,9.5,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,Google Analytics,1.0,5.000000,,5.0,5.0,5.0,5.0,5.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
34,Hotjar,1.0,6.000000,,6.0,6.0,6.0,6.0,6.0,1.0,2.000000,,2.0,2.0,2.0,2.0,2.0
1,Advertisement and Ad Ment,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
37,Marketing Category,1.0,7.000000,,7.0,7.0,7.0,7.0,7.0,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0


In [None]:
editable_decls = cookie_decls[((cookie_decls.lib_name == 'onetrust') & (cookie_decls.consent_mode != 'always active')) | ((cookie_decls.lib_name == 'cookiebot') & ~(cookie_decls.category.isin(['necessary', 'unclassified'])))]
editable_decls

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,norm_name
0,s-9da4,go.helpshift.com,7 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,Performance
1,_gid,helpshift.com,1 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,Performance
2,blog_glossary_page,helpshift.com,7 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,Performance
3,__utma,helpshift.com,730 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,Performance
4,_ga,helpshift.com,730 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,Performance
...,...,...,...,...,...,...,...,...,...,...
138,sp.pl,sp.analytics.yahoo.com,Session,marketing,marketing,inlineoptin,vespa.com,cookiebot,cc.js,Marketing
139,_testrideadv,vespa.com,2 years,marketing,marketing,inlineoptin,vespa.com,cookiebot,cc.js,Marketing
140,_testrideadv,www.google-analytics.com,2 years,marketing,marketing,inlineoptin,vespa.com,cookiebot,cc.js,Marketing
141,_testrideadv_gid,vespa.com,1 day,marketing,marketing,inlineoptin,vespa.com,cookiebot,cc.js,Marketing


In [None]:
editable_decls

In [None]:
# Double check
assert all(editable_decls.consent_mode.unique() == ['inactive', 'active', 'leveloptin', 'inactive landingpage','inlineoptin'])
editable_decls[editable_decls.lib_name == 'cookiebot'].category.unique()

array(['preferences', 'statistics', 'marketing'], dtype=object)

# Which consent modes are there for each library?

In [None]:
cookie_decls.lib_name.value_counts()

onetrust     175904
cookiebot     20818
Name: lib_name, dtype: int64

In [None]:
cookie_decls[cookie_decls.lib_name == 'onetrust'].consent_mode.value_counts()

active                  78896
always active           47886
inactive                43397
inactive landingpage     5725
Name: consent_mode, dtype: int64

In [None]:
cookie_decls[cookie_decls.lib_name == 'cookiebot'].consent_mode.value_counts()

leveloptin     12222
inlineoptin     8596
Name: consent_mode, dtype: int64

In [None]:

cookie_decls[cookie_decls.lib_name == 'cookiebot'].category.value_counts()

marketing       8784
statistics      4546
necessary       3958
unclassified    2466
preferences     1064
Name: category, dtype: int64

In [None]:
cookie_decls[(cookie_decls.lib_name == 'onetrust') & (cookie_decls.consent_mode == 'always active')].category.value_counts()

Strictly Necessary Cookies          21511
Functional Cookies                   7880
Targeting Cookies                    4721
Performance Cookies                  2444
Strictly Necessary                   1965
                                    ...  
Audience insights                       1
Facebook Connect                        1
Storage and Access - General            1
Social Media Cookies (CCPA Sale)        1
Functional Cookie                       1
Name: category, Length: 100, dtype: int64