In [1]:
from collections import defaultdict

from tqdm import tqdm
import pandas as pd
import seaborn as sns

from consent.consistency.util import get_scan_root_dir, get_scan_dirs # , FIG_DIR uncomment to save fig.
from ooutil.df_util import read_data_files

# Let pandas display up to 100 rows before truncating a frame/series.
pd.set_option('display.max_rows', 100)

# Scan locations for the 'us' scan, resolved through the project helpers.
SCAN_DIRS = get_scan_dirs('us')
SCAN_ROOT_DIR = get_scan_root_dir('us')

In [2]:
# Collect every file matching cookie_prefs_*.parquet under the scan root and
# load them together; the sorted names are printed only as a quick inventory.
cookie_prefs_files = [*SCAN_ROOT_DIR.glob('cookie_prefs_*.parquet')]
print(sorted([prefs_file.name for prefs_file in cookie_prefs_files]))
cookie_prefs = read_data_files(cookie_prefs_files)
cookie_prefs.head()

['cookie_prefs_0k_20k.parquet', 'cookie_prefs_100k_200k.parquet', 'cookie_prefs_20k_100k.parquet']


100%|██████████| 3/3 [00:00<00:00,  6.09it/s]


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
0,_NS,secure.paybyphone.com,0 days,C0001,Strictly Necessary Cookies,always active,paybyphone.com,onetrust,en.json,True
1,ASP.NET_SessionId,secure.paybyphone.com,Session,C0001,Strictly Necessary Cookies,always active,paybyphone.com,onetrust,en.json,True
2,ClientSettings,secure.paybyphone.com,30 days,C0001,Strictly Necessary Cookies,always active,paybyphone.com,onetrust,en.json,True
3,cookie-primary-language,www.paybyphone.com,90 days,C0001,Strictly Necessary Cookies,always active,paybyphone.com,onetrust,en.json,True
4,geoip-country-code,www.paybyphone.com,90 days,C0001,Strictly Necessary Cookies,always active,paybyphone.com,onetrust,en.json,True


In [3]:
# Collect every file matching all_complies_*.parquet under the scan root and
# load them together; the sorted names are printed only as a quick inventory.
all_complies_files = [*SCAN_ROOT_DIR.glob('all_complies_*.parquet')]
print(sorted([complies_file.name for complies_file in all_complies_files]))
all_complies = read_data_files(all_complies_files)
all_complies.head()

['all_complies_0k_20k.parquet', 'all_complies_100k_200k.parquet', 'all_complies_20k_100k.parquet']


100%|██████████| 3/3 [00:00<00:00, 31.86it/s]


Unnamed: 0,name,domain,path,site,comply
0,_gid,.paybyphone.com,/,paybyphone.com,incorrect
1,_ga,.paybyphone.com,/,paybyphone.com,incorrect
2,geoip-country-code,www.paybyphone.com,/,paybyphone.com,comply
3,OptanonConsent,.www.paybyphone.com,/,paybyphone.com,comply
4,_NS,www.paybyphone.com,/,paybyphone.com,omit


In [4]:
# Headline counts. nsites (distinct sites in all_complies) is reused by later
# cells as the denominator for per-site percentages.
nsites = all_complies.site.nunique()
print(f'Number of sites with prefs: {cookie_prefs.site.nunique():,d}')
print(f"Number of sites with consistency: {nsites:,d}")
# One row of all_complies is reported here as one analyzed flow.
print(f'Number of flows analyzed: {len(all_complies):,d}')

Number of sites with prefs: 5,083
Number of sites with consistency: 5,050
Number of flows analyzed: 140,279


In [5]:
# Map each site to its lib_name from cookie_prefs, attach it to all_complies,
# then tally lib_name over one row per site.
site_lib_pairs = cookie_prefs[['site', 'lib_name']].drop_duplicates()
site_to_libname = dict(zip(site_lib_pairs['site'], site_lib_pairs['lib_name']))
all_complies['lib_name'] = all_complies['site'].map(site_to_libname)
one_row_per_site = all_complies.drop_duplicates(subset='site')
all_complies_libs = one_row_per_site['lib_name'].value_counts()
print(all_complies_libs.sum())
all_complies_libs

5050


onetrust     4373
cookiebot     659
termly         18
Name: lib_name, dtype: int64

In [6]:
# all_complies: comply flows, complies: comply cookies
complies = all_complies[['name', 'domain', 'path', 'site', 'comply']].drop_duplicates()
comply_counts = complies.comply.value_counts()
assert len(complies) == comply_counts.sum(), "Some cookie have 2 different comply values"
print(comply_counts.sum())
comply_counts

140222


omit         70547
incorrect    39446
comply       29884
ambiguous      345
Name: comply, dtype: int64

In [7]:
# Count violating sites per comply type: a site counts toward a type as soon
# as at least one of its cookies has that type.
# The per-type frame is built once from a single groupby. It used to be
# rebuilt inside the loop on every iteration and was never defined when
# `complies` had no rows. Row labels stay 0..n-1 in sorted comply-type order.
comply_sites = (
    complies.groupby('comply')['site']
    .nunique()
    .rename_axis('comply_type')
    .reset_index(name='num_sites')
    .sort_values(by=['num_sites'], ascending=False)
)

# Share of all sites with consistency results (nsites) showing each type.
comply_sites['num_sites_percent'] = comply_sites['num_sites'] / nsites * 100
comply_sites['num_cookies'] = comply_sites['comply_type'].map(comply_counts)
# Cookie shares are relative to all cookies, including the 'comply' row that
# is dropped further down.
comply_sites['num_cookies_percent'] = comply_sites['num_cookies'] / comply_sites['num_cookies'].sum() * 100  # need to read scan_*.parquet for n_br_cookies, but may be unnecessary
comply_sites['num_cookies_per_site'] = comply_sites['num_cookies'] / comply_sites['num_sites']

# Drop the 'comply' (correct enforcement) row: only violation types are reported.
comply_sites = comply_sites[comply_sites.comply_type != 'comply']
comply_sites

Unnamed: 0,comply_type,num_sites,num_sites_percent,num_cookies,num_cookies_percent,num_cookies_per_site
3,omit,4497,89.049505,70547,50.310936,15.687569
2,incorrect,4134,81.861386,39446,28.131106,9.541848
0,ambiguous,203,4.019802,345,0.246038,1.699507


In [8]:
# Count complying (no violation detected)
# A site qualifies only when 'comply' is the sole value among its cookies.
n_comply_sites = sum(
    1
    for _, site_comply in complies.groupby('site')
    if set(site_comply['comply'].unique()) == {'comply'}
)
n_comply_sites

267

In [9]:
# Column-wise totals over the comply_sites table.
# NOTE(review): the sum also covers the string column comply_type (the labels
# get concatenated) and the percent / per-site columns, whose totals are not
# meaningful on their own.
totals = comply_sites.sum()
# Total cookies over the rows present in comply_sites; since the 'comply' row
# was filtered out of comply_sites earlier, 'comply' cookies are not included.
n_cookies_analyzed = totals['num_cookies']
print(f'Num cookies analyzed: {n_cookies_analyzed:,d}')
totals

Num cookies analyzed: 110,338


comply_type             omitincorrectambiguous
num_sites                                 8834
num_sites_percent                   174.930693
num_cookies                             110338
num_cookies_percent                   78.68808
num_cookies_per_site                 26.928925
dtype: object

In [10]:
# Build the LaTeX table of violation types: cookie counts and share of sites.
latex = comply_sites[['comply_type', 'num_cookies', 'num_sites', 'num_sites_percent']].copy()
# latex['num_sites_combine'] = latex.apply(lambda row: f"{row['num_sites']} ({row['num_sites_percent']:.2f}%)", axis=1)
# Render the site column as "<percent>% (<sites with the type>/<nsites>)".
latex['num_sites_combine'] = latex.apply(lambda row: f"{row['num_sites_percent']:.2f}% ({row['num_sites']:,d}/{nsites:,d})", axis=1)
latex = latex[['comply_type', 'num_cookies', 'num_sites_combine']]
# Keep only violation types in the table.
latex = latex[latex.comply_type != 'comply']
# Human-readable column headers and violation labels.
latex = latex.rename(columns={'comply_type': 'Violation Type', 'num_sites_combine': '# Websites', 'num_cookies': '# Cookies'})
latex = latex.replace({'omit': "Omitted Preference", "incorrect": "Incorrect Enforcement", "ambiguous": "Ambiguous Enforcement", "comply": "Correct Enforcement"})
# print(latex.to_latex())
# Hide the index, LaTeX-escape headers and cells, and thousands-separate the cookie counts.
print(latex.style.hide(axis='index').format_index(escape='latex', axis=1).format({'# Cookies': '{:,d}'}, escape='latex').to_latex(hrules=True))
# print(latex_df.style.hide(axis='index').format_index(escape='latex', axis=1).format({"percent": '{:.2f}'}, escape='latex').to_latex(hrules=True))
# print(latex_df.style.format(subset='Opt-out Policy', escape=None).to_latex(hrules=True, multirow_align=True))

\begin{tabular}{lrl}
\toprule
Violation Type & \# Cookies & \# Websites \\
\midrule
Omitted Preference & 70,547 & 89.05\% (4,497/5,050) \\
Incorrect Enforcement & 39,446 & 81.86\% (4,134/5,050) \\
Ambiguous Enforcement & 345 & 4.02\% (203/5,050) \\
\bottomrule
\end{tabular}



In [11]:
class StopExecution(Exception):
    """Exception raised on purpose to halt execution at this cell."""
    def _render_traceback_(self):
        # Returns None. Presumably IPython uses this hook instead of printing
        # a full traceback for the deliberate stop — confirm against IPython docs.
        pass

# Deliberately abort here so a full run does not continue into the cells below.
raise StopExecution

StopExecution: 

In [None]:
# Split sites into those with at least one non-'comply' cookie and the rest.
sites = set(complies['site'])
vio_sites = set(complies.loc[complies['comply'] != 'comply', 'site'])
cor_sites = sites.difference(vio_sites)
n_cor_sites = len(cor_sites)
print(f"Num correct sites: {n_cor_sites:,d} ({n_cor_sites} / {len(sites)} = {n_cor_sites / len(sites) * 100:.2f} %)")
# Show the first few correct sites in alphabetical order.
print(sorted(cor_sites)[:5])


Num correct sites: 360 (360 / 6050 = 5.95 %)
['abus.com', 'accesso.com', 'activelearnprimary.co.uk', 'adcombo.com', 'adtelligent.com']


In [None]:
# Cookie counts on sites
# For each cookie name flagged 'incorrect', count the distinct sites it appears on.
incorrect_cookies = complies[complies['comply'] == 'incorrect']
cookie_counts_map = {
    cookie_name: cookies['site'].nunique()
    for cookie_name, cookies in incorrect_cookies.groupby('name')
}
cookie_counts = pd.DataFrame(cookie_counts_map.items(), columns=['name', 'site_count'])
# Display the seven most widespread names.
cookie_counts.sort_values(by='site_count', ascending=False).head(7)

Unnamed: 0,name,site_count
1129,_ga,2542
1724,_gid,2442
1121,_fbp,1374
480,IDE,938
798,YSC,875
771,VISITOR_INFO1_LIVE,867
3317,lang,808


In [None]:
# Export list of incor sites
from consent.util.default_path import create_data_dir
import numpy as np

out_dir = create_data_dir('2022-06-07')
# Sorted, de-duplicated sites that have at least one 'incorrect' cookie.
incorrect_mask = complies['comply'] == 'incorrect'
site_list = sorted(set(complies.loc[incorrect_mask, 'site']))
# Write the list as six roughly equal text files, one site per line.
for i, sub_list in enumerate(np.array_split(site_list, 6)):
    out_file = out_dir / f'incor_sites_{i}.txt'
    out_file.write_text('\n'.join(sub_list))
    print(f'Written {len(sub_list)} to {out_file}')

Written 829 to /mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-06-07/incor_sites_0.txt
Written 829 to /mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-06-07/incor_sites_1.txt
Written 829 to /mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-06-07/incor_sites_2.txt
Written 829 to /mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-06-07/incor_sites_3.txt
Written 829 to /mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-06-07/incor_sites_4.txt
Written 828 to /mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-06-07/incor_sites_5.txt


# Analyze sent cookies

In [None]:
# Read every scan_*.parquet file under the scan root (with a progress bar)
# and stack them into a single frame.
sent_cookies_files = [*SCAN_ROOT_DIR.glob('scan_*.parquet')]
sent_cookies_dfs = []
for scan_file in tqdm(sent_cookies_files):
    sent_cookies_dfs.append(pd.read_parquet(scan_file))
sent_cookies = pd.concat(sent_cookies_dfs)

100%|██████████| 6/6 [00:09<00:00,  1.57s/it]


In [None]:
# Distinct sites in the unfiltered sent_cookies frame.
print(f"(Raw) Num sites with flows: {sent_cookies.site.nunique():,d}")

(Raw) Num sites with flows: 6,086


In [None]:
# Keep only the captured cookies of sites that also appear in all_complies.
complies_sites = set(all_complies['site'])
in_complies = sent_cookies['site'].isin(complies_sites)
sent_cookies_com = sent_cookies.loc[in_complies]

In [None]:
# Column subsets used to de-duplicate the captured cookies at two granularities:
# per request/value, and per browser cookie (value and request_url dropped).
sent_cookie_cols = ['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'value', 'request_url', 'site']
br_cookie_cols = ['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'site']
prj_sent_cookies_com = sent_cookies_com.loc[:, sent_cookie_cols].drop_duplicates()
prj_br_cookies_com = prj_sent_cookies_com.loc[:, br_cookie_cols].drop_duplicates()
print(f"Num captured sent cookies: {len(sent_cookies_com):,d}")
print(f"Num unique captured cookies: {len(prj_sent_cookies_com):,d}")

n_br_cookies_com = len(prj_br_cookies_com)
print(f"Num unique browser cookies: {n_br_cookies_com:,d} on {prj_br_cookies_com.site.nunique():,d} websites") # and {sent_cookies.page_url.nunique():,d} pages")
# NOTE(review): n_cookies_analyzed is derived from comply_sites after its
# 'comply' row is filtered out, while the count below covers every distinct
# (domain, name, path, site) cookie in the flows — confirm the two are meant to match.
n_flow_cookies = len(prj_sent_cookies_com[['domain', 'name', 'path', 'site']].drop_duplicates())
assert n_flow_cookies == n_cookies_analyzed, 'Number of cookies in all_complies does not match the flows'
print(f"Num cookies: {n_cookies_analyzed:,d} in web pages: {sent_cookies_com.page_url.nunique():,d}")

Num captured sent cookies: 13,364,222
Num unique captured cookies: 7,712,901
Num unique browser cookies: 274,143 on 6,050 websites
Num cookies: 124,830 in web pages: 32,196


In [None]:
# Total number of rows in the unfiltered sent_cookies frame, thousands-separated.
print(format(len(sent_cookies), ',d'))

13,458,638


In [None]:
# Sites present both in the cookie preferences and in the captured cookies.
common_sites = set(cookie_prefs.site) & set(sent_cookies.site)
print("Common sites between cookie prefs and cookie captured:", len(common_sites))

Common sites between cookie prefs and cookie captured: 6057


In [None]:
# Distinct (name, domain, path, site) cookies among the captured cookies of
# the sites kept in sent_cookies_com.
decl_cookies = sent_cookies_com.loc[:, ['name', 'domain', 'path', 'site']].drop_duplicates()
len(decl_cookies)

124830

# Analyze contradictions

In [None]:
# (Testing) Way 1: compute contra sites by dynamic analysis: this should be lower than statically analyzing prefs
from consent.consistency.comply_util import get_appr_rej_sets
contra_data = []
# One groupby pass replaces re-masking the whole frame once per site.
# sort=False keeps sites in first-appearance order, matching `site.unique()`;
# each group holds the same rows (original index and order) the mask selected.
# Rows with a missing site are skipped by groupby (the mask gave them an empty frame).
for asite, site_prefs in cookie_prefs.groupby('site', sort=False):
    # Only the third element returned by the helper (the contradictions) is used.
    _, _, contras = get_appr_rej_sets(site_prefs)
    for contra in contras:
        # Tag each contradiction record with the site it was found on.
        contra['site'] = asite
        contra_data.append(contra)
contra_sites = pd.DataFrame(contra_data)
contra_sites.head()

In [None]:
# (use this) Way 2: compute contra sites by DataFrame group by
contra_cookies_dfs = []
n_always_active_cookies = 0
for _, declarations in cookie_prefs.groupby(['name', 'domain', 'site']):
    modes = declarations['consent_mode'].unique()
    # A contradiction needs at least two consent modes spread over more than
    # one category; anything else is skipped.
    if len(modes) < 2 or declarations['category'].nunique() <= 1:
        continue
    contra_cookies_dfs.append(declarations)
    # Track how many contradicting settings involve an 'always active' mode.
    if 'always active' in modes:
        n_always_active_cookies += 1
contra_cookies = pd.concat(contra_cookies_dfs).drop_duplicates()
contra_cookies.head()

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
716434,.ASPXANONYMOUS,professionals.muuto.com,69 days,1,Strictly Necessary Cookies,always active,muuto.com,onetrust,en.json,True
716449,.ASPXANONYMOUS,professionals.muuto.com,69 days,2,Performance Cookies,inactive,muuto.com,onetrust,en.json,False
362744,.EPiForm_BID,www.epicor.com,89 days,C0001,Strictly Necessary Cookies,always active,epicor.com,onetrust,en.json,True
362839,.EPiForm_BID,www.epicor.com,89 days,C0003,Functional Cookies,inactive,epicor.com,onetrust,en.json,False
795321,.EPiForm_BID,www.epicor.com,89 days,C0001,Strictly Necessary Cookies,always active,mechanicnet.com,onetrust,en.json,True


In [None]:
# Inspect the contradicting declarations recorded for a single site.
contra_cookies.loc[contra_cookies['site'] == 'cathkidston.com']

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
746270,weird_get_top_level_domain,cathkidston.com,Session,C0001,Strictly Necessary Cookies,always active,cathkidston.com,onetrust,en.json,True
746284,weird_get_top_level_domain,cathkidston.com,Session,C0003,Functional Cookies,inactive,cathkidston.com,onetrust,en.json,False


In [None]:
# Sample contradictions for manual verification: keep up to two rows for each
# site that appears among N randomly drawn contradiction rows.
N = 30
# The draw size now comes from N (it used to be hard-coded as 30, leaving N
# unused) and is capped so a frame with fewer than N rows does not raise.
# NOTE(review): rows are sampled, not sites, so fewer than N distinct sites can result.
n_drawn = min(N, len(contra_cookies))
sample_sites = set(contra_cookies.site.sample(n_drawn, random_state=3))
dfs = []
for site, site_contra_cookies in contra_cookies.groupby('site'):
    if site not in sample_sites:
        continue
    dfs.append(site_contra_cookies[:2])
contra_samples = pd.concat(dfs)
contra_samples_file = SCAN_ROOT_DIR / 'contra_sample.csv'
contra_samples.to_csv(contra_samples_file)
print(f'Written to {contra_samples_file}')

Written to /mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-05-30/contra_sample.csv


In [None]:
# Distribution of consent modes among the contradicting declarations.
contra_cookies['consent_mode'].value_counts()

always active           1082
inactive                 932
active                   202
inactive landingpage      16
Name: consent_mode, dtype: int64

In [None]:
# One entry of contra_cookies_dfs corresponds to one contradicting setting.
n_contra_settings = len(contra_cookies_dfs)
# Side-by-side: contra sites from way 1, contra sites from way 2, settings
# involving 'always active', and all contradicting settings.
n_sites_way1 = contra_sites['site'].nunique()
n_sites_way2 = contra_cookies['site'].nunique()
print(n_sites_way1, n_sites_way2, n_always_active_cookies, n_contra_settings)

681 676 1081 1107


In [None]:
# Distinct sites that have at least one contradicting cookie declaration.
n_contra_sites = contra_cookies['site'].nunique()
print("Number sites with contra cookies:", n_contra_sites)

Number sites with contra cookies: 676


In [None]:
# Break down the contradicting settings by whether one side is 'always active'.
# n_contra_settings and n_always_active_cookies come from earlier cells.
print(f"Num contra cookie settings (multiple categories): {n_contra_settings:,d} settings on {contra_cookies.site.nunique()} sites")
print(f"Contras with 1 always-active: {n_always_active_cookies}")
# The remainder are the settings with no 'always active' mode among them.
print(f"Contras with both editable: {n_contra_settings - n_always_active_cookies:,d}")

Num contra cookie settings (multiple categories): 1,107 settings on 676 sites
Contras with 1 always-active: 1081
Contras with both editable: 26
