In [1]:
"""Analyze omitted preferences."""
# TODO: most of the code below is copied from the compliance notebook; move the shared parts into a reusable module.

from collections import defaultdict, Counter
from multiprocessing import Pool
from pathlib import Path
from typing import Dict
import json
import re

from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


from consent.cmp.comply.cookie_match import cookie_pref_match
from consent.data.site_pref import read_site_prefs
from consent.data.postrej_cookie_reader import parallel_read_postrej_sent_cookies
from consent.util.default_path import get_data_dir
from ooutil.cookie_util import get_brower_cookies
from ooutil.type_util import hashabledict
from ooutil.url_util import get_suffixed_domain


# Scan snapshot analysed in this notebook. Snapshots used in earlier runs:
#   2021-04-27/pref_menu_scan, 2021-04-28/pref_menu_scan, 2021-04-30/pref_menu_scan
data_dir = get_data_dir('2021-05-03/pref_menu_scan_cleanstart')
assert data_dir.exists()

# Directory for the paper figures; it has to exist already and be a directory.
fig_dir = Path.home().joinpath('local_projects/consent/paper/sp22/figures')
assert fig_dir.is_dir()

In [2]:
# Every sub-directory of data_dir counts as one scanned site, named after the directory.
site_dirs = [entry for entry in data_dir.glob('*') if entry.is_dir()]
scanned_sites = [entry.name for entry in site_dirs]
print("Number of scanned sites:", len(scanned_sites))

# Cookie preferences read from the per-site directories.
site_prefs = read_site_prefs(site_dirs)
site_prefs

Number of scanned sites: 227
Contain duplicates: Before 12937, after drop duplicates: 12927


Unnamed: 0,duration,host,name,group_id,site,consent
0,Session,healthgrades.com,_vapi,C0001,healthgrades.com,True
1,365 days,partners.healthgrades.com,OptanonConsent,C0001,healthgrades.com,True
2,a few seconds,healthgrades.com,_dc_gtm_UA-xxxxxxxx,C0001,healthgrades.com,True
3,2914169 days,partners.healthgrades.com,eupubconsent,C0001,healthgrades.com,True
4,365 days,healthgrades.com,OptanonAlertBoxClosed,C0001,healthgrades.com,True
...,...,...,...,...,...,...
12932,390 days,yieldoptimizer.com,ph,SPD_BG,bestwestern.com,False
12933,390 days,yieldoptimizer.com,dph,SPD_BG,bestwestern.com,False
12934,390 days,yieldoptimizer.com,gcma,SPD_BG,bestwestern.com,False
12935,390 days,yieldoptimizer.com,ckid,SPD_BG,bestwestern.com,False


In [3]:
# Distinct values of the `site` column of site_prefs, sorted.
cookielist_sites = sorted(site_prefs['site'].unique().tolist())
n_cookielist_sites = len(cookielist_sites)
# Cross-check the list length against pandas' own distinct count.
assert n_cookielist_sites == site_prefs['site'].nunique()
print("Sites with cookie list:", n_cookielist_sites, cookielist_sites)

Sites with cookie list: 83 ['accorhotels.com', 'adtelligent.com', 'apachefriends.org', 'behance.net', 'bestwestern.com', 'biomedcentral.com', 'bitnami.com', 'bodybuilding.com', 'britishcouncil.org', 'callofduty.com', 'canon.com', 'cell.com', 'chanel.com', 'christies.com', 'cloudflare.com', 'cnet.com', 'commonsensemedia.org', 'corel.com', 'digicert.com', 'digitalspy.com', 'documentforce.com', 'download.com', 'elsevier.com', 'elsevierhealth.com', 'exacttarget.com', 'exoclick.com', 'fairmont.com', 'fendi.com', 'force.com', 'gamespot.com', 'glassdoor.co.in', 'glassdoor.com', 'goal.com', 'healthgrades.com', 'home.kpmg', 'hotjar.com', 'irishtimes.com', 'kpmg.com', 'magento.com', 'mango.com', 'marketo.com', 'mcdonalds.com', 'media.net', 'mendeley.com', 'metacritic.com', 'myportfolio.com', 'nature.com', 'netflix.com', 'news24.com', 'nflxvideo.net', 'oclc.org', 'onetrust.com', 'pardot.com', 'payoneer.com', 'pendo.io', 'rapidssl.com', 'redbull.com', 'rte.ie', 'salesforce.com', 'scientificamerica

In [4]:
# Parquet cache of the raw post-rejection sent cookies, stored inside the scan directory.
raw_postrej_sent_cookies_file = data_dir / 'raw_postrej_sent_cookies.parquet'
if not raw_postrej_sent_cookies_file.exists():
    # No cache yet: read the cookies from the scan directory and save them for later runs.
    raw_postrej_sent_cookies = parallel_read_postrej_sent_cookies(data_dir)
    raw_postrej_sent_cookies.to_parquet(raw_postrej_sent_cookies_file)
    print(f"Written to {raw_postrej_sent_cookies_file}")
else:
    raw_postrej_sent_cookies = pd.read_parquet(raw_postrej_sent_cookies_file)

raw_postrej_sent_cookies.head(3)

Unnamed: 0,domain,expires,httpOnly,name,path,sameSite,secure,value,request_url,site,load_start_time,load_end_time
0,.siemens.com,1651627000.0,False,OptanonAlertBoxClosed,/,,True,2021-05-04T01:24:38.671Z,https://www.siemens.com/locales/en-newhome.json,siemens.com,1620091000.0,1620091000.0
1,.siemens.com,-1.0,False,s_cc,/,,False,true,https://assets.new.siemens.com/siemens/assets/...,siemens.com,1620091000.0,1620091000.0
2,.siemens.com,1651627000.0,False,OptanonAlertBoxClosed,/,,True,2021-05-04T01:24:38.671Z,https://www.siemens.com/fonts/SiemensSans_Prof...,siemens.com,1620091000.0,1620091000.0


In [8]:
# Distinct sent cookies: keep only the columns listed here, then de-duplicate.
sent_cookie_columns = ['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'value', 'request_url', 'site']
postrej_sent_cookies = raw_postrej_sent_cookies[sent_cookie_columns].drop_duplicates()

# Distinct cookies per site, ignoring the value and the request URL they were sent with.
browser_cookie_columns = ['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'site']
postrej_br_cookies = postrej_sent_cookies[browser_cookie_columns].drop_duplicates()

In [9]:
# Row counts at each de-duplication stage, one per output line.
print(
    f"Num raw captured cookies: {len(raw_postrej_sent_cookies)}",
    f"Num unique captured cookies: {len(postrej_sent_cookies)}",
    f"Num unique browser cookies: {len(postrej_br_cookies)}",
    sep="\n",
)

Num raw captured cookies: 958458
Num unique captured cookies: 562571
Num unique browser cookies: 22177


In [7]:
# Analyze the preference cookie.

In [13]:
# Name of the cookie inspected here as the preference cookie.
pref_cookie = "OptanonConsent"
# Distinct (domain, name, site) rows of the browser cookies carrying that name.
is_pref_cookie = postrej_br_cookies['name'] == pref_cookie
postrej_br_cookies.loc[is_pref_cookie, ['domain', 'name', 'site']].drop_duplicates()


Unnamed: 0,domain,name,site
5,.siemens.com,OptanonConsent,siemens.com
1712,.healthgrades.com,OptanonConsent,healthgrades.com
3573,.bonappetit.com,OptanonConsent,bonappetit.com
12648,.callofduty.com,OptanonConsent,callofduty.com
24585,.askubuntu.com,OptanonConsent,askubuntu.com
...,...,...,...
933734,.chanel.com,OptanonConsent,chanel.com
935895,.broadcom.com,OptanonConsent,broadcom.com
939576,.epicurious.com,OptanonConsent,epicurious.com
947992,.bestwestern.com,OptanonConsent,bestwestern.com


## Analyze omitted preferences

In [8]:
# Map intercepted cookies to browser cookies.
# The cookie_match module is reloaded first and cookie_pref_match is imported again afterwards,
# so that the name is rebound to the function of the reloaded module.
import sys; import importlib; importlib.reload(sys.modules['consent.cmp.comply.cookie_match'])
from consent.cmp.comply.cookie_match import cookie_pref_match

# Written by get_complies_for_site: site -> set of preferences found in both the approved and
# the rejected set of that site.
site_to_contras = {}  # TODO: make this non-global.

def check_in_set(site, acookie, cookie_pref_set, verbose=0):
    """Tell whether ``acookie`` matches any preference in ``cookie_pref_set``.

    The actual comparison is delegated to
    ``cookie_pref_match(acookie, cookie_pref, site)``; iteration stops at the
    first preference that matches.

    Args:
        site: Site the cookie belongs to; forwarded to the matcher.
        acookie: Mapping describing the observed cookie. Its ``'name'`` entry
            is read for the name-filtered trace output.
        cookie_pref_set: Iterable of preference mappings to test against.
        verbose: At 3 or above every candidate tried is printed; at 2 (and
            below 3) only candidates whose ``'name'`` equals the cookie's
            name; below 2 nothing is printed.

    Returns:
        True if some preference matches, False otherwise.
    """
    # The loop variable keeps the name `cookie_pref` because the `=` f-string
    # specifier prints the variable name as part of the trace line.
    for cookie_pref in cookie_pref_set:
        should_trace = verbose >= 3 or (
            verbose >= 2 and cookie_pref['name'] == acookie['name']
        )
        if should_trace:
            print(f'{cookie_pref=} {acookie=}')

        if cookie_pref_match(acookie, cookie_pref, site):
            return True
    return False


def comply_check(site, acookie, appr_set, rej_set):
    """Label a cookie by its membership in the approved and rejected sets.

    Both memberships are always evaluated with ``check_in_set``.

    Args:
        site: Site the cookie belongs to.
        acookie: The cookie to classify.
        appr_set: Preferences the user approved.
        rej_set: Preferences the user rejected.

    Returns:
        ``'comply'`` if the cookie matches only the approved set,
        ``'incorrect'`` if it matches only the rejected set,
        ``'omit'`` if it matches neither, and
        ``'ambiguous'`` if it matches both.
    """
    in_approved = check_in_set(site, acookie, appr_set)
    in_rejected = check_in_set(site, acookie, rej_set)

    # Keyed by (matches approved set, matches rejected set).
    verdicts = {
        (True, False): 'comply',
        (False, True): 'incorrect',
        (False, False): 'omit',
        (True, True): 'ambiguous',
    }
    return verdicts[(bool(in_approved), bool(in_rejected))]


def get_appr_rej_sets(cookie_consents):
    """Split cookie preferences into approved, rejected and contradictory sets.

    Each row is reduced to a ``hashabledict`` with the keys ``'host'`` and
    ``'name'``. The row goes into the approved set when its ``consent`` value
    is truthy and into the rejected set otherwise.

    Args:
        cookie_consents: DataFrame with ``host``, ``name`` and ``consent``
            columns. An empty frame yields three empty sets.

    Returns:
        A 3-tuple ``(appr_set, rej_set, contra_set)`` where ``contra_set``
        holds the entries present in both the approved and the rejected set.
    """
    appr_set = set()
    rej_set = set()
    # An empty frame contributes nothing (and may not even carry the columns).
    if cookie_consents.empty:
        return appr_set, rej_set, set()

    # Walk the three needed columns directly; iterrows() would build a
    # throw-away Series for every row just to read these fields.
    rows = zip(
        cookie_consents['host'],
        cookie_consents['name'],
        cookie_consents['consent'],
    )
    for host, name, consent in rows:
        cookie = hashabledict({'host': host, 'name': name})
        target = appr_set if consent else rej_set
        target.add(cookie)

    contra_set = appr_set & rej_set
    return appr_set, rej_set, contra_set


def get_complies_for_site(site, prefs, sent_cookies):
    """Classify every sent cookie of one site against that site's preferences.

    Side effects: stores the site's contradictory set in the module-level
    ``site_to_contras`` mapping and prints it when it is not empty.

    Args:
        site: Site being analysed.
        prefs: Cookie preferences of the site, passed to ``get_appr_rej_sets``.
        sent_cookies: Iterable of cookie dicts; the inputs are not modified.

    Returns:
        A list with one new dict per sent cookie: the cookie's own entries
        plus ``'comply'`` (the label from ``comply_check``) and ``'site'``.
    """
    # `contra_set` and `site` keep their names: the `=` f-string specifier
    # below prints the variable names.
    appr_set, rej_set, contra_set = get_appr_rej_sets(prefs)

    def _annotate(sent_cookie):
        # Classify first, then build a fresh dict so the input stays untouched.
        verdict = comply_check(site, sent_cookie, appr_set, rej_set)
        return {**sent_cookie, 'comply': verdict, 'site': site}

    comply_results = [_annotate(sent_cookie) for sent_cookie in sent_cookies]

    if contra_set:
        print(f'Contradictory set: {site=} {contra_set=}')
    site_to_contras[site] = contra_set
    return comply_results


def get_complies(site_prefs, postrej_sent_cookies):
    """Classify the sent cookies of every site in ``cookielist_sites``.

    The sites are taken from the module-level ``cookielist_sites`` list and
    processed in that order. For each site, the rows of both frames whose
    ``site`` column equals the site are handed to ``get_complies_for_site``.

    Args:
        site_prefs: DataFrame of cookie preferences with a ``site`` column.
        postrej_sent_cookies: DataFrame of sent cookies with a ``site`` column.

    Returns:
        A DataFrame built from the concatenated per-site result records.
    """
    def _records_for(site):
        sent_for_site = postrej_sent_cookies.loc[postrej_sent_cookies['site'] == site]
        prefs_for_site = site_prefs.loc[site_prefs['site'] == site]
        return get_complies_for_site(site, prefs_for_site, sent_for_site.to_dict('records'))

    all_records = [
        record
        for site in cookielist_sites
        for record in _records_for(site)
    ]
    return pd.DataFrame(all_records)

# Classify the sent cookies of every cookie-list site; each resulting row carries the
# cookie's fields plus the 'comply' label.
full_complies = get_complies(site_prefs, postrej_sent_cookies)
full_complies

Contradictory set: site='accorhotels.com' contra_set={{'host': 'pullman.accor.com', 'name': 'GoogleAdServingTest'}, {'host': 'ibis.accor.com', 'name': 'GoogleAdServingTest'}, {'host': 'all.accor.com', 'name': 'GoogleAdServingTest'}, {'host': 'hotelf1.accor.com', 'name': 'GoogleAdServingTest'}, {'host': 'mercure.accor.com', 'name': 'GoogleAdServingTest'}, {'host': 'google.com', 'name': 'NID'}, {'host': 'sofitel.accor.com', 'name': 'GoogleAdServingTest'}, {'host': 'restaurants.accor.com', 'name': 'GoogleAdServingTest'}}
Contradictory set: site='businesswire.com' contra_set={{'host': '', 'name': 'JSESSIONID'}, {'host': '', 'name': '__atrfs'}}
Contradictory set: site='cell.com' contra_set={{'host': 'hubspot.net', 'name': '__cfduid'}}
Contradictory set: site='fairmont.com' contra_set={{'host': 'google.com', 'name': 'CONSENT'}, {'host': 'google.com', 'name': 'NID'}}
Contradictory set: site='glassdoor.com' contra_set={{'host': 'glassdoor.com', 'name': 'dc'}}
Contradictory set: site='myportfol

Unnamed: 0,domain,expires,httpOnly,name,path,sameSite,secure,value,request_url,site,load_start_time,load_end_time,comply
0,.accor.com,-1.000000e+00,False,userLocalization,/,,False,us,https://all.accor.com/home/assets/icons/sprite...,accorhotels.com,1.620092e+09,1.620092e+09,omit
1,.accor.com,-1.000000e+00,False,_Hw2h_,/,,True,.s76b,https://all.accor.com/assets/fonts/montserrat/...,accorhotels.com,1.620092e+09,1.620092e+09,comply
2,.accor.com,1.651628e+09,False,contribZone,/,,False,usa,https://all.accor.com/services/api-service/ser...,accorhotels.com,1.620092e+09,1.620092e+09,comply
3,.accor.com,-1.000000e+00,False,dtCookie,/,,False,0F29622603FD0260CC6B45D8F0A5005A|QUxMfDE,https://all.accor.com/fstrz/r/stats-euwest1.fz...,accorhotels.com,1.620092e+09,1.620092e+09,omit
4,.accor.com,1.651628e+09,False,userLang,/,,False,en,https://all.accor.com/home/components/booking-...,accorhotels.com,1.620092e+09,1.620092e+09,omit
...,...,...,...,...,...,...,...,...,...,...,...,...,...
363913,.zemanta.com,1.627868e+09,False,_fbp,/,,False,fb.1.1620092010749.1105045950,https://www.zemanta.com/wp-content/themes/Divi...,zemanta.com,1.620092e+09,1.620092e+09,omit
363914,.zemanta.com,1.622684e+09,True,__cfduid,/,Lax,False,d8e20ee2abf2d2c29107747a7b263da461620092008,https://www.zemanta.com/wp-content/plugins/mon...,zemanta.com,1.620092e+09,1.620092e+09,omit
363915,.zemanta.com,1.620178e+09,False,_gid,/,,False,GA1.2.512584698.1620092011,https://www.zemanta.com/wp-content/plugins/con...,zemanta.com,1.620092e+09,1.620092e+09,omit
363916,.zemanta.com,1.620178e+09,False,_gid,/,,False,GA1.2.512584698.1620092011,https://www.zemanta.com/wp-includes/js/mediael...,zemanta.com,1.620092e+09,1.620092e+09,omit


In [9]:
# One row per distinct (name, domain, site, comply) combination.
verdict_columns = ['name', 'domain', 'site', 'comply']
complies = full_complies[verdict_columns].drop_duplicates()

# Number of those distinct rows per compliance label.
comply_counts = complies['comply'].value_counts()
comply_counts

omit         1450
comply        772
incorrect     726
ambiguous      12
Name: comply, dtype: int64

In [10]:
# For each compliance label, count the distinct sites that have at least one such cookie.
comply_sites_data = defaultdict(list)
for comply_type, group in complies.groupby('comply'):
    n_sites_with_type = group['site'].nunique()
    comply_sites_data['comply_type'].append(comply_type)
    comply_sites_data['num_sites'].append(n_sites_with_type)

# Most widespread label first, then add the share of cookie-list sites (in percent) and
# the per-label cookie count taken from comply_counts.
comply_sites = (
    pd.DataFrame(comply_sites_data)
    .sort_values(by=['num_sites'], ascending=False)
    .assign(
        num_sites_percent=lambda frame: frame['num_sites'] / n_cookielist_sites * 100,
        num_cookies=lambda frame: frame['comply_type'].map(comply_counts),
    )
)

# Every label except 'comply'.
noncomply_sites = comply_sites[comply_sites['comply_type'] != 'comply']
comply_sites

Unnamed: 0,comply_type,num_sites,num_sites_percent,num_cookies
3,omit,84,97.674419,1450
1,comply,83,96.511628,772
2,incorrect,66,76.744186,726
0,ambiguous,4,4.651163,12


In [11]:
# Build the LaTeX table of (non)compliance types from comply_sites.
latex = comply_sites[['comply_type', 'num_cookies', 'num_sites', 'num_sites_percent']].copy()
# Website share rendered as "<percent>% (<sites>/<all cookie-list sites>)" with exactly one
# closing parenthesis, so the parentheses in every table row are balanced.
latex['num_sites_combine'] = latex.apply(
    lambda row: f"{row['num_sites_percent']:.2f}% ({row['num_sites']}/{n_cookielist_sites})",
    axis=1,
)
latex = latex[['comply_type', 'num_cookies', 'num_sites_combine']]
# Human-readable column headers and row labels for the paper.
latex = latex.rename(columns={'comply_type': '(Non)compliance Type', 'num_sites_combine': '# Websites', 'num_cookies': '# Cookies'})
latex = latex.replace({'omit': "Omitted Preference", "incorrect": "Incorrect Enforcement", "ambiguous": "Ambiguous Enforcement", "comply": "Complied Enforcement"})
print(latex.to_latex(index=False))

\begin{tabular}{lrl}
\toprule
 (Non)compliance Type &  \# Cookies &      \# Websites \\
\midrule
   Omitted Preference &       1450 & 97.67\% (84/86)) \\
 Complied Enforcement &        772 & 96.51\% (83/86)) \\
Incorrect Enforcement &        726 & 76.74\% (66/86)) \\
Ambiguous Enforcement &         12 &   4.65\% (4/86)) \\
\bottomrule
\end{tabular}

