In [1]:
from multiprocessing import Pool
from pathlib import Path

from dask.distributed import Client, progress
Client()

from dask.diagnostics import ProgressBar
from tqdm import tqdm
import pandas as pd
import dask.dataframe as dd

from consent.consistency.cookie_pref_match import cookie_pref_match
from consent.consistency.util import SCAN_DIRS, FIG_DIR, SCAN_ROOT_DIR
from consent.data.pref_menu_scan.cookie_pref_reader import read_cookie_prefs_in_scans
from consent.data.pref_menu_scan.log_reader import read_logs_in_scans
from consent.data.pref_menu_scan.cookie_decl_reader import read_cookie_decls_in_scans
from consent.data.pref_menu_scan.postrej_cookie_reader import read_postrej_sent_cookies_in_scans
from consent.data.pref_menu_scan.cat_pref_reader import read_cat_prefs_in_dirs
from consent.util.default_path import get_data_dir
from ooutil.type_util import hashabledict
from ooutil.url_util import get_suffixed_domain

# Locate the scan output for the '2021-08-12' data directory and its three
# pref_menu_scan batch folders, failing fast if any of them is missing.
# NOTE(review): the later visible cells read SCAN_DIRS / SCAN_ROOT_DIR imported from
# consent.consistency.util rather than scan_dirs / scan_root_dir / fig_dir defined
# here — confirm these local variables are still needed.
scan_root_dir = get_data_dir('2021-08-12')
scan_dirs = [scan_root_dir / 'pref_menu_scan_0k_10k',
             scan_root_dir / 'pref_menu_scan_10k_20k',
             scan_root_dir / 'pref_menu_scan_20k_30k']
assert all(scan_dir.exists() for scan_dir in scan_dirs)

# Figure output folder, resolved relative to the current user's home directory.
fig_dir = Path.home() / 'local_projects/consent/paper/sp22/figures'
assert fig_dir.exists() and fig_dir.is_dir()

distributed.diskutils - INFO - Found stale lock file and directory '/home/ducbui/Dropbox (University of Michigan)/projects/consent/consent_project/src/consent/consistency/dask-worker-space/worker-7ntgmzxo', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/ducbui/Dropbox (University of Michigan)/projects/consent/consent_project/src/consent/consistency/dask-worker-space/worker-wy3e8l28', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/ducbui/Dropbox (University of Michigan)/projects/consent/consent_project/src/consent/consistency/dask-worker-space/worker-k81t0_0f', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/ducbui/Dropbox (University of Michigan)/projects/consent/consent_project/src/consent/consistency/dask-worker-space/worker-707bnc9x', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/ducbui/Dropbox (University of Michigan)/projects/consent/consent_p

In [2]:
# Read the cookie preferences from every scan directory in SCAN_DIRS.
# The frame is still raw at this point: exact duplicate rows are only removed
# later, where cookie_prefs is derived with drop_duplicates().
raw_cookie_prefs = read_cookie_prefs_in_scans(SCAN_DIRS)
raw_cookie_prefs.head()

Fail to read log 0 sites: []
Failed to read cookies on 5: ['eero.com', 'eurail.com', 'fhi360.org', 'marieforleo.com', 'reiss.com']


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
0,s-9da4,go.helpshift.com,7 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,False
1,_gid,helpshift.com,1 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,False
2,blog_glossary_page,helpshift.com,7 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,False
3,__utma,helpshift.com,730 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,False
4,_ga,helpshift.com,730 days,C0002,Performance Cookies,inactive,helpshift.com,onetrust,en.json,False


In [3]:
# Read in cookie transfer 
import sys; import importlib; importlib.reload(sys.modules['consent.data.pref_menu_scan.postrej_cookie_reader'])
from consent.data.pref_menu_scan.postrej_cookie_reader import read_postrej_sent_cookies_in_scans


overwrite = False
cookies_cache_file = SCAN_ROOT_DIR / 'raw_postrej_sent_cookies.parquet'

# Re-read the scans when a refresh is forced (overwrite), when caching is disabled
# (falsy cache path) or when no cache file exists yet; otherwise load the parquet cache.
if overwrite or not cookies_cache_file or not cookies_cache_file.exists():
    sent_cookies = read_postrej_sent_cookies_in_scans(SCAN_DIRS)
    # Persist the freshly read cookies so the next run can take the cached branch.
    if cookies_cache_file:
        sent_cookies.to_parquet(cookies_cache_file)
        print(f"Written to {cookies_cache_file}")
else:
    sent_cookies = pd.read_parquet(cookies_cache_file)

print(f"Number sent cookies read: {len(sent_cookies):,d}")
sent_cookies.head(3)

Number sent cookies read: 5,196,675


Unnamed: 0,domain,expires,httpOnly,name,path,sameSite,secure,value,request_url,site,load_start_time,load_end_time,page_url
0,.apachefriends.org,1629067000.0,False,_gid,/,,False,GA1.2.720720591.1628980892,https://www.apachefriends.org/images/flags/fr-...,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html
1,.apachefriends.org,1660517000.0,False,OptanonAlertBoxClosed,/,Lax,False,2021-08-14T22:42:17.927Z,https://www.apachefriends.org/javascripts/all-...,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html
2,.apachefriends.org,1660517000.0,False,OptanonConsent,/,Lax,False,isIABGlobal=false&datestamp=Sat+Aug+14+2021+22...,https://www.apachefriends.org/images/flags/fr-...,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html


In [4]:
# Remove exact duplicate rows from the raw cookie preferences and report both counts.
cookie_prefs = raw_cookie_prefs.drop_duplicates()
print(f'There are {len(cookie_prefs):,d} unique_cookie_prefs ({len(raw_cookie_prefs):,d} preferences with duplicates)')
# Sites present in the captured sent cookies for which no preference row exists.
print(f"{len(set(sent_cookies.site) - set(cookie_prefs.site))} sites in sent cookies but do not have preferences.")

There are 196,722 unique_cookie_prefs (221,671 preferences with duplicates)
5 sites in sent cookies but do not have preferences.


In [5]:
# Sample a subset of sites; from this point on, use s_cookie_prefs.
# n_samples counts preference ROWS, not sites: rows are sampled first and the
# distinct sites among them become sample_sites. With n_samples == len(cookie_prefs)
# every row (and therefore every site) is selected; a small value such as 100
# gives a quick subset for experimentation.
n_samples = len(cookie_prefs) # 100
# Fixed random_state keeps the sampled site list reproducible between runs.
sample_sites = cookie_prefs.sample(n_samples, random_state=1024).site.unique()
n_sites = cookie_prefs.site.nunique()
print(f"Num sample sites: {len(sample_sites):,d} {len(sample_sites) / n_sites*100:.2f}% of all {n_sites} sites.")

# Keep ALL preference rows of the sampled sites (not only the sampled rows).
s_cookie_prefs = cookie_prefs[cookie_prefs.site.isin(sample_sites)].drop_duplicates()

Num sample sites: 1,361 100.00% of all 1361 sites.


In [6]:
# Map and detect ambiguity; note: prj = post-rejection, from this point, use prj_sent_cookies and prj_br_cookies
# s_sent_cookies: captured sent cookies restricted to the sampled sites.
s_sent_cookies = sent_cookies[sent_cookies.site.isin(sample_sites)]
# prj_sent_cookies: unique rows over the cookie attributes plus value and request_url.
prj_sent_cookies = s_sent_cookies[['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'value', 'request_url', 'site']].drop_duplicates()
# prj_br_cookies: the same rows without value and request_url, de-duplicated again,
# so a cookie sent on several requests appears once per distinct attribute combination.
prj_br_cookies = prj_sent_cookies[['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'site']].drop_duplicates()

In [7]:
# Size of each stage: all captured rows, unique (cookie, value, request_url) rows,
# and unique browser cookies, together with the number of sites and pages covered.
print(f"Num captured sent cookies: {len(s_sent_cookies):,d}")
print(f"Num unique captured cookies: {len(prj_sent_cookies):,d}")
n_br_cookies = len(prj_br_cookies)
print(f"Num unique browser cookies: {n_br_cookies:,d} on {prj_br_cookies.site.nunique():,d} websites and {s_sent_cookies.page_url.nunique():,d} pages")

Num captured sent cookies: 5,184,003
Num unique captured cookies: 3,120,260
Num unique browser cookies: 75,529 on 1,355 websites and 7,721 pages


# Find number of cookie preferences that match a browser cookie.

In [8]:
# Select the strategy with the lowest ambiguity score.

import sys; import importlib; importlib.reload(sys.modules['consent.consistency.cookie_pref_match'])
from consent.consistency.cookie_pref_match import cookie_pref_match

def get_n_pref_matches(cookie, cookie_prefs):
    """Count the distinct (domain, name) preferences that match one cookie.

    Only preferences recorded for the cookie's own site and carrying the same
    cookie name are considered; each distinct (domain, name) pair among them is
    then tested with ``cookie_pref_match``.

    Args:
        cookie: mapping with at least 'site' and 'name' keys (a DataFrame row
            works); it is handed unchanged to ``cookie_pref_match``.
        cookie_prefs: DataFrame with 'site', 'name' and 'domain' columns.

    Returns:
        Number of candidate preferences for which ``cookie_pref_match`` is truthy.
    """
    site = cookie['site']
    same_site_and_name = (cookie_prefs.site == site) & (cookie_prefs.name == cookie['name'])
    candidates = (
        cookie_prefs[same_site_and_name][['domain', 'name']]
        .drop_duplicates()
        .to_dict('records')
    )
    # Matching is delegated to cookie_pref_match; simpler criteria (exact domain
    # equality, or cookie domain == '.' + preference domain) are not used here.
    return sum(1 for pref in candidates if cookie_pref_match(cookie, pref, site))

# prefs: unique preference rows reduced to the columns used for matching and reporting.
prefs = s_cookie_prefs[['name', 'domain', 'category_id', 'category', 'consent_mode', 'site']].drop_duplicates()
print(f"Num browser cookies: {len(prj_br_cookies):,d}, Num preferences: {len(prefs):,d}")

Num browser cookies: 75,529, Num preferences: 196,669


In [12]:
def get_n_cookie_matches(pref, cookies):
    """Count the distinct (domain, name) cookies that match one preference.

    Only cookies observed on the preference's own site and carrying the same
    name are considered; each distinct (domain, name) pair among them is then
    tested with ``cookie_pref_match``.

    Args:
        pref: mapping with at least 'site' and 'name' keys (a DataFrame row
            works); it is handed unchanged to ``cookie_pref_match``.
        cookies: DataFrame with 'site', 'name' and 'domain' columns.

    Returns:
        Number of candidate cookies for which ``cookie_pref_match`` is truthy.
    """
    site = pref['site']
    same_site_and_name = (cookies.site == site) & (cookies.name == pref['name'])
    candidates = (
        cookies[same_site_and_name][['domain', 'name']]
        .drop_duplicates()
        .to_dict('records')
    )
    # Matching is delegated to cookie_pref_match; simpler criteria (exact domain
    # equality, or cookie domain == '.' + preference domain) are not used here.
    return sum(1 for cookie in candidates if cookie_pref_match(cookie, pref, site))

# Enable pandas' progress_apply so the row-wise apply below reports progress.
tqdm.pandas()
# For every preference row, count the matching browser cookies (runs in this process).
n_cookie_matches = prefs.progress_apply(lambda row: get_n_cookie_matches(row, prj_br_cookies), axis=1) #, meta=('n_matches', 'int64'))
# Disabled alternative: a quick test on a sample, and the same computation through a
# dask DataFrame with 32 partitions.
# test = prefs.sample(100)  # test.apply(lambda row: get_n_cookie_matches(row, prj_br_cookies), axis=1) # , meta=('n_matches', 'int64'))
# prefs_ddf = dd.from_pandas(prefs, npartitions=32)
# y = prefs_ddf.apply(lambda row: get_n_cookie_matches(row, prj_br_cookies), axis=1, meta=('n_matches', 'int64'))
# y = y.persist()  # start computation in the background
# progress(y)      # watch progress  # Cannot be interrupted :(
# prefs['n_cookie_matches'] = y.compute()

  2%|▏         | 3925/196669 [00:45<39:03, 82.24it/s]tornado.application - ERROR - Exception in callback functools.partial(<bound method IOLoop._discard_future_result of <zmq.eventloop.ioloop.ZMQIOLoop object at 0x7f1b68031b50>>, <Task finished name='Task-2618' coro=<Cluster._sync_cluster_info() done, defined at /home/ducbui/anaconda3/envs/consent/lib/python3.8/site-packages/distributed/deploy/cluster.py:104> exception=OSError('Timed out during handshake while connecting to tcp://127.0.0.1:38293 after 30 s')>)
Traceback (most recent call last):
  File "/home/ducbui/anaconda3/envs/consent/lib/python3.8/asyncio/tasks.py", line 465, in wait_for
    fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/ducbui/anaconda3/envs/consent/lib/python3.8/site-packages/distributed/comm/core.py", line 320, in connect
    await asyncio.wait_for(comm.write(local_info), time_left())
  File "/

In [14]:
# Store the per-preference match counts as a new column (this adds a column to prefs in place).
prefs['n_cookie_matches'] = n_cookie_matches
# vc: number of preferences per match count (0, 1, 2, ... matching browser cookies).
vc = prefs.n_cookie_matches.value_counts()
print("Distribution of preference -> n cookie matches:")
vc

Distribution of preference -> n cookie matches:


0    173088
1     23527
2        52
3         2
Name: n_cookie_matches, dtype: int64

In [None]:
# n_multi: preferences that match more than one browser cookie (ambiguous mapping).
n_multi = vc[vc.index > 1].sum()
# n_uniq: preferences that match exactly one cookie; vc[1] is a label lookup, so it
# raises KeyError when no preference has exactly one match.
n_uniq = vc[1]
# The final number printed is n_multi expressed as a percentage of n_uniq.
print(f"Number of ambiguous mapping (1 preference -> multiple cookies): {n_multi=:,d} {n_uniq=:,d}: {n_multi/n_uniq*100:.2f}")

In [9]:
# Dask single machine 57 sec, 35 min single core, 6min min on 32 cores, progress bar not show progressively
# Split the browser cookies into 32 partitions and count, for each cookie row, the
# preferences in `prefs` that match it; meta declares the result as an int64 series
# named 'n_matches'.
prj_br_cookies_ddf = dd.from_pandas(prj_br_cookies, npartitions=32)
x = prj_br_cookies_ddf.apply(lambda row: get_n_pref_matches(row, prefs), axis=1, meta=('n_matches', 'int64'))
x = x.persist()  # start computation in the background
progress(x)      # watch progress  # Cannot be interrupted :(


VBox()

In [10]:
# Collect the persisted dask result and store it as a new column (adds a column to
# prj_br_cookies in place).
prj_br_cookies['n_pref_matches'] = x.compute() # convert to final result, instantly

In [11]:
# Distribution: number of browser cookies per count of matching preferences.
prj_br_cookies.n_pref_matches.value_counts()

1    38530
0    35595
2     1384
4       15
3        5
Name: n_pref_matches, dtype: int64

In [12]:
# Same ambiguity ratio as for preferences, but in the cookie -> preferences direction.
# NOTE: this rebinds vc, n_multi and n_uniq, which an earlier cell used for the
# preference -> cookies direction.
vc = prj_br_cookies.n_pref_matches.value_counts()
# n_multi: cookies matching more than one preference; n_uniq: cookies matching exactly
# one (vc[1] is a label lookup and raises KeyError if that count is absent).
n_multi = vc[vc.index > 1].sum()
n_uniq = vc[1]
# The final number printed is n_multi expressed as a percentage of n_uniq.
print(f"Number of ambiguous mapping (1 cookie -> multiple preferences): {n_multi=:,d} {n_uniq=:,d}: {n_multi/n_uniq*100:.2f}")

Number of ambiguous mapping (1 cookie -> multiple preferences): n_multi=1,404 n_uniq=38,530: 3.64


In [None]:
# How many cookies that have same both domain and name?
# print("Num cookies preferences:", len(s_cookie_prefs))
# s_cookies = s_cookie_prefs[['domain', 'name', 'category', 'site']].drop_duplicates()
# print("Num unique cookie preferences:", len(s_cookies))

In [None]:
# plain pandas, 27-29 secs for sample(100)
# prj_br_cookies['n_matches'] = prj_br_cookies.apply(lambda row: get_n_matches(row, prefs), axis=1)

In [None]:
# s_cookie_prefs

# Analyze duplicate cookies

In [None]:
# s_cookies.value_counts()[:5]

domain             name              category                    site           
bat.bing.com       _uetvid           marketing                   justgiving.com     2
script.hotjar.com  _hjid             statistics                  icaew.com          2
cdn.segment.com    ajs_anonymous_id  statistics                  messagebird.com    2
login.siemens.com  _csrf             Strictly necessary cookies  mentor.com         2
                                                                 siemens.com        2
dtype: int64

In [None]:
# Case studies
# s_cookie_prefs[(s_cookie_prefs.domain == 'mc.yandex.ru') & (s_cookie_prefs.name == '_ym_uid')]
# s_cookie_prefs[(s_cookie_prefs.domain == 'bat.bing.com') & (s_cookie_prefs.name == '_uetsid')]
# s_cookie_prefs[(s_cookie_prefs.domain == 'start.stepchange.org') & (s_cookie_prefs.name == '_uetvid')]

In [None]:
def get_n_cookie_matches(cookies, pref):
    """Count the distinct (name, domain) cookies that match one preference.

    Every row of ``cookies`` is tested with ``cookie_pref_match`` using the
    cookie's own 'site' value; matching cookies are de-duplicated on
    (name, domain) before counting.

    NOTE(review): this redefines ``get_n_cookie_matches`` with the argument
    order (cookies, pref), the reverse of the earlier ``(pref, cookies)``
    definition in this notebook, and running this cell shadows that one.
    Unlike the earlier version it does not pre-filter the cookies by the
    preference's site or name — confirm that scanning all sites is intended.

    Args:
        cookies: DataFrame with at least 'site', 'name' and 'domain' columns.
        pref: preference record passed unchanged to ``cookie_pref_match``.

    Returns:
        Number of distinct (name, domain) pairs among the matching cookies.
    """
    matched_cookies = {
        hashabledict({'name': cookie['name'], 'domain': cookie['domain']})
        for cookie in cookies.to_dict('records')
        if cookie_pref_match(cookie, pref, cookie['site'])
    }
    return len(matched_cookies)

# s_cookie_prefs2 = s_cookie_prefs.copy()
# s_cookie_prefs2['n_matches'] = s_cookie_prefs2.apply(lambda pref: get_n_cookie_matches(prj_sent_cookies, pref), axis=1)
# s_cookie_prefs2.sort_values(by='n_matches', ascending=False)

In [None]:
# s_cookie_prefs2.n_matches.value_counts()

In [None]:
# prj_sent_cookies.head()

In [None]:
# s_cookie_prefs

In [None]:
# s_cookie_prefs = s_cookie_prefs[s_cookie_prefs.name == 'TAUnique']
# s_cookie_prefs 

In [None]:
# print('Num websites:', all_complies.site.nunique())
# all_complies.comply.unique()

In [None]:
# s_cookie_prefs

In [None]:
# Map intercepted cookies to browser cookies.
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.cookie_pref_match'])
from consent.consistency.cookie_pref_match import cookie_pref_match

# Module-level registry filled by _get_comply_for_site: maps a site to the set of
# (domain, name) preferences that occur both as approved and as rejected on that site.
site_to_contras = {}  # TODO: make this to non-global one.

def check_in_set(site, acookie, cookie_pref_set, verbose=0):
    """Find a preference in ``cookie_pref_set`` that matches ``acookie``.

    Args:
        site: site passed through to ``cookie_pref_match``.
        acookie: cookie record (mapping); its 'name' is only read for tracing.
        cookie_pref_set: iterable of preference records (mappings with 'name').
        verbose: 3 or more prints every comparison; 2 prints only comparisons
            where the preference name equals the cookie name; lower is silent.

    Returns:
        ``(True, pref)`` for the first preference, in iteration order, accepted
        by ``cookie_pref_match``; ``(False, None)`` when none matches. When a
        set is passed and several preferences match, which one is returned
        depends on the set's iteration order.
    """
    for cookie_pref in cookie_pref_set:
        # The loop variable keeps this name because the trace output prints it.
        should_trace = verbose >= 3 or (
            verbose >= 2 and cookie_pref['name'] == acookie['name'])
        if should_trace:
            print(f'{cookie_pref=} {acookie=}')

        if cookie_pref_match(acookie, cookie_pref, site):
            return True, cookie_pref
    return False, None

def get_comply_type(is_appr, is_rej):
    """Translate the two match flags of a cookie into a compliance label.

    Args:
        is_appr: truthy when the cookie matched an approved preference.
        is_rej: truthy when the cookie matched a rejected preference.

    Returns:
        'comply' (approved only), 'incorrect' (rejected only), 'omit' (neither)
        or 'ambiguous' (both).
    """
    label_by_flags = {
        (True, False): 'comply',
        (False, True): 'incorrect',
        (False, False): 'omit',
        (True, True): 'ambiguous',
    }
    return label_by_flags[(bool(is_appr), bool(is_rej))]

def get_appr_rej_sets(prefs):
    """Split a site's preferences into approved, rejected and contradictory sets.

    Args:
        prefs: DataFrame with 'domain', 'name' and 'consent' columns, where
            every 'consent' value must be True or False.

    Returns:
        Tuple ``(appr_set, rej_set, contra_set)`` of hashable
        {'domain', 'name'} records: those with consent True, those with
        consent False, and those present in both.

    Raises:
        AssertionError: if any row's consent is neither True nor False.
    """
    def to_cookie_set(rows):
        # Hashable wrappers allow the records to be stored in sets and intersected.
        return {hashabledict(rec) for rec in rows[['domain', 'name']].to_dict('records')}

    appr_set = to_cookie_set(prefs[prefs.consent == True])
    rej_set = to_cookie_set(prefs[prefs.consent == False])
    assert len(prefs[~prefs.consent.isin([True, False])]) == 0

    # A (domain, name) pair that is both approved and rejected is contradictory.
    contra_set = appr_set & rej_set
    return appr_set, rej_set, contra_set

def update_appr_rej_pref(comply_result, appr_pref, rej_pref):
    """Record the matched approved/rejected preferences on a result record.

    Writes four keys into ``comply_result`` in place: 'appr_pref_domain',
    'appr_pref_name', 'rej_pref_domain' and 'rej_pref_name'. A falsy
    preference (e.g. None) yields None for both of its keys.

    Args:
        comply_result: dict that is updated in place.
        appr_pref: matched approved preference (mapping with 'domain'/'name') or None.
        rej_pref: matched rejected preference (mapping with 'domain'/'name') or None.
    """
    for prefix, pref in (('appr_pref', appr_pref), ('rej_pref', rej_pref)):
        for field in ('domain', 'name'):
            value = pref[field] if pref else None
            comply_result[f'{prefix}_{field}'] = value
    
def _get_comply_for_site(site, prefs, sent_cookies):
    """Label every sent cookie of one site with its compliance type.

    Args:
        site: site the preferences and cookies belong to.
        prefs: DataFrame of the site's preferences (see ``get_appr_rej_sets``).
        sent_cookies: iterable of cookie dicts, each with a 'site' equal to ``site``.

    Returns:
        List with one dict per sent cookie: a copy of the cookie extended with
        the matched approved/rejected preference keys and a 'comply' label.

    Side effects:
        When the site has contradictory preferences they are stored in the
        module-level ``site_to_contras`` and, while that registry holds fewer
        than 20 sites, printed as a progress hint.
    """
    appr_set, rej_set, contra_set = get_appr_rej_sets(prefs)

    def classify(sent_cookie):
        # Look the cookie up among approved and rejected preferences separately.
        is_appr, appr_pref = check_in_set(site, sent_cookie, appr_set)
        is_rej, rej_pref = check_in_set(site, sent_cookie, rej_set)
        verdict = get_comply_type(is_appr, is_rej)

        record = sent_cookie.copy()
        update_appr_rej_pref(record, appr_pref, rej_pref)
        assert site == sent_cookie['site']
        record['comply'] = verdict
        return record

    comply_results = [classify(sent_cookie) for sent_cookie in sent_cookies]

    if contra_set:
        site_to_contras[site] = contra_set
        # Print some of the contra to see the progress only
        if len(site_to_contras) < 20:
            print(f'Contradictory set: {site=} {contra_set=}')
    return comply_results

def get_comply_for_sites(args, sites, parallel=False, n_workers=32):
    """Yield the per-site compliance results, one list per site.

    This is a generator: no work happens until it is iterated.

    Args:
        args: iterable of ``(site, prefs, sent_cookies)`` tuples, one per site,
            unpacked into ``_get_comply_for_site``.
        sites: sized collection of sites; only its length is used, as the
            progress-bar total in the sequential branch.
        parallel: when True, compute all sites in a process pool; otherwise
            run sequentially with a tqdm progress bar.
        n_workers: pool size for the parallel branch (default 32, the value
            that used to be hard-coded).

    Yields:
        The list returned by ``_get_comply_for_site`` for each site, in the
        order of ``args``.
    """
    if parallel:  # not work, maybe bottleneck is the transfer of a big data frame.
        # The context manager shuts the pool down once the blocking starmap has
        # returned; the pool was previously created and never closed.
        # NOTE(review): _get_comply_for_site records contradictions in a
        # module-level dict; updates made inside worker processes presumably do
        # not reach this process — confirm before relying on it in parallel mode.
        with Pool(n_workers) as pool:
            results = pool.starmap(_get_comply_for_site, args)
        yield from results
    else:
        for arg in tqdm(args, total=len(sites)):
            yield _get_comply_for_site(*arg)
            
def get_compute_args(sites, cookie_prefs, prj_sent_cookies):
    """Yield one ``(site, site_prefs, site_sent_cookies)`` work item per site.

    Each item carries only that site's rows rather than the full frames.

    Args:
        sites: iterable of site names.
        cookie_prefs: DataFrame of preferences with a 'site' column.
        prj_sent_cookies: DataFrame of sent cookies with a 'site' column.

    Yields:
        Tuple of the site, the preference rows of that site (DataFrame) and the
        sent cookies of that site as a list of dict records (empty when the
        site has no sent cookies).
    """
    for site in sites:
        prefs_of_site = cookie_prefs[cookie_prefs.site == site]
        sent_of_site = prj_sent_cookies[prj_sent_cookies.site == site]
        yield site, prefs_of_site, sent_of_site.to_dict('records')
            
def get_comply(cookie_prefs, prj_sent_cookies):
    """Build the compliance table for every site that has preferences.

    Args:
        cookie_prefs: DataFrame of preferences with a 'site' column.
        prj_sent_cookies: DataFrame of sent cookies with a 'site' column.

    Returns:
        DataFrame with one row per sent cookie, as produced by
        ``_get_comply_for_site`` (computed sequentially, site by site).
    """
    sites = cookie_prefs.site.unique()
    per_site_args = get_compute_args(sites, cookie_prefs, prj_sent_cookies)

    # Flatten the per-site result lists into one list of records.
    comply_results = [
        record
        for site_records in get_comply_for_sites(per_site_args, sites, parallel=False)
        for record in site_records
    ]
    return pd.DataFrame(comply_results)

# NOTE(review): the printed text appears to be a note-to-self about a possible speed-up
# of check_in_set (comparing only preferences with the cookie's name) — confirm and
# consider replacing it with a comment.
print("faster: for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']]:")
# Label every post-rejection sent cookie of the sampled sites.
all_complies = get_comply(s_cookie_prefs, prj_sent_cookies)
# print("Number of incor"
# Show the cookies labelled 'incorrect', i.e. those that matched a rejected
# preference and no approved one.
all_complies[all_complies.comply == 'incorrect']

faster: for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']]:


100%|██████████| 1/1 [00:00<00:00,  3.03it/s]


Unnamed: 0,domain,expires,name,path,sameSite,secure,value,request_url,site,appr_pref_domain,appr_pref_name,rej_pref_domain,rej_pref_name,comply
87,.facebook.com,1636636000.0,fr,/,,True,03iP1PTdTj8voPVm9..BhFmy3...1.0.BhFmy3.,https://www.facebook.com/x/oauth/status?client...,qobuz.com,,,facebook.com,fr,incorrect
216,.facebook.com,1636636000.0,fr,/,,True,03iP1PTdTj8voPVm9..BhFmy3...1.0.BhFmy3.,https://www.facebook.com/tr/?id=103352876679&e...,qobuz.com,,,facebook.com,fr,incorrect
430,.doubleclick.net,1691932000.0,IDE,/,,True,AHWqTUnUP2aXIJDylDCdEZ4S-km-U_5vD6hVqn3D4PBWy-...,https://cm.g.doubleclick.net/pixel?google_nid=...,qobuz.com,,,doubleclick.net,IDE,incorrect
527,.adnxs.com,1636636000.0,anj,/,,True,dTM7k!M4/8D>6NRF']wIg2Ilkj+qOP!fss0=Ir4A3KL9D3...,https://secure.adnxs.com/getuid?https%3A%2F%2F...,qobuz.com,,,adnxs.com,anj,incorrect
693,.adnxs.com,1636636000.0,uuid2,/,,True,6462288379355314650,https://secure.adnxs.com/getuid?https%3A%2F%2F...,qobuz.com,,,adnxs.com,uuid2,incorrect
705,.addthis.com,1663074000.0,loc,/,,True,NDgxMDlOQVVTTUkyMjIzMDk2MzUwNTAwMDBDSA==,https://m.addthis.com/live/red_lojson/300lo.js...,qobuz.com,,,addthis.com,loc,incorrect
849,.addthis.com,1663074000.0,loc,/,,True,NDgxMDlOQVVTTUkyMjIzMDk2MzUwNTAwMDBDSA==,https://s7.addthis.com/static/sh.f48a1a04fe8db...,qobuz.com,,,addthis.com,loc,incorrect
1276,.addthis.com,1663074000.0,loc,/,,True,NDgxMDlOQVVTTUkyMjIzMDk2MzUwNTAwMDBDSA==,https://s7.addthis.com/js/250/addthis_widget.js,qobuz.com,,,addthis.com,loc,incorrect
3138,.facebook.com,1636636000.0,fr,/,,True,03iP1PTdTj8voPVm9..BhFmy3...1.0.BhFmy3.,https://www.facebook.com/x/oauth/status?client...,qobuz.com,,,facebook.com,fr,incorrect
4229,.facebook.com,1636636000.0,fr,/,,True,03iP1PTdTj8voPVm9..BhFmy3...1.0.BhFmy3.,https://www.facebook.com/tr/?id=103352876679&e...,qobuz.com,,,facebook.com,fr,incorrect


In [None]:
all_complies

Unnamed: 0,domain,expires,name,path,sameSite,secure,value,request_url,site,comply
0,.sevillafc.es,1.629454e+09,OPTAW_gaCookie_gid,/,,False,GA1.2.538998216.1629368093,https://www.sevillafc.es/sites/default/files/2...,sevillafc.es,comply
1,.sevillafc.es,1.629454e+09,OPTAW_gaCookie_gid,/,,False,GA1.2.538998216.1629368093,https://www.sevillafc.es/sites/default/files/2...,sevillafc.es,comply
2,.sevillafc.es,1.692440e+09,OPTAW_gaCookie,/,,False,GA1.2.108583041.1629368093,https://www.sevillafc.es/modules/contrib/langu...,sevillafc.es,comply
3,www.sevillafc.es,1.660904e+09,CookieConsent,/,,True,{stamp:%27Y+KkYa7Nt0YMnM5oY2RPCm37IdVclo27X47k...,https://www.sevillafc.es/sites/default/files/j...,sevillafc.es,comply
4,.sevillafc.es,1.629454e+09,OPTAW_gaCookie_gid,/,,False,GA1.2.538998216.1629368093,https://www.sevillafc.es/sites/default/files/2...,sevillafc.es,comply
...,...,...,...,...,...,...,...,...,...,...
1070,.sevillafc.es,1.692440e+09,OPTAW_gaCookie,/,,False,GA1.2.108583041.1629368093,https://www.sevillafc.es/themes/da_vinci/image...,sevillafc.es,comply
1071,.sevillafc.es,1.629455e+09,OPTAW_gaCookie_gid,/,,False,GA1.2.538998216.1629368093,https://login.id.sevillafc.es/login/monitoring...,sevillafc.es,comply
1072,.sevillafc.es,1.629455e+09,OPTAW_gaCookie_gid,/,,False,GA1.2.538998216.1629368093,https://login.id.sevillafc.es/login/monitoring...,sevillafc.es,comply
1073,.sevillafc.es,1.629455e+09,OPTAW_gaCookie_gid,/,,False,GA1.2.538998216.1629368093,https://www.sevillafc.es/modules/contrib/langu...,sevillafc.es,comply
