In [1]:
from multiprocessing import Pool
from pathlib import Path

from tqdm import tqdm
import pandas as pd
from p_tqdm import p_umap

from consent.consistency.cookie_pref_match import cookie_pref_match
from consent.consistency.util import FIG_DIR, get_scan_dirs, get_scan_root_dir
from consent.data.pref_menu_scan.cookie_pref_reader import read_cookie_prefs_in_scans
from consent.data.pref_menu_scan.log_reader import read_logs_in_scans
from consent.data.pref_menu_scan.cookie_decl_reader import read_cookie_decls_in_scans
from consent.data.pref_menu_scan.postrej_cookie_reader import read_postrej_sent_cookies_in_scans
from consent.data.pref_menu_scan.cat_pref_reader import read_cat_prefs_in_dirs
from consent.util.default_path import get_data_dir
from ooutil.type_util import hashabledict
from ooutil.url_util import get_suffixed_domain

# scan_root_dir = get_data_dir('2021-08-12')
# scan_dirs = [scan_root_dir / 'pref_menu_scan_0k_10k',
#              scan_root_dir / 'pref_menu_scan_10k_20k',
#              scan_root_dir / 'pref_menu_scan_20k_30k']
# assert all(scan_dir.exists() for scan_dir in scan_dirs)
# Scan selector: the same key picks both the scan directories and their root.
location = 'de'
SCAN_DIRS = get_scan_dirs(location)
SCAN_ROOT_DIR = get_scan_root_dir(location)
# NOTE(review): hard-coded path under the current user's home directory; the
# assert below fails on any machine without this exact checkout. FIG_DIR is
# imported from consent.consistency.util above but never used — presumably the
# intended replacement; confirm before switching.
fig_dir = Path.home() / 'local_projects/consent/paper/sp22/figures'
assert fig_dir.exists() and fig_dir.is_dir()

In [2]:
# Read the cookie preferences found in all scan directories.
# The preview below shows one row per cookie listed for a site.
raw_cookie_prefs = read_cookie_prefs_in_scans(SCAN_DIRS)
raw_cookie_prefs.head()



Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
0,user,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
1,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
2,ajs_anonymous_id,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
3,_gclxxxx,wework.com,90 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
4,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True


In [3]:
# Load the cookies sent after rejection, preferring the parquet cache.
# To pick up edits to the reader module during development:
# import sys; import importlib; importlib.reload(sys.modules['consent.data.pref_menu_scan.postrej_cookie_reader'])
from consent.data.pref_menu_scan.har_cookie_reader import read_postrej_sent_cookies_in_scans


overwrite = False
cookies_cache_file = SCAN_ROOT_DIR / 'scan.parquet'

# Re-read the raw scans when a refresh is forced or no cache file is usable;
# otherwise load the cached parquet file.
if overwrite or not cookies_cache_file or not cookies_cache_file.exists():
    sent_cookies = read_postrej_sent_cookies_in_scans(SCAN_DIRS)
    if cookies_cache_file:
        sent_cookies.to_parquet(cookies_cache_file)
        print(f"Written to {cookies_cache_file}")
else:
    sent_cookies = pd.read_parquet(cookies_cache_file)

print(f"Number sent cookies read: {len(sent_cookies):,d}")
sent_cookies.head(3)

Number sent cookies read: 13,055,212


Unnamed: 0,name,value,domain,path,expires,size,httpOnly,secure,session,sameSite,priority,sameParty,sourceScheme,sourcePort,request_url,site,page_url
0,ajs_anonymous_id,8b883f8e-dd49-4ffa-bc01-56e536e0e19c,.wework.com,/,1713082000.0,52,False,False,False,Lax,Medium,False,Secure,443,https://www.wework.com/,wework.com,https://www.wework.com/l/commercial-real-estat...
1,ajs_anonymous_id,8b883f8e-dd49-4ffa-bc01-56e536e0e19c,.wework.com,/,1713082000.0,52,False,False,False,Lax,Medium,False,Secure,443,https://www-static.wework.com/apercu/apercu.css,wework.com,https://www.wework.com/l/commercial-real-estat...
2,we_referring_domain,,www.wework.com,/,1682755000.0,19,False,False,False,,Medium,False,Secure,443,https://www.wework.com/l/commercial-real-estat...,wework.com,https://www.wework.com/l/commercial-real-estat...


In [4]:
# Drop exact duplicate preference rows, then report (a) how many rows were
# duplicates and (b) how many sites appear in the sent cookies but have no
# cookie preferences at all.
cookie_prefs = raw_cookie_prefs.drop_duplicates()
print(f'There are {len(cookie_prefs):,d} unique_cookie_prefs ({len(raw_cookie_prefs):,d} preferences with duplicates)')
print(f"{len(set(sent_cookies.site) - set(cookie_prefs.site))} sites in sent cookies but do not have preferences.")

There are 678,040 unique_cookie_prefs (761,943 preferences with duplicates)
126 sites in sent cookies but do not have preferences.


In [5]:
# Sample a subset of sites; from this point on, use s_cookie_prefs.
# The sample is drawn over preference ROWS and the sites of the drawn rows are
# kept, so n_samples = len(cookie_prefs) selects every site (full run).
n_samples = len(cookie_prefs)  # lower (e.g. 100) for a quick partial run
sample_sites = cookie_prefs.sample(n_samples, random_state=1024).site.unique()
n_sites = cookie_prefs.site.nunique()
print(f"Num sample sites: {len(sample_sites):,d} {len(sample_sites) / n_sites*100:.2f}% of all {n_sites} sites.")

# All preference rows of the sampled sites (cookie_prefs is already de-duplicated above).
s_cookie_prefs = cookie_prefs[cookie_prefs.site.isin(sample_sites)].drop_duplicates()

Num sample sites: 4,785 100.00% of all 4785 sites.


In [6]:
# Map and detect ambiguity. prj = post-rejection; from this point on, use
# prj_sent_cookies and prj_br_cookies.
#   s_sent_cookies   - sent-cookie rows restricted to the sampled sites
#   prj_sent_cookies - unique rows over the cookie attributes plus request_url
#   prj_br_cookies   - unique browser cookies: the same rows with `value` and
#                      `request_url` dropped before de-duplicating
s_sent_cookies = sent_cookies[sent_cookies.site.isin(sample_sites)]
prj_sent_cookies = s_sent_cookies[['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'value', 'request_url', 'site']].drop_duplicates()
prj_br_cookies = prj_sent_cookies[['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'site']].drop_duplicates()

In [7]:
# Report dataset sizes at each de-duplication stage, plus site/page coverage.
print(f"Num captured sent cookies: {len(s_sent_cookies):,d}")
print(f"Num unique captured cookies: {len(prj_sent_cookies):,d}")
n_br_cookies = len(prj_br_cookies)
print(f"Num unique browser cookies: {n_br_cookies:,d} on {prj_br_cookies.site.nunique():,d} websites and {s_sent_cookies.page_url.nunique():,d} pages")

Num captured sent cookies: 12,730,436
Num unique captured cookies: 7,262,077
Num unique browser cookies: 242,007 on 4,760 websites and 26,391 pages


# Find number of cookie preferences that match a browser cookie.

In [8]:
# Select the strategy with the lowest ambiguity score.

import sys; import importlib; importlib.reload(sys.modules['consent.consistency.cookie_pref_match'])
from consent.consistency.cookie_pref_match import cookie_pref_match

# Keep only the preference columns relevant for matching and drop rows that
# become identical once the other columns are removed.
prefs = s_cookie_prefs[['name', 'domain', 'category_id', 'category', 'consent_mode', 'site']].drop_duplicates()
print(f"Num browser cookies: {len(prj_br_cookies):,d}, Num preferences: {len(prefs):,d}")

Num browser cookies: 242,007, Num preferences: 676,087


In [9]:
# How the number of unique cookies changes as identifying properties are
# removed from the key (most properties first, fewest last).
print(len(prj_sent_cookies[['name', 'domain', 'path', 'sameSite', 'secure', 'site', 'expires']].drop_duplicates()))
print(len(prj_sent_cookies[['name', 'domain', 'path', 'sameSite', 'secure', 'site']].drop_duplicates()))
print(len(prj_sent_cookies[['name', 'domain', 'path', 'site']].drop_duplicates()))
print(len(prj_sent_cookies[['name', 'domain', 'site']].drop_duplicates()))

242007
103157
102011
101581


In [10]:
# Reduce both frames to the columns used for matching and de-duplicate again.
# NOTE: this rebinds `prefs` and `prj_br_cookies` to narrower frames, so
# earlier cells that print their sizes report different numbers if re-run
# after this cell.
prefs = prefs[['name', 'domain', 'site']].drop_duplicates()
prj_br_cookies = prj_br_cookies[['name', 'domain', 'path', 'site']].drop_duplicates()
print(f"After reduction and dropping duplicates, num browser cookies: {len(prj_br_cookies):,d}, Num preferences: {len(prefs):,d}")

After reduction and dropping duplicates, num browser cookies: 102,011, Num preferences: 672,848


In [11]:
# Sites to process: every site that has at least one preference row.
sites = prefs.site.unique()
print(f"Number of sites: {len(sites):,d}")

Number of sites: 4,785


In [12]:
def get_cookie_pref_pairs(sites):
    """Yield one ``(site_prefs, site_cookies)`` pair of DataFrames per site.

    Reads the notebook-level frames ``prefs`` and ``prj_br_cookies`` and splits
    each by its ``site`` column. Both frames are grouped once up front instead
    of being re-scanned with a boolean mask for every site, so the work no
    longer grows with ``len(sites) * len(frame)``.

    Args:
        sites: iterable of site names, in the order the pairs are yielded.

    Yields:
        Tuples ``(site_prefs, site_cookies)``. Each frame is an independent
        copy, so callers may add columns to it. A site with no rows in a frame
        gets an empty frame carrying that frame's columns.
    """
    # groupby keeps the original row order (and index) inside every group, so
    # each group equals the corresponding `frame[frame.site == site]` slice.
    prefs_by_site = dict(iter(prefs.groupby('site', sort=False)))
    cookies_by_site = dict(iter(prj_br_cookies.groupby('site', sort=False)))
    # Zero-row slices used for sites that are missing from a frame.
    no_prefs = prefs.iloc[0:0]
    no_cookies = prj_br_cookies.iloc[0:0]
    for site in sites:
        yield (prefs_by_site.get(site, no_prefs).copy(),
               cookies_by_site.get(site, no_cookies).copy())

pref_cookie_pairs = list(get_cookie_pref_pairs(sites))

# Get pref->cookies

In [13]:

def get_n_cookie_matches(pref, cookies):
    """Count the distinct (domain, name) cookies that match one preference.

    Only cookies with the same ``site`` and exactly the same ``name`` as
    ``pref`` are considered; each distinct (domain, name) candidate is then
    passed to ``cookie_pref_match`` and counted when it is accepted.

    Args:
        pref: mapping-like preference row with ``site`` and ``name`` entries.
        cookies: DataFrame with ``site``, ``name`` and ``domain`` columns.

    Returns:
        Number of candidates accepted by ``cookie_pref_match``.
    """
    same_site = cookies.site == pref['site']
    same_name = cookies.name == pref['name']
    candidates = cookies[same_site & same_name][['domain', 'name']].drop_duplicates()
    return sum(
        1
        for cookie in candidates.to_dict('records')
        if cookie_pref_match(cookie, pref, pref['site'])
    )

def get_site_cookie_matches(site_prefs, site_cookies):
    """Annotate one site's preferences with their cookie-match counts.

    Adds an ``n_cookie_matches`` column to ``site_prefs`` IN PLACE, holding
    ``get_n_cookie_matches(row, site_cookies)`` for every row, and returns the
    same frame.
    """
    def count_for(pref_row):
        return get_n_cookie_matches(pref_row, site_cookies)

    site_prefs['n_cookie_matches'] = site_prefs.apply(count_for, axis=1)
    return site_prefs

def run_get_n_cookie_matches(pref_cookie_pairs, num_cpus=32):
    """Compute preference -> cookie match counts for every site in parallel.

    Args:
        pref_cookie_pairs: sequence of ``(site_prefs, site_cookies)`` frames,
            one pair per site.
        num_cpus: worker count handed to ``p_umap``. Defaults to 32, the value
            that was previously hard-coded. Author's timing notes: 8-16 could
            not be interrupted, 48 took 122s, 32-48 chosen because the work is
            not IO-blocking.

    Returns:
        One DataFrame concatenating the per-site results. ``p_umap`` is the
        unordered map, so the per-site chunks are not in input order.
    """
    # zip(*pairs) transposes the pairs into two parallel argument sequences.
    results = p_umap(get_site_cookie_matches, *zip(*pref_cookie_pairs), num_cpus=num_cpus)
    return pd.concat(results)


match_prefs = run_get_n_cookie_matches(pref_cookie_pairs)  # 21min with 1 core, 1 min with 8 cores

  0%|          | 0/4785 [00:00<?, ?it/s]

Error fuzzy name match pref_name='csrf[frontend.store-api.proxy]' cookie_name='csrf[frontend.store-api.proxy]' bad character range e-a at position 22
Error fuzzy name match pref_name='csrf[frontend.store-api.proxy]' cookie_name='csrf[frontend.store-api.proxy]' bad character range e-a at position 22
Error fuzzy name match pref_name='csrf[moorl.sign-in.get-url]' cookie_name='csrf[moorl.sign-in.get-url]' bad character range n-i at position 18
Error fuzzy name match pref_name='csrf[moorl.sign-in.get-url]' cookie_name='csrf[moorl.sign-in.get-url]' bad character range n-i at position 18


In [14]:
# Tabulate how many preferences matched 0, 1, 2, ... cookies and emit the
# table as LaTeX.
print("Distribution of preference -> n cookie matches:")
vc = match_prefs.n_cookie_matches.value_counts()
print(vc.to_latex())

Distribution of preference -> n cookie matches:
\begin{tabular}{lr}
\toprule
{} &  n\_cookie\_matches \\
\midrule
0 &            614170 \\
1 &             58333 \\
2 &               336 \\
3 &                 8 \\
4 &                 1 \\
\bottomrule
\end{tabular}



  print(vc.to_latex())


In [15]:
# Summarise the preference -> cookie distribution held in `vc`
# (index: number of matched cookies, value: number of preferences).
n_zero = vc[vc.index == 0].sum()
n_nempt = vc[vc.index > 0].sum()
n_multi = vc[vc.index > 1].sum()
n_uniq = vc.get(1, 0)  # .get: no KeyError when no preference has exactly one match
total = vc.sum()
print(f"Number of missing mapping (1 decl -> 0 cookies): {n_zero/total*100:.2f}%")
# Label fixed: this line reports declarations that map to more than one cookie.
print(f"Number of ambiguous mapping (1 decl -> multiple cookies): {n_multi=:,d} {n_uniq=:,d} {n_nempt=:,d} {total=:,d}: {n_multi/total*100:.2f}%")
# Label fixed: the rows counted here are declarations (preferences), not cookies.
print(f"Total declarations with mapping: {n_nempt:,d}")

Number of missing mapping (1 decl -> 0 cookies): 91.28%
Number of ambiguous mapping (1 decl -> cookie declarations): n_multi=345 n_uniq=58,333 n_nempt=58,678 total=672,848: 0.05%
Total cookies with mapping: 58,678


# Get cookie->pref

In [16]:
# Dask single machine 57 sec, 35 min single core, 6min min on 32 cores, progress bar not show progressively

# from dask.distributed import Client
# Client()
# prj_br_cookies_ddf = dd.from_pandas(prj_br_cookies, npartitions=32)
# x = prj_br_cookies_ddf.apply(lambda row: get_n_pref_matches(row, prefs), axis=1, meta=('n_matches', 'int64'))
# prj_br_cookies['n_pref_matches'] = x.compute() # convert to final result, instantly

def get_n_pref_matches(cookie, cookie_prefs):
    """Count the distinct (domain, name) preferences that match one cookie.

    Only preferences with the same ``site`` and exactly the same ``name`` as
    ``cookie`` are considered; each distinct (domain, name) candidate is then
    passed to ``cookie_pref_match`` and counted when it is accepted.

    Args:
        cookie: mapping-like cookie row with ``site`` and ``name`` entries.
        cookie_prefs: DataFrame with ``site``, ``name`` and ``domain`` columns.

    Returns:
        Number of candidates accepted by ``cookie_pref_match``.
    """
    same_site = cookie_prefs.site == cookie['site']
    same_name = cookie_prefs.name == cookie['name']
    candidates = cookie_prefs[same_site & same_name][['domain', 'name']].drop_duplicates()
    return sum(
        1
        for pref in candidates.to_dict('records')
        if cookie_pref_match(cookie, pref, cookie['site'])
    )

def get_site_pref_matches(site_prefs, site_cookies):
    """Annotate one site's cookies with their preference-match counts.

    Returns ``None`` when the site has no cookies. Otherwise adds an
    ``n_pref_matches`` column to ``site_cookies`` IN PLACE, holding
    ``get_n_pref_matches(row, site_prefs)`` for every row, and returns the
    same frame.
    """
    if not len(site_cookies):
        return None

    def count_for(cookie_row):
        return get_n_pref_matches(cookie_row, site_prefs)

    site_cookies['n_pref_matches'] = site_cookies.apply(count_for, axis=1)
    return site_cookies

def run_get_n_pref_matches(pref_cookie_pairs, num_cpus=16):
    """Compute cookie -> preference match counts for every site in parallel.

    Args:
        pref_cookie_pairs: sequence of ``(site_prefs, site_cookies)`` frames,
            one pair per site.
        num_cpus: worker count handed to ``p_umap``. Defaults to 16, the value
            that was previously hard-coded.

    Returns:
        One DataFrame concatenating the per-site results. Sites without
        cookies produce ``None`` and are skipped.
    """
    # Serial equivalent, useful for debugging a single site:
    # for site_prefs, site_cookies in pref_cookie_pairs: get_site_pref_matches(site_prefs, site_cookies)
    results = p_umap(get_site_pref_matches, *zip(*pref_cookie_pairs), num_cpus=num_cpus)
    results = [r for r in results if r is not None]
    return pd.concat(results)

match_cookies = run_get_n_pref_matches(pref_cookie_pairs)

  0%|          | 0/4785 [00:00<?, ?it/s]

Error fuzzy name match pref_name='csrf[moorl.sign-in.get-url]' cookie_name='csrf[moorl.sign-in.get-url]'Error fuzzy name match pref_name='csrf[moorl.sign-in.get-url]' cookie_name='csrf[moorl.sign-in.get-url]' bad character range n-i at position 18
 bad character range n-i at position 18
Error fuzzy name match pref_name='csrf[frontend.store-api.proxy]' cookie_name='csrf[frontend.store-api.proxy]' bad character range e-a at position 22
Error fuzzy name match pref_name='csrf[frontend.store-api.proxy]' cookie_name='csrf[frontend.store-api.proxy]' bad character range e-a at position 22


In [17]:
# Tabulate how many cookies matched 0, 1, 2, ... preferences and emit the
# table as LaTeX. NOTE: rebinds `vc`, which earlier held the
# preference -> cookie distribution.
vc = match_cookies.n_pref_matches.value_counts()
print(vc.to_latex())

\begin{tabular}{lr}
\toprule
{} &  n\_pref\_matches \\
\midrule
1 &           55929 \\
0 &           44417 \\
2 &            1663 \\
3 &               2 \\
\bottomrule
\end{tabular}



  print(vc.to_latex())


In [18]:
# Summarise the cookie -> preference distribution held in `vc`
# (index: number of matched preferences, value: number of cookies).
n_zeros = vc[vc.index == 0].sum()
n_nempt = vc[vc.index > 0].sum()
n_multi = vc[vc.index > 1].sum()
n_uniq = vc.get(1, 0)  # .get: no KeyError when no cookie has exactly one match
total = vc.sum()
# Label fixed: this direction is cookie -> preferences, and the value is a percentage.
print(f"Number of missing mapping (1 cookie -> 0 preferences): {n_zeros/total*100:.2f}%")
print(f"Number of ambiguous mapping (1 cookie -> multiple preferences): {n_multi=:,d} {n_uniq=:,d} {n_nempt=:,d} {total=:,d}: {n_multi/total*100:.2f}%")
print(f"Total cookies with mapping: {n_nempt:,d}")

Number of missing mapping (decl -> 0 cookie): 43.54
Number of ambiguous mapping (1 cookie -> multiple preferences): n_multi=1,665 n_uniq=55,929 n_nempt=57,594 total=102,011: 1.63
Total cookies with mapping: 57,594


In [None]:
# Deliberate stop: raises SystemExit so a "Run All" ends here. The cells below
# are case studies and an older compliance pass, to be run manually.
import sys; sys.exit(0)

# Case study of 1 cookie -> multiple preferences

In [21]:
# Preview cookies that matched more than one preference.
match_cookies[match_cookies.n_pref_matches > 1].head(5)

Unnamed: 0,name,domain,path,site,n_pref_matches
316,RT,.exxonmobil.com,/,exxonmobil.com,2
87,fr,.facebook.com,/,hindawi.com,2
113,MUID,.bing.com,/,hindawi.com,2
6,__cflb,www.rentalcars.com,/,rentalcars.com,2
1980,IDE,.doubleclick.net,/,bankrate.com,2


In [22]:
# For every cookie that matched exactly two preferences, compare the two
# preference domains (lower-cased, shorter one first).
n_dot_prefix_domains = 0
n_same_suffix_domains = 0
for mcookie in tqdm(match_cookies[match_cookies.n_pref_matches == 2].to_dict('records')):
    # Candidates share name and site; keep those that cookie_pref_match accepts.
    mprefs = prefs[(prefs.name == mcookie['name']) & (prefs.site == mcookie['site'])]
    mprefs = mprefs[mprefs.apply(lambda row: cookie_pref_match(mcookie, row, mcookie['site']), axis=1)]
    pr_domains = [d.lower() for d in mprefs.domain.to_list()]
    pr_domains = sorted(pr_domains, key=len)
    # Assumes two domains survive the filter (n_pref_matches == 2); with fewer,
    # pr_domains[1] raises IndexError.
    if pr_domains[1] == '.' + pr_domains[0]:
        n_dot_prefix_domains += 1
    # Not exclusive with the check above: a dot-prefixed domain also ends with
    # the shorter one, so this counter includes those cases.
    if pr_domains[1].endswith(pr_domains[0]):
        n_same_suffix_domains += 1
print(f"Number of cases that domains differ only by the prefix dot: {n_dot_prefix_domains:,d}")
print(f"Number of cases that domains are suffixes: {n_same_suffix_domains:,d}")

100%|██████████| 888/888 [00:25<00:00, 35.38it/s]

Number of cases that domains differ only by the prefix dot: 445
Number of cases that domains are suffixes: 888





In [23]:
# Case study: take one cookie that matched two preferences (row 6 of that
# subset) and list every preference with the same name on the same site.
# NOTE: rebinds `mcookie`, which the loop in the previous cell also uses.
mcookie  = match_cookies[match_cookies.n_pref_matches == 2].iloc[6]
prefs[(prefs.name == mcookie['name']) & (prefs.site == mcookie['site'])]

Unnamed: 0,name,domain,site
37,IDE,doubleclick.net,splunkcloud.com
57,IDE,.doubleclick.net,splunkcloud.com


In [24]:
# How many cookies that have same both domain and name?
# print("Num cookies preferences:", len(s_cookie_prefs))
# s_cookies = s_cookie_prefs[['domain', 'name', 'category', 'site']].drop_duplicates()
# print("Num unique cookie preferences:", len(s_cookies))

In [25]:
# plain pandas, 27-29 secs for sample(100)
# prj_br_cookies['n_matches'] = prj_br_cookies.apply(lambda row: get_n_matches(row, prefs), axis=1)

In [26]:
# s_cookie_prefs

# Analyze duplicate cookies

In [27]:
# s_cookies.value_counts()[:5]

In [28]:
# Case studies
# s_cookie_prefs[(s_cookie_prefs.domain == 'mc.yandex.ru') & (s_cookie_prefs.name == '_ym_uid')]
# s_cookie_prefs[(s_cookie_prefs.domain == 'bat.bing.com') & (s_cookie_prefs.name == '_uetsid')]
# s_cookie_prefs[(s_cookie_prefs.domain == 'start.stepchange.org') & (s_cookie_prefs.name == '_uetvid')]

In [29]:
def get_n_cookie_matches(cookies, pref):
    """Count the distinct (name, domain) cookies in ``cookies`` that match ``pref``.

    Every row of ``cookies`` is tested with ``cookie_pref_match`` against
    ``pref`` (using the cookie's own ``site``); accepted cookies are reduced
    to their name and domain before counting, so repeats count once.

    NOTE(review): this redefines ``get_n_cookie_matches`` with the parameters
    in the opposite order from the earlier ``(pref, cookies)`` definition in
    this notebook. Once this cell has run, ``get_site_cookie_matches`` calls
    this version with its arguments swapped. Consider renaming one of the two.
    """
    matched_cookies = {
        hashabledict({'name': cookie['name'], 'domain': cookie['domain']})
        for cookie in cookies.to_dict('records')
        if cookie_pref_match(cookie, pref, cookie['site'])
    }
    return len(matched_cookies)

# s_cookie_prefs2 = s_cookie_prefs.copy()
# s_cookie_prefs2['n_matches'] = s_cookie_prefs2.apply(lambda pref: get_n_cookie_matches(prj_sent_cookies, pref), axis=1)
# s_cookie_prefs2.sort_values(by='n_matches', ascending=False)

In [30]:
# s_cookie_prefs2.n_matches.value_counts()

In [31]:
# prj_sent_cookies.head()

In [32]:
# s_cookie_prefs

In [33]:
# s_cookie_prefs = s_cookie_prefs[s_cookie_prefs.name == 'TAUnique']
# s_cookie_prefs 

In [34]:
# print('Num websites:', all_complies.site.nunique())
# all_complies.comply.unique()

In [35]:
# s_cookie_prefs

In [36]:
# Map intercepted cookies to browser cookies.
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.cookie_pref_match'])
from consent.consistency.cookie_pref_match import cookie_pref_match

# site -> set of preferences that are both approved and rejected on that site;
# filled in as a side effect of _get_comply_for_site.
site_to_contras = {}  # TODO: make this non-global.

def check_in_set(site, acookie, cookie_pref_set, verbose=0):
    """Find the first preference in ``cookie_pref_set`` that matches ``acookie``.

    Args:
        site: site name forwarded to ``cookie_pref_match``.
        acookie: the cookie to look up.
        cookie_pref_set: iterable of preferences to test, in iteration order.
        verbose: 3 or more prints every comparison; 2 prints only comparisons
            where the preference and cookie names are equal; lower prints
            nothing.

    Returns:
        ``(True, matching_pref)`` for the first match, else ``(False, None)``.
    """
    for cookie_pref in cookie_pref_set:
        show = verbose >= 3 or (
            verbose >= 2 and cookie_pref['name'] == acookie['name']
        )
        if show:
            print(f'{cookie_pref=} {acookie=}')

        if cookie_pref_match(acookie, cookie_pref, site):
            return True, cookie_pref
    return False, None

def get_comply_type(is_appr, is_rej):
    """Label a sent cookie from its approved/rejected preference matches.

    Returns:
        ``'comply'`` when only an approved preference matched, ``'incorrect'``
        when only a rejected one matched, ``'omit'`` when neither matched, and
        ``'ambiguous'`` when both matched.
    """
    outcomes = {
        (True, False): 'comply',
        (False, True): 'incorrect',
        (False, False): 'omit',
        (True, True): 'ambiguous',
    }
    # bool() mirrors plain truthiness tests on the two flags.
    return outcomes[(bool(is_appr), bool(is_rej))]

def get_appr_rej_sets(prefs):
    """Split preferences into approved, rejected and contradictory cookie sets.

    Args:
        prefs: DataFrame with ``domain``, ``name`` and ``consent`` columns;
            every ``consent`` value must be True or False (asserted).

    Returns:
        ``(appr_set, rej_set, contra_set)``: hashable ``{domain, name}``
        records with consent True, with consent False, and those present in
        both sets.
    """
    def as_cookie_set(rows):
        records = rows[['domain', 'name']].to_dict('records')
        return {hashabledict(record) for record in records}

    # `== True` / `== False` are element-wise pandas comparisons, not identity tests.
    approved = as_cookie_set(prefs[prefs.consent == True])
    rejected = as_cookie_set(prefs[prefs.consent == False])
    undecided = prefs[~prefs.consent.isin([True, False])]
    assert len(undecided) == 0

    return approved, rejected, approved & rejected

def update_appr_rej_pref(comply_result, appr_pref, rej_pref):
    """Record the matched approved/rejected preferences on ``comply_result``.

    Writes ``appr_pref_domain``, ``appr_pref_name``, ``rej_pref_domain`` and
    ``rej_pref_name`` into ``comply_result`` in place. A falsy preference
    (e.g. ``None``) yields ``None`` for both of its fields.
    """
    for prefix, pref in (('appr_pref', appr_pref), ('rej_pref', rej_pref)):
        comply_result[prefix + '_domain'] = pref['domain'] if pref else None
        comply_result[prefix + '_name'] = pref['name'] if pref else None
    
def _get_comply_for_site(site, prefs, sent_cookies):
    """Classify every sent cookie of one site against that site's preferences.

    Args:
        site: site name; every sent cookie must carry the same ``site``
            (asserted).
        prefs: the site's preference rows, passed to ``get_appr_rej_sets``.
        sent_cookies: iterable of dict-like sent cookies.

    Returns:
        A list with one copy of each sent cookie, extended with the matched
        approved/rejected preference fields and a ``comply`` label.

    Side effects:
        When the site has contradictory preferences they are stored in the
        module-level ``site_to_contras``; while that dict holds fewer than 20
        sites, the set is also printed.
    """
    appr_set, rej_set, contra_set = get_appr_rej_sets(prefs)

    def classify(sent_cookie):
        is_appr, appr_pref = check_in_set(site, sent_cookie, appr_set)
        is_rej, rej_pref = check_in_set(site, sent_cookie, rej_set)
        comply = get_comply_type(is_appr, is_rej)
        row = sent_cookie.copy()
        update_appr_rej_pref(row, appr_pref, rej_pref)
        assert site == sent_cookie['site']
        row['comply'] = comply
        return row

    comply_results = [classify(sent_cookie) for sent_cookie in sent_cookies]

    if contra_set:
        site_to_contras[site] = contra_set
        # Print only the first few contradictions, as a progress indicator.
        if len(site_to_contras) < 20:
            print(f'Contradictory set: {site=} {contra_set=}')
    return comply_results

def get_comply_for_sites(args, sites, parallel=False):
    """Yield the per-site compliance results, one list per site.

    Args:
        args: iterable of ``(site, prefs, sent_cookies)`` tuples, one per site.
        sites: the sites being processed; only its length is used, as the
            progress-bar total in the serial path.
        parallel: run the sites in a 32-process pool instead of serially.

    Yields:
        The list returned by ``_get_comply_for_site`` for each site.
    """
    if parallel:  # not work, maybe bottleneck is the transfer of a big data frame.
        # NOTE: _get_comply_for_site records contradictions in the module-level
        # site_to_contras; with worker processes those writes happen in the
        # workers and are not visible in this process.
        # The context manager shuts the pool down once starmap has returned
        # all results; previously the pool was never closed.
        with Pool(32) as pool:
            results = pool.starmap(_get_comply_for_site, args)
        yield from results
    else:
        for arg in tqdm(args, total=len(sites)):
            yield _get_comply_for_site(*arg)
            
def get_compute_args(sites, cookie_prefs, prj_sent_cookies):
    """Yield the ``(site, prefs, sent_cookies)`` work item for each site.

    Each item carries only that site's rows: the preferences as a DataFrame
    slice and the sent cookies as a list of record dicts. That avoids handing
    the full frames to every call.
    """
    for site in sites:
        prefs_mask = cookie_prefs.site == site
        sent_mask = prj_sent_cookies.site == site
        yield (
            site,
            cookie_prefs[prefs_mask],
            prj_sent_cookies[sent_mask].to_dict('records'),
        )
            
def get_comply(cookie_prefs, prj_sent_cookies):
    """Build the compliance table for every site in ``cookie_prefs``.

    Runs the per-site classification serially and flattens the per-site lists
    into one DataFrame with one row per sent cookie.
    """
    sites = cookie_prefs.site.unique()
    # To debug a few sites only: sites = ['suse.com', 'ulta.com', 'optimizely.com', 'cell.com']
    per_site_args = get_compute_args(sites, cookie_prefs, prj_sent_cookies)

    rows = [
        row
        for site_rows in get_comply_for_sites(per_site_args, sites, parallel=False)
        for row in site_rows
    ]
    return pd.DataFrame(rows)

# Printed reminder of a possible speed-up: only test the preferences whose
# name equals the cookie's name.
print("faster: for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']]:")
# Classify every post-rejection sent cookie (serial run; see the progress bar
# in the output for the duration).
all_complies = get_comply(s_cookie_prefs, prj_sent_cookies)
# Show the cookies that matched only a rejected preference.
all_complies[all_complies.comply == 'incorrect']

faster: for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']]:


  0%|          | 4/1623 [00:01<09:28,  2.85it/s]

Contradictory set: site='genial.ly' contra_set={{'domain': 'app.genial.ly', 'name': '__stripe_sid'}, {'domain': 'app.genial.ly', 'name': '__stripe_mid'}}


  0%|          | 6/1623 [00:02<15:41,  1.72it/s]

Contradictory set: site='ulta.com' contra_set={{'domain': 'ulta.com', 'name': '_schn'}}


  2%|▏         | 33/1623 [00:16<14:05,  1.88it/s]

Contradictory set: site='piriform.com' contra_set={{'domain': 'ccleanercom-production-slave.azurewebsites.net', 'name': 'ARRAffinity'}}


  3%|▎         | 45/1623 [00:21<09:48,  2.68it/s]

Contradictory set: site='cell.com' contra_set={{'domain': 'hubspot.net', 'name': '__cfduid'}}


  4%|▍         | 69/1623 [00:33<35:35,  1.37s/it]

Contradictory set: site='checkout.com' contra_set={{'domain': 'static.cdn.prismic.io', 'name': 'io.prismic.preview'}}


  4%|▍         | 71/1623 [00:34<27:26,  1.06s/it]

Contradictory set: site='adobeconnect.com' contra_set={{'domain': 'app-sj16.marketo.com', 'name': '__cfduid'}, {'domain': 'app-aba.marketo.com', 'name': '__cf_bm'}}


  4%|▍         | 73/1623 [00:35<18:33,  1.39it/s]

Contradictory set: site='returnpath.net' contra_set={{'domain': 'app.jazz.co', 'name': 'SF_PHPSESSID'}, {'domain': 'community.validity.com', 'name': 'cookieTest'}}


  5%|▌         | 86/1623 [00:41<20:17,  1.26it/s]

Contradictory set: site='echosign.com' contra_set={{'domain': 'app-sj16.marketo.com', 'name': '__cfduid'}, {'domain': 'app-aba.marketo.com', 'name': '__cf_bm'}}


  5%|▌         | 88/1623 [00:41<15:42,  1.63it/s]

Contradictory set: site='aon.com' contra_set={{'domain': 'insights.humancapital.aon.com', 'name': 'ufentry'}, {'domain': 'sleeknotestaticcontent.sleeknote.com', 'name': 'SNS'}}


  6%|▌         | 91/1623 [00:42<11:06,  2.30it/s]

Contradictory set: site='balenciaga.com' contra_set={{'domain': 'www.balenciaga.com', 'name': '_cs_same_site'}}


  6%|▌         | 94/1623 [00:43<08:46,  2.90it/s]

Contradictory set: site='statista.com' contra_set={{'domain': 'm6r.eu', 'name': 'id'}}


  6%|▌         | 98/1623 [00:45<08:12,  3.10it/s]

Contradictory set: site='currys.co.uk' contra_set={{'domain': 'www.currys.co.uk', 'name': 'cookietest'}}


  6%|▌         | 100/1623 [00:47<23:29,  1.08it/s]

Contradictory set: site='accorhotels.com' contra_set={{'domain': 'all.accor.com', 'name': 'cookietest'}}


  6%|▋         | 102/1623 [00:49<24:51,  1.02it/s]

Contradictory set: site='meraki.com' contra_set={{'domain': 'cisco.com', 'name': '_vapi'}}


  6%|▋         | 104/1623 [00:52<36:30,  1.44s/it]

Contradictory set: site='magentocommerce.com' contra_set={{'domain': 'app-sj16.marketo.com', 'name': '__cfduid'}}


  7%|▋         | 108/1623 [01:01<50:43,  2.01s/it]

Contradictory set: site='vmware.com' contra_set={{'domain': 'communities.vmware.com', 'name': 'LithiumCookiesAccepted'}}


  7%|▋         | 109/1623 [01:01<39:49,  1.58s/it]

Contradictory set: site='ledger.com' contra_set={{'domain': 'shop.ledger.com', 'name': 'cookietest'}}


  7%|▋         | 110/1623 [01:02<30:29,  1.21s/it]

Contradictory set: site='frontiersin.org' contra_set={{'domain': 'zendesk.frontiersin.org', 'name': 'cf_chl_seq_xxxxxxxxxxxxxxx'}}


  7%|▋         | 118/1623 [01:07<22:59,  1.09it/s]

Contradictory set: site='demdex.net' contra_set={{'domain': 'app-sj16.marketo.com', 'name': '__cfduid'}, {'domain': 'app-aba.marketo.com', 'name': '__cf_bm'}}


100%|██████████| 1623/1623 [11:47<00:00,  2.29it/s]


Unnamed: 0,domain,expires,name,path,sameSite,secure,value,request_url,site,appr_pref_domain,appr_pref_name,rej_pref_domain,rej_pref_name,comply
1076,.helpshift.com,1.702659e+09,_ga,/,,False,GA1.2.1307472611.1639587332,https://cdn.helpshift.com/wp-content/plugins/t...,helpshift.com,,,helpshift.com,_ga,incorrect
1080,.helpshift.com,1.655355e+09,__utmz,/,,False,148167982.1639587333.1.1.utmcsr=(direct)|utmcc...,https://cdn.helpshift.com/wp-content/plugins/p...,helpshift.com,,,helpshift.com,__utmz,incorrect
1084,.helpshift.com,1.647363e+09,_fbp,/,,False,fb.1.1639587333019.107912994,https://cdn.helpshift.com/wp-includes/css/dist...,helpshift.com,,,helpshift.com,_fbp,incorrect
1086,.helpshift.com,1.702659e+09,_ga,/,,False,GA1.2.1307472611.1639587332,https://cdn.helpshift.com/wp-content/plugins/h...,helpshift.com,,,helpshift.com,_ga,incorrect
1094,.helpshift.com,1.702659e+09,__utma,/,,False,148167982.1307472611.1639587332.1639587333.163...,https://cdn.helpshift.com/wp-content/plugins/h...,helpshift.com,,,helpshift.com,__utma,incorrect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3059520,.vespa.com,1.702970e+09,_ga,/,,False,GA1.2.1432834110.1639897653,https://wlassets.vespa.com/wlassets/vespa/mast...,vespa.com,,,vespa.com,_ga,incorrect
3059521,.vespa.com,1.639984e+09,_gid,/,,False,GA1.2.457380286.1639897653,https://wlassets.vespa.com/wlassets/vespa/mast...,vespa.com,,,vespa.com,_gid,incorrect
3059522,.vespa.com,1.647674e+09,_fbp,/,,False,fb.1.1639897653364.492083713,https://wlassets.vespa.com/wlassets/vespa/mast...,vespa.com,,,vespa.com,_fbp,incorrect
3059524,.vespa.com,1.647674e+09,_gcl_au,/,,False,1.1.1311567854.1639897653,https://wlassets.vespa.com/wlassets/vespa/mast...,vespa.com,,,vespa.com,_gcl_au,incorrect


In [37]:
all_complies

Unnamed: 0,domain,expires,name,path,sameSite,secure,value,request_url,site,appr_pref_domain,appr_pref_name,rej_pref_domain,rej_pref_name,comply
0,.wework.com,1.671123e+09,ajs_user_id,/,Lax,False,%22e465a0f34b3bdaa686645b9f88599d5563d0e8562db...,https://www-static.wework.com/apercu/apercu_mo...,wework.com,wework.com,ajs_user_id,,,comply
1,.wework.com,1.702659e+09,_ga,/,,False,GA1.1.1271302899.1639586858,https://www.wework.com/vanilla-assets/javascri...,wework.com,wework.com,_ga,,,comply
2,.wework.com,1.671123e+09,ajs_user_id,/,Lax,False,%22e465a0f34b3bdaa686645b9f88599d5563d0e8562db...,https://www.wework.com/vanilla-assets/fonts/ic...,wework.com,wework.com,ajs_user_id,,,comply
3,www.wework.com,1.639601e+09,_gd_session,/,,True,7245fb13-c2cf-4c94-8be5-e2089d00f378,https://www.wework.com/vanilla-assets/javascri...,wework.com,,,,,omit
4,.krxd.net,1.655139e+09,_kuid_,/,,True,OiuKv-_v,https://cdn.krxd.net/userdata/get?pub=6667d5d3...,wework.com,krxd.net,_kuid_,,,comply
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3059522,.vespa.com,1.647674e+09,_fbp,/,,False,fb.1.1639897653364.492083713,https://wlassets.vespa.com/wlassets/vespa/mast...,vespa.com,,,vespa.com,_fbp,incorrect
3059523,.vespa.com,1.639901e+09,AKA_A2,/,,True,A,https://wlassets.vespa.com/wlassets/vespa/mast...,vespa.com,vespa.com,AKA_A2,,,comply
3059524,.vespa.com,1.647674e+09,_gcl_au,/,,False,1.1.1311567854.1639897653,https://wlassets.vespa.com/wlassets/vespa/mast...,vespa.com,,,vespa.com,_gcl_au,incorrect
3059525,.vespa.com,1.639898e+09,_gat_UA-66187049-5,/,,False,1,https://wlassets.vespa.com/wlassets/vespa/mast...,vespa.com,,,,,omit
