In [3]:
from multiprocessing import Pool
from pathlib import Path

from tqdm import tqdm
import pandas as pd
from p_tqdm import p_umap

from consent.consistency.cookie_pref_match import cookie_pref_match
from consent.consistency.util import FIG_DIR, get_scan_dirs, get_scan_root_dir
from consent.data.pref_menu_scan.cookie_pref_reader import read_cookie_prefs_in_scans
from consent.data.pref_menu_scan.postrej_cookie_reader import read_postrej_sent_cookies_in_scans
from ooutil.type_util import hashabledict

# Figure output directory; the assert fails fast if it is missing.
# NOTE(review): fig_dir is not referenced again in the visible cells, and FIG_DIR is also imported above — confirm which one is intended.
fig_dir = Path.home() / 'local_projects/consent/paper/sp22/figures'
assert fig_dir.exists() and fig_dir.is_dir()
# When True, the parquet caches in the cells below are regenerated even if they already exist.
overwrite = False
SCAN_DIRS = get_scan_dirs('eu')
SCAN_ROOT_DIR = get_scan_root_dir('eu')
# Suffix embedded in the names of the cache files written under SCAN_ROOT_DIR.
output_suffix = '_' + '0k_200k' #'60k_80k' # '40k_60k' # # '100k_200k'; done: '20k_40k' '0k_20k' 
SCAN_DIRS 

[PosixPath('/mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-05-30/pref_menu_scan_0k_20k'),
 PosixPath('/mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-05-30/pref_menu_scan_20k_40k'),
 PosixPath('/mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-05-30/pref_menu_scan_40k_60k'),
 PosixPath('/mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-05-30/pref_menu_scan_60k_80k'),
 PosixPath('/mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-05-30/pref_menu_scan_80k_100k'),
 PosixPath('/mnt/sda/ducbui/Dropbox/Dropbox (University of Michigan)/projects/data_sync/consent/2022-05-30/pref_menu_scan_100k_200k')]

In [4]:
# Load the cookie preferences (cookie declarations + preferences), reusing the
# parquet cache unless it is missing or `overwrite` is set.
cookie_prefs_file = SCAN_ROOT_DIR / f'cookie_prefs{output_suffix}.parquet'

if cookie_prefs_file.exists() and not overwrite:
    raw_cookie_prefs = pd.read_parquet(cookie_prefs_file)
else:
    # Re-read the scans (took 2.3min for top 50k-site scan). The declarations file is
    # passed to the reader as its second argument — presumably where it saves the raw
    # declarations; confirm against read_cookie_prefs_in_scans.
    save_cookie_decls_file = SCAN_ROOT_DIR / f'cookie_decls{output_suffix}.parquet'
    raw_cookie_prefs = read_cookie_prefs_in_scans(SCAN_DIRS, save_cookie_decls_file)
    raw_cookie_prefs.to_parquet(cookie_prefs_file)
    print(f'Written to {cookie_prefs_file}')

# TODO: add log_file_checker
raw_cookie_prefs.head()

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
0,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
1,OptanonAlertBoxClosed,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
2,OptanonConsent,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
3,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
4,we_referring_domain,www.wework.com,14 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True


In [5]:
# Read in cookie transfer (the cookies sent with requests, captured post-rejection).
# NOTE: this import rebinds read_postrej_sent_cookies_in_scans, which the first cell
# imported from consent.data.pref_menu_scan.postrej_cookie_reader; the har_cookie_reader
# version is the one used from here on.
from consent.data.pref_menu_scan.har_cookie_reader import read_postrej_sent_cookies_in_scans

cookies_cache_file = SCAN_ROOT_DIR / f'scan{output_suffix}.parquet'  # 'raw_postrej_sent_cookies.parquet'

if not cookies_cache_file.exists() or overwrite:
    sent_cookies = read_postrej_sent_cookies_in_scans(SCAN_DIRS)
    # The former `if cookies_cache_file:` guard was always true (a Path is always
    # truthy), so the cache is simply written unconditionally.
    sent_cookies.to_parquet(cookies_cache_file)
    print(f"Written to {cookies_cache_file}")
else:
    sent_cookies = pd.read_parquet(cookies_cache_file)

print(f"Number sent cookies read: {len(sent_cookies):,d}")
sent_cookies.head(3)

Number sent cookies read: 13,458,638


Unnamed: 0,name,value,domain,path,expires,size,httpOnly,secure,session,sameSite,priority,sameParty,sourceScheme,sourcePort,request_url,site,page_url
0,OptanonAlertBoxClosed,2022-05-30T09:00:51.774Z,.www.wework.com,/,1685437000.0,45,False,False,False,Lax,Medium,False,Secure,443,https://www.wework.com/vanilla-assets/images/w...,wework.com,https://www.wework.com/l/commercial-real-estat...
1,ajs_user_id,%22a9fd262e2c727602f1b7107f2e97d8ea5fb32c1c434...,.wework.com,/,1685437000.0,81,False,False,False,Lax,Medium,False,Secure,443,https://www.wework.com/l/commercial-real-estat...,wework.com,https://www.wework.com/l/commercial-real-estat...
2,we_referring_domain,,www.wework.com,/,1655111000.0,19,False,False,False,,Medium,False,Secure,443,https://www.wework.com/vanilla-assets/javascri...,wework.com,https://www.wework.com/l/commercial-real-estat...


In [9]:
# Check duplicates in cookie preferences.
# `cookie_prefs` intentionally keeps every row (de-duplication happens later, when
# s_cookie_prefs is built), so the unique count is computed separately here; printing
# len(cookie_prefs) as "unique" always reported the same number twice.
cookie_prefs = raw_cookie_prefs.copy() # .drop_duplicates()
n_unique_cookie_prefs = len(raw_cookie_prefs.drop_duplicates())
print(f'There are {n_unique_cookie_prefs:,d} unique_cookie_prefs ({len(raw_cookie_prefs):,d} preferences with duplicates)')
# Sites for which cookies were captured but no preference was read.
print(f"{len(set(sent_cookies.site) - set(cookie_prefs.site))} sites in sent cookies but do not have preferences.")

There are 826,525 unique_cookie_prefs (826,525 preferences with duplicates)
29 sites in sent cookies but do not have preferences.


In [10]:
# Sample a subset, from this point, use s_cookie_prefs
# NOTE: n_samples currently equals the full row count, so every site is selected
# (use a small number such as 100 for a quick run).
n_samples = len(cookie_prefs) # 100
# Sites that appear in the sampled rows; random_state is fixed so the sample is repeatable.
sample_sites = cookie_prefs.sample(n_samples, random_state=1024).site.unique()
n_sites = cookie_prefs.site.nunique()
print(f"Num sample sites: {len(sample_sites):,d} {len(sample_sites) / n_sites*100:.2f}% of all {n_sites} sites.")

# All preference rows of the sampled sites, with exact duplicate rows removed.
s_cookie_prefs = cookie_prefs[cookie_prefs.site.isin(sample_sites)].drop_duplicates()

Num sample sites: 6,095 100.00% of all 6095 sites.


In [11]:
# Map and detect ambiguity; note: prj = post-rejection, from this point, use prj_sent_cookies and prj_br_cookies
# Captured cookies restricted to the sampled sites.
s_sent_cookies = sent_cookies[sent_cookies.site.isin(sample_sites)]
# Distinct (cookie attributes, value, request URL) rows.
prj_sent_cookies = s_sent_cookies[['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'value', 'request_url', 'site']].drop_duplicates()
# Distinct cookies once the value and the request URL are dropped.
prj_br_cookies = prj_sent_cookies[['domain', 'expires', 'name', 'path', 'sameSite', 'secure', 'site']].drop_duplicates()

In [12]:
# Report how many rows remain at each de-duplication level, plus site and page coverage.
print(f"Num captured sent cookies: {len(s_sent_cookies):,d}")
print(f"Num unique captured cookies: {len(prj_sent_cookies):,d}")
n_br_cookies = len(prj_br_cookies)
print(f"Num unique browser cookies: {n_br_cookies:,d} on {prj_br_cookies.site.nunique():,d} websites and {s_sent_cookies.page_url.nunique():,d} pages")

Num captured sent cookies: 13,394,436
Num unique captured cookies: 7,727,859
Num unique browser cookies: 274,731 on 6,057 websites and 32,237 pages


# Find number of cookie preferences that match a browser cookie.

In [13]:
# Select the strategy with the lowest ambiguity score.

# Reload the matcher module and re-import the function so that edits to
# cookie_pref_match are picked up without restarting the kernel.
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.cookie_pref_match'])
from consent.consistency.cookie_pref_match import cookie_pref_match

# Distinct preference rows restricted to the columns listed here.
prefs = s_cookie_prefs[['name', 'domain', 'category_id', 'category', 'consent_mode', 'site']].drop_duplicates()
print(f"Num browser cookies: {len(prj_br_cookies):,d}, Num preferences: {len(prefs):,d}")

Num browser cookies: 274,731, Num preferences: 740,231


In [14]:
# How the number of distinct cookies changes as identifying properties are dropped:
# one count is printed per property list, from the most to the least specific.
for _props in (
    ['name', 'domain', 'path', 'sameSite', 'secure', 'site', 'expires'],
    ['name', 'domain', 'path', 'sameSite', 'secure', 'site'],
    ['name', 'domain', 'path', 'site'],
    ['name', 'domain', 'site'],
):
    print(len(prj_sent_cookies[_props].drop_duplicates()))

274731
126666
125107
124618


In [15]:
# Reduce both frames to the identifying columns used for matching and de-duplicate again.
# NOTE: this rebinds `prefs` and `prj_br_cookies`; the wider versions built in earlier
# cells are no longer available after this cell runs.
prefs = prefs[['name', 'domain', 'site']].drop_duplicates()
prj_br_cookies = prj_br_cookies[['name', 'domain', 'path', 'site']].drop_duplicates()
print(f"After reduction and dropping duplicates, num browser cookies: {len(prj_br_cookies):,d}, Num preferences: {len(prefs):,d}")

After reduction and dropping duplicates, num browser cookies: 125,107, Num preferences: 737,273


In [16]:
# Sites that have at least one preference row; the per-site matching below iterates over these.
sites = prefs.site.unique()
print(f"Number of sites: {len(sites):,d}")

Number of sites: 6,095


In [17]:
def get_cookie_pref_pairs(sites):
    """Yield one ``(site_prefs, site_cookies)`` pair of DataFrames per site.

    Reads the notebook-level ``prefs`` and ``prj_br_cookies`` frames (both must
    have a ``site`` column).

    Args:
        sites: iterable of site names; pairs are yielded in this order.

    Yields:
        Tuple ``(site_prefs, site_cookies)`` holding independent copies of the
        rows of each frame whose ``site`` equals the current site. A site with
        no rows in a frame gets an empty frame with the same columns.
    """
    # Group once and keep the row positions of every site, instead of scanning
    # both full frames with a boolean mask for each site.
    pref_rows = prefs.groupby('site', sort=False).indices
    cookie_rows = prj_br_cookies.groupby('site', sort=False).indices
    for site in sites:
        site_prefs = prefs.iloc[pref_rows.get(site, [])]
        site_cookies = prj_br_cookies.iloc[cookie_rows.get(site, [])]
        # Copies so that callers can add columns without touching the shared frames.
        yield (site_prefs.copy(), site_cookies.copy())

# Materialize the per-site pairs once; both matching passes below reuse this list.
pref_cookie_pairs = list(get_cookie_pref_pairs(sites)) # about 20 secs

# Get pref->cookies

In [28]:

def get_n_cookie_matches(pref, cookies):
    """Count the distinct cookies that ``cookie_pref_match`` accepts for one preference.

    Args:
        pref: a preference row; its ``site`` and ``name`` entries are read here and
            the whole row is handed to ``cookie_pref_match``.
        cookies: DataFrame with ``site``, ``name`` and ``domain`` columns.

    Returns:
        Number of distinct ``(domain, name)`` cookies that have the same site and
        exactly the same name as the preference and that ``cookie_pref_match``
        reports as matching.
    """
    same_site_and_name = (cookies.site == pref['site']) & (cookies.name == pref['name'])
    candidates = cookies[same_site_and_name][['domain', 'name']].drop_duplicates().to_dict('records')
    return sum(1 for cookie in candidates if cookie_pref_match(cookie, pref, pref['site']))

def get_site_cookie_matches(site_prefs, site_cookies):
    """Add an ``n_cookie_matches`` column to ``site_prefs`` and return the same frame.

    The column holds, for each preference row, the count computed by
    ``get_n_cookie_matches`` against ``site_cookies``. ``site_prefs`` is modified
    in place.
    """
    def count_for(pref_row):
        return get_n_cookie_matches(pref_row, site_cookies)

    site_prefs['n_cookie_matches'] = site_prefs.apply(count_for, axis=1)
    return site_prefs

def run_get_n_cookie_matches(pref_cookie_pairs):
    """Annotate the preferences of every site with their cookie-match counts, in parallel.

    ``pref_cookie_pairs`` is a sequence of ``(site_prefs, site_cookies)`` pairs. It is
    unzipped into the two positional iterables that ``p_umap`` feeds to
    ``get_site_cookie_matches``. The per-site frames are concatenated into a single
    DataFrame; since ``p_umap`` is p_tqdm's unordered map, row order across sites is
    presumably not guaranteed.
    """
    results = p_umap(get_site_cookie_matches, *zip(*pref_cookie_pairs), num_cpus=32)  # 8-16: cannot interrupt. 48 takes 122s, 32-48 because no IO-blocking
    return pd.concat(results)


match_prefs = run_get_n_cookie_matches(pref_cookie_pairs)  # 21min with 1 core, 1 min with 8 cores

  0%|          | 0/6095 [00:00<?, ?it/s]

Error fuzzy name match pref_name='csrf[frontend.store-api.proxy]' cookie_name='csrf[frontend.store-api.proxy]' 
bad character range e-a at position 22bad character range e-a at position 22
Error fuzzy name match pref_name='csrf[frontend.store-api.proxy]' cookie_name='csrf[frontend.store-api.proxy]' 

In [29]:
# vc: number of matching cookies -> number of preferences with that many matches.
print("Distribution of preference -> n cookie matches:")
vc = match_prefs.n_cookie_matches.value_counts()
print(vc.to_latex())

Distribution of preference -> n cookie matches:
\begin{tabular}{lr}
\toprule
{} &  n\_cookie\_matches \\
\midrule
0 &            664826 \\
1 &             71305 \\
2 &              1129 \\
3 &                12 \\
4 &                 1 \\
\bottomrule
\end{tabular}



  print(vc.to_latex())


In [30]:
# Here `vc` is built from match_prefs, so every count below is a number of PREFERENCES.
n_nempt = vc[vc.index > 0].sum()  # preferences matched by at least one cookie
n_multi = vc[vc.index > 1].sum()  # preferences matched by more than one cookie
n_uniq = vc.get(1, 0)  # preferences matched by exactly one cookie; 0 instead of KeyError when there are none
print(f"Number of ambiguous mapping (1 pref -> many cookies): {n_multi=:,d} {n_uniq=:,d} {n_nempt=:,d}: {n_multi/n_nempt*100:.2f}%")
# Label fixed: this total counts preferences, not cookies.
print(f"Total preferences with mapping: {n_nempt:,d}")

Number of ambiguous mapping (1 pref -> many cookies): n_multi=1,142 n_uniq=71,305 n_nempt=72,447: 1.58%
Total cookies with mapping: 72,447


# Get cookie->pref

In [31]:
# Dask single machine 57 sec, 35 min single core, 6min min on 32 cores, progress bar not show progressively

# from dask.distributed import Client
# Client()
# prj_br_cookies_ddf = dd.from_pandas(prj_br_cookies, npartitions=32)
# x = prj_br_cookies_ddf.apply(lambda row: get_n_pref_matches(row, prefs), axis=1, meta=('n_matches', 'int64'))
# prj_br_cookies['n_pref_matches'] = x.compute() # convert to final result, instantly

def get_n_pref_matches(cookie, cookie_prefs):
    """Count the distinct preferences that ``cookie_pref_match`` accepts for one cookie.

    Args:
        cookie: a cookie row; its ``site`` and ``name`` entries are read here and
            the whole row is handed to ``cookie_pref_match``.
        cookie_prefs: DataFrame with ``site``, ``name`` and ``domain`` columns.

    Returns:
        Number of distinct ``(domain, name)`` preferences that have the same site
        and exactly the same name as the cookie and that ``cookie_pref_match``
        reports as matching.
    """
    same_site_and_name = (cookie_prefs.site == cookie['site']) & (cookie_prefs.name == cookie['name'])
    candidates = cookie_prefs[same_site_and_name][['domain', 'name']].drop_duplicates().to_dict('records')
    return sum(1 for pref in candidates if cookie_pref_match(cookie, pref, cookie['site']))

def get_site_pref_matches(site_prefs, site_cookies):
    """Add an ``n_pref_matches`` column to ``site_cookies`` and return the same frame.

    Returns ``None`` when the site has no cookies. Otherwise the column holds, for
    each cookie row, the count computed by ``get_n_pref_matches`` against
    ``site_prefs``. ``site_cookies`` is modified in place.
    """
    if len(site_cookies) == 0:
        return None

    def count_for(cookie_row):
        return get_n_pref_matches(cookie_row, site_prefs)

    site_cookies['n_pref_matches'] = site_cookies.apply(count_for, axis=1)
    return site_cookies

def run_get_n_pref_matches(pref_cookie_pairs):
    """Annotate the cookies of every site with their preference-match counts, in parallel.

    ``pref_cookie_pairs`` is a sequence of ``(site_prefs, site_cookies)`` pairs. It is
    unzipped into the two positional iterables that ``p_umap`` feeds to
    ``get_site_pref_matches``. Sites without cookies produce ``None`` and are dropped
    before the per-site frames are concatenated into one DataFrame.
    """
    # for site_prefs, site_cookies in pref_cookie_pairs: # get_site_pref_matches(site_prefs, site_cookies) # Debug
    results = p_umap(get_site_pref_matches, *zip(*pref_cookie_pairs), num_cpus=16)
    results = [r for r in results if r is not None]
    return pd.concat(results)

match_cookies = run_get_n_pref_matches(pref_cookie_pairs)

  0%|          | 0/6095 [00:00<?, ?it/s]

Error fuzzy name match pref_name='csrf[frontend.store-api.proxy]' cookie_name='csrf[frontend.store-api.proxy]' bad character range e-a at position 22
Error fuzzy name match pref_name='csrf[frontend.store-api.proxy]' cookie_name='csrf[frontend.store-api.proxy]' bad character range e-a at position 22


In [32]:
# vc: number of matching preferences -> number of cookies with that many matches.
# NOTE: this rebinds `vc`, which previously held the preference -> cookie distribution.
vc = match_cookies.n_pref_matches.value_counts()
print(vc.to_latex())

\begin{tabular}{lr}
\toprule
{} &  n\_pref\_matches \\
\midrule
1 &           67981 \\
0 &           54203 \\
2 &            2900 \\
4 &              16 \\
3 &               7 \\
\bottomrule
\end{tabular}



  print(vc.to_latex())


In [33]:
# Here `vc` is built from match_cookies, so every count below is a number of cookies.
n_nempt = vc[vc.index > 0].sum()  # cookies matched by at least one preference
n_multi = vc[vc.index > 1].sum()  # cookies matched by more than one preference
n_uniq = vc[1]  # cookies matched by exactly one preference (label lookup; KeyError if there are none)
print(f"Number of ambiguous mapping (1 cookie -> multiple preferences): {n_multi=:,d} {n_uniq=:,d} {n_nempt=:,d}: {n_multi/n_nempt*100:.2f}%")
print(f"Total cookies with mapping: {n_nempt:,d}")

Number of ambiguous mapping (1 cookie -> multiple preferences): n_multi=2,923 n_uniq=67,981 n_nempt=70,904: 4.12%
Total cookies with mapping: 70,904


In [34]:
# Deliberate stop: a bare `raise` with no active exception raises RuntimeError, which
# halts "Run All" at this cell — presumably so the exploratory cells below are only run by hand.
raise

RuntimeError: No active exception to reraise

# Case study of 1 cookie -> multiple preferences

In [None]:
# Peek at a few cookies that matched more than one preference.
match_cookies[match_cookies.n_pref_matches > 1].head(5)

Unnamed: 0,name,domain,path,site,n_pref_matches
1,RT,.exxonmobil.com,/,exxonmobil.com,2
214,lang,.ads.linkedin.com,/,exxonmobil.com,2
249,test_cookie,.doubleclick.net,/,namecheap.com,2
331,MUID,.bing.com,/,namecheap.com,2
411,lang,.ads.linkedin.com,/,namecheap.com,2


In [None]:
# For every cookie with exactly two matching preferences, compare the two preference domains.
n_dot_prefix_domains = 0
n_same_suffix_domains = 0
for mcookie in tqdm(match_cookies[match_cookies.n_pref_matches == 2].to_dict('records')):
    # Preferences with the cookie's name and site, narrowed to those the matcher accepts.
    mprefs = prefs[(prefs.name == mcookie['name']) & (prefs.site == mcookie['site'])]
    mprefs = mprefs[mprefs.apply(lambda row: cookie_pref_match(mcookie, row, mcookie['site']), axis=1)]
    pr_domains = [d.lower() for d in mprefs.domain.to_list()]
    # Shortest domain first, so index 0 is the candidate suffix of index 1.
    # NOTE(review): indexing [1] assumes exactly two rows survive the filter above.
    pr_domains = sorted(pr_domains, key=len)
    if pr_domains[1] == '.' + pr_domains[0]:
        n_dot_prefix_domains += 1
    # The dot-prefix cases also satisfy this test, so they are counted in both totals.
    if pr_domains[1].endswith(pr_domains[0]):
        n_same_suffix_domains += 1
print(f"Number of cases that domains differ only by the prefix dot: {n_dot_prefix_domains:,d}") 
print(f"Number of cases that domains are suffixes: {n_same_suffix_domains:,d}") 

100%|██████████| 1149/1149 [00:52<00:00, 21.71it/s]

Number of cases that domains differ only by the prefix dot: 363
Number of cases that domains are suffixes: 1,145





In [None]:
# Case study
mcookie  = match_cookies[match_cookies.n_pref_matches == 2].iloc[6]
prefs[(prefs.name == mcookie['name']) & (prefs.site == mcookie['site'])]

Unnamed: 0,name,domain,site
7,lidc,.linkedin.com,namecheap.com
34,lidc,linkedin.com,namecheap.com


In [None]:
# How many cookies that have same both domain and name?
# print("Num cookies preferences:", len(s_cookie_prefs))
# s_cookies = s_cookie_prefs[['domain', 'name', 'category', 'site']].drop_duplicates()
# print("Num unique cookie preferences:", len(s_cookies))

In [None]:
# plain pandas, 27-29 secs for sample(100)
# prj_br_cookies['n_matches'] = prj_br_cookies.apply(lambda row: get_n_matches(row, prefs), axis=1)

In [None]:
# s_cookie_prefs

# Analyze duplicate cookies

In [None]:
# s_cookies.value_counts()[:5]

In [None]:
# Case studies
# s_cookie_prefs[(s_cookie_prefs.domain == 'mc.yandex.ru') & (s_cookie_prefs.name == '_ym_uid')]
# s_cookie_prefs[(s_cookie_prefs.domain == 'bat.bing.com') & (s_cookie_prefs.name == '_uetsid')]
# s_cookie_prefs[(s_cookie_prefs.domain == 'start.stepchange.org') & (s_cookie_prefs.name == '_uetvid')]

In [None]:
def get_n_cookie_matches(cookies, pref):
    """Count the distinct ``(name, domain)`` cookies in ``cookies`` that match ``pref``.

    NOTE(review): this redefines ``get_n_cookie_matches`` from an earlier cell with the
    argument order swapped (``cookies, pref`` here versus ``pref, cookies`` there).
    Running this cell makes ``get_site_cookie_matches`` call this version with its
    arguments in the wrong order.

    Every row of ``cookies`` is tested with ``cookie_pref_match``, using the cookie's
    own ``site`` as the site argument; there is no pre-filtering by site or name.
    """
    matched_cookies = {
        hashabledict({'name': cookie['name'], 'domain': cookie['domain']})
        for cookie in cookies.to_dict('records')
        if cookie_pref_match(cookie, pref, cookie['site'])
    }
    return len(matched_cookies)

# s_cookie_prefs2 = s_cookie_prefs.copy()
# s_cookie_prefs2['n_matches'] = s_cookie_prefs2.apply(lambda pref: get_n_cookie_matches(prj_sent_cookies, pref), axis=1)
# s_cookie_prefs2.sort_values(by='n_matches', ascending=False)

In [None]:
# s_cookie_prefs2.n_matches.value_counts()

In [None]:
# prj_sent_cookies.head()

In [None]:
# s_cookie_prefs

In [None]:
# s_cookie_prefs = s_cookie_prefs[s_cookie_prefs.name == 'TAUnique']
# s_cookie_prefs 

In [None]:
# print('Num websites:', all_complies.site.nunique())
# all_complies.comply.unique()

In [None]:
# s_cookie_prefs

In [None]:
# Map intercepted cookies to browser cookies.
# Reload the matcher module and re-import the function so that edits to it are picked up.
import sys; import importlib; importlib.reload(sys.modules['consent.consistency.cookie_pref_match'])
from consent.consistency.cookie_pref_match import cookie_pref_match

# site -> set of (domain, name) preferences declared both approved and rejected;
# filled as a side effect of _get_comply_for_site.
site_to_contras = {}  # TODO: make this to non-global one.

def check_in_set(site, acookie, cookie_pref_set, verbose=0):
    """Find the first preference in ``cookie_pref_set`` that matches ``acookie``.

    Args:
        site: site name forwarded to ``cookie_pref_match``.
        acookie: the cookie to look up.
        cookie_pref_set: iterable of preferences, each with at least a ``name`` entry.
        verbose: 3 or more prints every comparison; 2 prints only comparisons
            between equal names; lower values print nothing.

    Returns:
        ``(True, matching_pref)`` for the first match in iteration order, otherwise
        ``(False, None)``.
    """
    for cookie_pref in cookie_pref_set:
        if verbose >= 3 or (verbose >= 2 and cookie_pref['name'] == acookie['name']):
            print(f'{cookie_pref=} {acookie=}')

        if cookie_pref_match(acookie, cookie_pref, site):
            return True, cookie_pref
    return False, None

def get_comply_type(is_appr, is_rej):
    """Classify a sent cookie from its membership in the approved and rejected sets.

    Returns:
        ``'comply'``    - matched an approved preference only,
        ``'incorrect'`` - matched a rejected preference only,
        ``'omit'``      - matched neither,
        ``'ambiguous'`` - matched both.
    """
    if is_appr:
        return 'ambiguous' if is_rej else 'comply'
    return 'incorrect' if is_rej else 'omit'

def get_appr_rej_sets(prefs):
    """Split the preferences into approved, rejected and contradictory cookie sets.

    Args:
        prefs: DataFrame with ``domain``, ``name`` and ``consent`` columns; every
            ``consent`` value must be True or False (checked with an assert).

    Returns:
        Three sets of hashable ``{'domain', 'name'}`` dicts: the approved
        (``consent == True``) set, the rejected (``consent == False``) set, and
        their intersection, i.e. cookies declared both approved and rejected.
    """
    def get_hashable_cookie_set(df):
        return {hashabledict(record) for record in df[['domain', 'name']].to_dict('records')}

    approved = get_hashable_cookie_set(prefs[prefs.consent == True])
    rejected = get_hashable_cookie_set(prefs[prefs.consent == False])
    assert len(prefs[~prefs.consent.isin([True, False])]) == 0

    return approved, rejected, approved & rejected

def update_appr_rej_pref(comply_result, appr_pref, rej_pref):
    """Record the matched approved/rejected preferences in ``comply_result`` (in place).

    Sets ``appr_pref_domain``, ``appr_pref_name``, ``rej_pref_domain`` and
    ``rej_pref_name``. A falsy preference (``None`` or empty) yields ``None`` values.
    """
    for prefix, pref in (('appr_pref', appr_pref), ('rej_pref', rej_pref)):
        for field in ('domain', 'name'):
            comply_result[f'{prefix}_{field}'] = pref[field] if pref else None
    
def _get_comply_for_site(site, prefs, sent_cookies):
    """Classify every sent cookie of one site against that site's preferences.

    Args:
        site: the site name; every sent cookie must carry the same ``site`` (asserted).
        prefs: the site's preference DataFrame, as accepted by ``get_appr_rej_sets``.
        sent_cookies: iterable of cookie dicts.

    Returns:
        One dict per sent cookie: a copy of the cookie extended with the matched
        approved/rejected preference fields and a ``comply`` label.

    Side effects:
        A non-empty contradictory set is stored in the module-level
        ``site_to_contras``; it is also printed while fewer than 20 sites are recorded.
    """
    appr_set, rej_set, contra_set = get_appr_rej_sets(prefs)

    def classify(sent_cookie):
        is_appr, appr_pref = check_in_set(site, sent_cookie, appr_set)
        is_rej, rej_pref = check_in_set(site, sent_cookie, rej_set)
        annotated = sent_cookie.copy()
        update_appr_rej_pref(annotated, appr_pref, rej_pref)
        assert site == sent_cookie['site']
        annotated['comply'] = get_comply_type(is_appr, is_rej)
        return annotated

    comply_results = [classify(sent_cookie) for sent_cookie in sent_cookies]

    if contra_set:
        site_to_contras[site] = contra_set
        if len(site_to_contras) < 20: # Print some of the contra to see the progress only
            print(f'Contradictory set: {site=} {contra_set=}')    
    return comply_results

def get_comply_for_sites(args, sites, parallel=False):
    """Yield the per-site result list of ``_get_comply_for_site``.

    Args:
        args: iterable of ``(site, prefs, sent_cookies)`` argument tuples.
        sites: sized collection of sites; only its length is used (progress bar total).
        parallel: when True, dispatch the work to a pool of 32 processes.

    Yields:
        For each argument tuple, the list returned by ``_get_comply_for_site``.
    """
    if parallel: # not work, maybe bottleneck is the transfer of a big data frame.
        # The context manager shuts the worker processes down once the results are
        # consumed (or the generator is closed); the pool used to be leaked.
        # NOTE: workers are separate processes, so their updates to the module-level
        # site_to_contras are not visible in this process.
        with Pool(32) as pool:
            yield from pool.starmap(_get_comply_for_site, args)
    else:
        for arg in tqdm(args, total=len(sites)):
            yield _get_comply_for_site(*arg)
            
def get_compute_args(sites, cookie_prefs, prj_sent_cookies):
    """Yield the ``(site, site_cookie_prefs, site_prj_sent_cookies)`` arguments per site.

    Args:
        sites: iterable of site names; tuples are yielded in this order.
        cookie_prefs: DataFrame of preferences with a ``site`` column.
        prj_sent_cookies: DataFrame of sent cookies with a ``site`` column.

    Yields:
        ``site``, the DataFrame of that site's preference rows, and that site's
        sent cookies as a list of record dicts (empty when the site has none).
    """
    # Group once and keep the row positions of every site; the previous version
    # re-scanned both full frames with a boolean mask for each site.
    pref_rows = cookie_prefs.groupby('site', sort=False).indices
    sent_rows = prj_sent_cookies.groupby('site', sort=False).indices
    for site in sites:
        site_cookie_prefs = cookie_prefs.iloc[pref_rows.get(site, [])]
        site_prj_sent_cookies = prj_sent_cookies.iloc[sent_rows.get(site, [])].to_dict('records')
        yield site, site_cookie_prefs, site_prj_sent_cookies
            
def get_comply(cookie_prefs, prj_sent_cookies):
    """Classify every sent cookie of every site that has preferences.

    Args:
        cookie_prefs: DataFrame of preferences; its unique ``site`` values define
            the sites that are processed.
        prj_sent_cookies: DataFrame of sent cookies.

    Returns:
        DataFrame with one row per classified sent cookie, built from the per-site
        result lists (computed sequentially).
    """
    sites = cookie_prefs.site.unique()
    args = get_compute_args(sites, cookie_prefs, prj_sent_cookies)
    per_site_results = get_comply_for_sites(args, sites, parallel=False)
    return pd.DataFrame([result for site_results in per_site_results for result in site_results])

# The printed line is a note-to-self about a possible speed-up; it is not a result.
print("faster: for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']]:")
# Classify all post-rejection sent cookies against the sampled preferences.
all_complies = get_comply(s_cookie_prefs, prj_sent_cookies)
# print("Number of incor"
# Show the cookies that matched a rejected preference only.
all_complies[all_complies.comply == 'incorrect']

faster: for cookie_pref in cookie_pref_set[cookie_pref_set.name == acookie['name']]:


  0%|          | 5/1946 [00:02<13:33,  2.39it/s]

Contradictory set: site='genial.ly' contra_set={{'domain': 'app.genial.ly', 'name': '__stripe_mid'}, {'domain': 'app.genial.ly', 'name': '__stripe_sid'}}


  0%|          | 7/1946 [00:02<14:09,  2.28it/s]

Contradictory set: site='prweb.com' contra_set={{'domain': 'service.prweb.com', 'name': 'driftt_aid'}}


  1%|          | 12/1946 [00:05<19:12,  1.68it/s]

Contradictory set: site='ulta.com' contra_set={{'domain': 'ulta.com', 'name': '_schn'}}


  2%|▏         | 38/1946 [00:17<14:40,  2.17it/s]

Contradictory set: site='suse.com' contra_set={{'domain': 'buy.suse.com', 'name': 'AWSELB'}, {'domain': 'scc.suse.com', 'name': '_glue_session'}}


  2%|▏         | 40/1946 [00:18<12:32,  2.53it/s]

Contradictory set: site='worldoftanks.eu' contra_set={{'domain': 'worldoftanks.eu', 'name': 'hlauth'}}


  2%|▏         | 47/1946 [00:21<14:45,  2.14it/s]

Contradictory set: site='piriform.com' contra_set={{'domain': 'ccleanercom-production-slave.azurewebsites.net', 'name': 'ARRAffinity'}}


  3%|▎         | 56/1946 [00:25<10:55,  2.88it/s]

Contradictory set: site='elledecor.com' contra_set={{'domain': 'www.elledecor.com', 'name': 'location_data'}, {'domain': 'www.elledecor.com', 'name': '_glimmerCookieTest'}}


  3%|▎         | 62/1946 [00:27<10:22,  3.03it/s]

Contradictory set: site='cell.com' contra_set={{'domain': 'hubspot.net', 'name': '__cfduid'}}


  3%|▎         | 68/1946 [00:29<11:31,  2.72it/s]

Contradictory set: site='invisionapp.com' contra_set={{'domain': 'invisionapp.com', 'name': '__tld__'}}


  4%|▎         | 72/1946 [00:32<13:39,  2.29it/s]

Contradictory set: site='omaze.com' contra_set={{'domain': 'www.omaze.com', 'name': 'cookietest'}}


  5%|▌         | 99/1946 [00:47<22:23,  1.37it/s]

Contradictory set: site='checkout.com' contra_set={{'domain': 'static.cdn.prismic.io', 'name': 'io.prismic.preview'}}


  5%|▌         | 104/1946 [00:51<24:31,  1.25it/s]

Contradictory set: site='adobeconnect.com' contra_set={{'domain': 'app-sj16.marketo.com', 'name': '__cfduid'}, {'domain': 'app-aba.marketo.com', 'name': '__cf_bm'}}


  5%|▌         | 106/1946 [00:51<17:19,  1.77it/s]

Contradictory set: site='returnpath.net' contra_set={{'domain': 'community.validity.com', 'name': 'cookieTest'}, {'domain': 'app.jazz.co', 'name': 'SF_PHPSESSID'}}


  6%|▌         | 109/1946 [00:52<13:36,  2.25it/s]

Contradictory set: site='zendesk.com' contra_set={{'domain': 'www.zendesk.com', 'name': 'AWSALB'}}


  6%|▌         | 119/1946 [03:55<27:26:10, 54.06s/it]

Contradictory set: site='petsmart.com' contra_set={{'domain': 'www.petsmart.com', 'name': 'inptime0_13355_en'}}


  6%|▋         | 122/1946 [03:57<9:41:08, 19.12s/it] 

Contradictory set: site='echosign.com' contra_set={{'domain': 'app-sj16.marketo.com', 'name': '__cfduid'}, {'domain': 'app-aba.marketo.com', 'name': '__cf_bm'}}


  6%|▋         | 124/1946 [03:58<4:51:52,  9.61s/it]

Contradictory set: site='aon.com' contra_set={{'domain': 'insights.humancapital.aon.com', 'name': 'ufentry'}, {'domain': 'brightcove.com', 'name': '_bc_uuid'}}


  7%|▋         | 134/1946 [04:01<17:58,  1.68it/s]  

Contradictory set: site='statista.com' contra_set={{'domain': 'm6r.eu', 'name': 'id'}}


  7%|▋         | 137/1946 [04:02<13:13,  2.28it/s]

Contradictory set: site='name.com' contra_set={{'domain': 'cs.name.com', 'name': '__cfduid'}}


100%|██████████| 1946/1946 [18:00<00:00,  1.80it/s]


Unnamed: 0,domain,expires,name,path,sameSite,secure,value,request_url,site,appr_pref_domain,appr_pref_name,rej_pref_domain,rej_pref_name,comply
1,.wework.com,1.671518e+09,ajs_anonymous_id,/,Lax,False,%22ef8417d2-09a6-41af-be22-e46a6e0ea92d%22,https://www.wework.com/vanilla-assets/fonts/ic...,wework.com,,,wework.com,ajs_anonymous_id,incorrect
2,.wework.com,1.671518e+09,ajs_user_id,/,Lax,False,%22b22ada9465f804d5ec485afec1845a7c8078ec584f5...,https://www.wework.com/vanilla-assets/fonts/ic...,wework.com,,,wework.com,ajs_user_id,incorrect
4,.wework.com,1.671518e+09,ajs_user_id,/,Lax,False,%22b22ada9465f804d5ec485afec1845a7c8078ec584f5...,https://www.wework.com/vanilla-assets/images/w...,wework.com,,,wework.com,ajs_user_id,incorrect
18,.wework.com,1.671518e+09,ajs_anonymous_id,/,Lax,False,%22ef8417d2-09a6-41af-be22-e46a6e0ea92d%22,https://www.wework.com/vanilla-assets/javascri...,wework.com,,,wework.com,ajs_anonymous_id,incorrect
21,.wework.com,1.671518e+09,ajs_anonymous_id,/,Lax,False,%22ef8417d2-09a6-41af-be22-e46a6e0ea92d%22,https://www.wework.com/vanilla-assets/javascri...,wework.com,,,wework.com,ajs_anonymous_id,incorrect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2968878,.sentinelone.com,1.673722e+09,_uetvid,/,,False,3a1c97c061c511ec80edd78f4b9f5881,https://go.sentinelone.com/index.php/form/getF...,sentinelone.com,,,sentinelone.com,_uetvid,incorrect
2968879,.sentinelone.com,1.640112e+09,_gid,/,,False,GA1.2.1958490450.1640026025,https://www.sentinelone.com/global-services/su...,sentinelone.com,,,sentinelone.com,_gid,incorrect
2968882,.sentinelone.com,1.703098e+09,_ga,/,,False,GA1.2.1830428201.1640026025,https://www.sentinelone.com/wp-content/themes/...,sentinelone.com,,,sentinelone.com,_ga,incorrect
2968883,.sentinelone.com,1.640112e+09,_uetsid,/,,False,3a1c7ac061c511ec900301a8f58da293,https://www.sentinelone.com/wp-content/themes/...,sentinelone.com,,,sentinelone.com,_uetsid,incorrect


In [None]:
# Display the full classification table (the notebook shows it truncated).
all_complies

Unnamed: 0,domain,expires,name,path,sameSite,secure,value,request_url,site,appr_pref_domain,appr_pref_name,rej_pref_domain,rej_pref_name,comply
0,www.wework.com,1.671518e+09,__we_bucket_id,/,Lax,True,b22ada9465f804d5ec485afec1845a7c8078ec584f59b9...,https://www.wework.com/vanilla-assets/javascri...,wework.com,www.wework.com,__we_bucket_id,,,comply
1,.wework.com,1.671518e+09,ajs_anonymous_id,/,Lax,False,%22ef8417d2-09a6-41af-be22-e46a6e0ea92d%22,https://www.wework.com/vanilla-assets/fonts/ic...,wework.com,,,wework.com,ajs_anonymous_id,incorrect
2,.wework.com,1.671518e+09,ajs_user_id,/,Lax,False,%22b22ada9465f804d5ec485afec1845a7c8078ec584f5...,https://www.wework.com/vanilla-assets/fonts/ic...,wework.com,,,wework.com,ajs_user_id,incorrect
3,www.wework.com,1.641192e+09,we_referring_domain,/,,False,,https://www.wework.com/vanilla-assets/fonts/ic...,wework.com,www.wework.com,we_referring_domain,,,comply
4,.wework.com,1.671518e+09,ajs_user_id,/,Lax,False,%22b22ada9465f804d5ec485afec1845a7c8078ec584f5...,https://www.wework.com/vanilla-assets/images/w...,wework.com,,,wework.com,ajs_user_id,incorrect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2968887,.sentinelone.com,1.647802e+09,_gcl_au,/,,False,1.1.389756944.1640026024,https://www.sentinelone.com/wp-content/themes/...,sentinelone.com,,,,,omit
2968888,www.sentinelone.com,1.640027e+09,_omappvs,/,Lax,True,1640026095614,https://www.sentinelone.com/wp-content/themes/...,sentinelone.com,,,,,omit
2968889,www.sentinelone.com,1.985540e+09,_omappvp,/,Lax,True,NwHAA1fbfuJgbuvESfogOlN0xA29aMIh18e6I71aFb6pbU...,https://www.sentinelone.com/wp-content/themes/...,sentinelone.com,,,,,omit
2968890,.6sc.co,1.703098e+09,6suuid,/,,True,5e3a5b68fe100000a8cfc06177000000de442900,https://b.6sc.co/v1/beacon/img.gif?token=3576c...,sentinelone.com,,,,,omit
