In [1]:
""" Analyze opt-out cookies. """
from collections import defaultdict, Counter
from multiprocessing import Pool
from pathlib import Path
from typing import Dict
import json
import re

from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


from consent.data.site_pref import read_site_prefs
from consent.data.postrej_cookie_reader import parallel_read_postrej_sent_cookies
from consent.util.default_path import get_data_dir
from ooutil.cookie_util import get_brower_cookies
from ooutil.type_util import hashabledict
from ooutil.url_util import get_suffixed_domain


# Scan snapshot under analysis: the cells below glob the per-site directories
# from `data_dir` and cache the raw cookie table inside it.
# The commented-out lines are other snapshots; enable exactly one assignment.
# data_dir = get_data_dir('2021-04-27/pref_menu_scan')
# data_dir = get_data_dir('2021-04-28/pref_menu_scan')
# data_dir = get_data_dir('2021-04-30/pref_menu_scan')
# data_dir = get_data_dir('2021-05-01/pref_menu_scan')
data_dir = get_data_dir('2021-05-03/pref_menu_scan_cleanstart')


# fig_dir = Path.home() / 'local_projects/consent/paper/sp22/figures'
# assert fig_dir.exists() and fig_dir.is_dir()

# # opt_cookies_file = get_data_dir('2021-04-29') / 'evidon_opt_cookies.tsv'
# # assert opt_cookies_file.exists()

In [2]:
# Each sub-directory of the scan folder is treated as one scanned site.
site_dirs = [entry for entry in data_dir.glob('*') if entry.is_dir()]
scanned_sites = [entry.name for entry in site_dirs]
print("Number of scanned sites:", len(scanned_sites))
# Load the cookie preferences found in the per-site directories.
site_prefs = read_site_prefs(site_dirs)
site_prefs

Number of scanned sites: 227
Contain duplicates: Before 12937, after drop duplicates: 12927


Unnamed: 0,duration,host,name,group_id,site,consent
0,Session,healthgrades.com,_vapi,C0001,healthgrades.com,True
1,365 days,partners.healthgrades.com,OptanonConsent,C0001,healthgrades.com,True
2,a few seconds,healthgrades.com,_dc_gtm_UA-xxxxxxxx,C0001,healthgrades.com,True
3,2914169 days,partners.healthgrades.com,eupubconsent,C0001,healthgrades.com,True
4,365 days,healthgrades.com,OptanonAlertBoxClosed,C0001,healthgrades.com,True
...,...,...,...,...,...,...
12932,390 days,yieldoptimizer.com,ph,SPD_BG,bestwestern.com,False
12933,390 days,yieldoptimizer.com,dph,SPD_BG,bestwestern.com,False
12934,390 days,yieldoptimizer.com,gcma,SPD_BG,bestwestern.com,False
12935,390 days,yieldoptimizer.com,ckid,SPD_BG,bestwestern.com,False


In [3]:
# Distinct sites that appear in the preference table, in sorted order.
cookielist_sites = site_prefs['site'].unique().tolist()
cookielist_sites.sort()
n_cookielist_sites = len(cookielist_sites)
# Sanity check: the sorted list and pandas' own distinct count must agree.
assert n_cookielist_sites == site_prefs['site'].nunique()
print("Sites with cookie list:", n_cookielist_sites, cookielist_sites)

Sites with cookie list: 83 ['accorhotels.com', 'adtelligent.com', 'apachefriends.org', 'behance.net', 'bestwestern.com', 'biomedcentral.com', 'bitnami.com', 'bodybuilding.com', 'britishcouncil.org', 'callofduty.com', 'canon.com', 'cell.com', 'chanel.com', 'christies.com', 'cloudflare.com', 'cnet.com', 'commonsensemedia.org', 'corel.com', 'digicert.com', 'digitalspy.com', 'documentforce.com', 'download.com', 'elsevier.com', 'elsevierhealth.com', 'exacttarget.com', 'exoclick.com', 'fairmont.com', 'fendi.com', 'force.com', 'gamespot.com', 'glassdoor.co.in', 'glassdoor.com', 'goal.com', 'healthgrades.com', 'home.kpmg', 'hotjar.com', 'irishtimes.com', 'kpmg.com', 'magento.com', 'mango.com', 'marketo.com', 'mcdonalds.com', 'media.net', 'mendeley.com', 'metacritic.com', 'myportfolio.com', 'nature.com', 'netflix.com', 'news24.com', 'nflxvideo.net', 'oclc.org', 'onetrust.com', 'pardot.com', 'payoneer.com', 'pendo.io', 'rapidssl.com', 'redbull.com', 'rte.ie', 'salesforce.com', 'scientificamerica

In [7]:
# The raw cookie table is cached as parquet next to the scan data so that
# re-running the notebook does not have to re-read every site directory.
raw_postrej_sent_cookies_file = data_dir / 'raw_postrej_sent_cookies.parquet'
if not raw_postrej_sent_cookies_file.exists():
    # Cache miss: read the cookies from the scan directory and persist them.
    raw_postrej_sent_cookies = parallel_read_postrej_sent_cookies(data_dir)
    raw_postrej_sent_cookies.to_parquet(raw_postrej_sent_cookies_file)
    print(f"Written to {raw_postrej_sent_cookies_file}")
else:
    # Cache hit: reuse the table written by an earlier run.
    raw_postrej_sent_cookies = pd.read_parquet(raw_postrej_sent_cookies_file)

# De-duplicated view used by the analysis cells; the raw frame is left untouched.
postrej_sent_cookies = raw_postrej_sent_cookies.drop_duplicates()

raw_postrej_sent_cookies.head(3)

Unnamed: 0,domain,expires,httpOnly,name,path,sameSite,secure,value,request_url,site,load_start_time,load_end_time
0,.siemens.com,1651627000.0,False,OptanonAlertBoxClosed,/,,True,2021-05-04T01:24:38.671Z,https://www.siemens.com/css/chunk-618b9f60.9ae...,siemens.com,1620091000.0,1620091000.0
1,.siemens.com,-1.0,False,s_cc,/,,False,true,https://www.siemens.com/js/chunk-b883de0a.ff4e...,siemens.com,1620091000.0,1620091000.0
2,.siemens.com,1651627000.0,False,OptanonAlertBoxClosed,/,,True,2021-05-04T01:24:38.671Z,https://www.siemens.com/js/chunk-0268bb8f.a48a...,siemens.com,1620091000.0,1620091000.0


In [8]:
# Row counts before and after de-duplication, one per line.
print(
    f"Num raw captured cookies: {len(raw_postrej_sent_cookies)}",
    f"Num unique captured cookies: {len(postrej_sent_cookies)}",
    sep="\n",
)

Num raw captured cookies: 958458
Num unique captured cookies: 958458


## Analyze opt-out cookies

In [9]:
# Keep only the captured cookies whose (name, value) pair exactly matches one of
# the sample opt-out cookies stored in combine_opt_cookies.json.
# The JSON file holds a list of [name, value] pairs; they become tuples so that
# they are hashable and can be looked up in a set.
opt_cookies = {
    (cookie[0], cookie[1])
    for cookie in json.loads((get_data_dir('2021-04-29') / 'combine_opt_cookies.json').read_text())
}
# Immutable snapshot used by is_opt_cookie(). The global name `opt_cookies` is
# rebound to a plain list further down the notebook; a function that looked the
# global up at call time would then silently stop matching anything.
_KNOWN_OPT_COOKIE_PAIRS = frozenset(opt_cookies)


def is_opt_cookie(row, known_pairs=_KNOWN_OPT_COOKIE_PAIRS):
    """Return True when the row's (name, value) pair is a known opt-out cookie.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with 'name' and 'value'.
        known_pairs: set of (name, value) tuples to match against. It is bound
            when the function is defined, so later rebinding of the global
            `opt_cookies` cannot change the result.

    Returns:
        bool: whether (row['name'], row['value']) is in `known_pairs`.
    """
    return (row['name'], row['value']) in known_pairs


optfil_cookies = postrej_sent_cookies[postrej_sent_cookies.apply(is_opt_cookie, axis=1)]
print(f"Number of matched opt-out cookies: {len(optfil_cookies)}")

Number of matched opt-out cookies: 188


In [10]:
# One row per distinct (domain, name, value, site) among the exact matches.
(
    optfil_cookies
    .loc[:, ['domain', 'name', 'value', 'site']]
    .drop_duplicates()
    .reset_index(drop=True)
)

Unnamed: 0,domain,name,value,site
0,.barnesandnoble.com,NoCookie,true,barnesandnoble.com
1,.quantserve.com,qoo,OPT_OUT,health.com


In [11]:
# Keywords that signal an opt-out when found anywhere in the cookie name (unless
# the value is an explicit "off" flag) or at the start of the cookie value.
_OPT_OUT_KEYWORD_RES = [
    re.compile(expr)
    for expr in (r"opt(ed)?[-_']?out", r"dnt", r"no_?track", r"_oo$", r"no_?cookie", r"deleted", "notarget")
]
# Name or value consisting solely of "oo" / "opt", optionally prefixed by "_".
_OPT_OUT_TOKEN_RE = re.compile(r'_?(oo|opt)$')
# Identifier-style cookie names ...
_USER_ID_NAME_RES = [re.compile(expr) for expr in (r'(_|t)?u?uid\d?', r'tdid')]
# ... whose value has been replaced by a placeholder. This is not unique to
# opt-out cookies, so it is only trusted in combination with the names above.
_BLANKED_ID_VALUE_RES = [re.compile(expr) for expr in (r"^-1$", r"^nan$", r"^-2$", r"^[0\-]+$")]


def _cookie_text(field):
    """Lower-case a cookie field, tolerating None and non-string values."""
    return '' if field is None else str(field).lower()


def predict(cookie):
    """Heuristically decide whether a cookie looks like an opt-out cookie.

    Args:
        cookie: mapping-like object (a dict or a DataFrame row) providing
            'name' and 'value', and optionally 'expires'. Non-string names and
            values are converted with str(); None is treated as an empty string.

    Returns:
        bool: True when the name/value pair matches one of the opt-out
        heuristics, False otherwise.
    """
    # Cookies with expires == -1 (presumably session cookies) are never
    # reported as opt-out cookies.
    if cookie.get('expires') == -1:
        return False

    name, val = _cookie_text(cookie['name']), _cookie_text(cookie['value'])

    # A keyword in the name only counts when the value is not an explicit
    # "off" flag; a keyword at the start of the value always counts.
    name_may_signal = val not in ('false', '0')
    for keyword_re in _OPT_OUT_KEYWORD_RES:
        if (name_may_signal and keyword_re.search(name)) or keyword_re.match(val):
            return True

    if _OPT_OUT_TOKEN_RE.match(name) or _OPT_OUT_TOKEN_RE.match(val):
        return True

    # The name test does not depend on the value pattern, so it is evaluated
    # once instead of once per value pattern.
    if any(name_re.search(name) for name_re in _USER_ID_NAME_RES) and any(
        value_re.match(val) for value_re in _BLANKED_ID_VALUE_RES
    ):
        return True

    # NOTE: a former special case for name "avnguid" with value "deleted" was
    # dropped because it was unreachable: the keyword loop above already
    # returns True for every value that starts with "deleted".

    return name == 'track' and val == '0'

# Run the heuristic over every captured cookie, then keep one row per distinct
# (domain, name, value, site) and order the rows by site.
filtered_out = (
    postrej_sent_cookies[postrej_sent_cookies.apply(predict, axis=1)]
    .loc[:, ['domain', 'name', 'value', 'site']]
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values(by='site')
)


In [12]:
# Exclude first-party opt-out
def match_domain_site(row):
    """Return True when the cookie domain belongs to the visited site (first party).

    The domain counts as first-party when it equals the site or is a
    sub-domain of it. The comparison is done on whole DNS labels, so
    'notexample.com' is NOT treated as part of 'example.com' (a bare
    str.endswith would accept it). Leading dots (cookie-domain notation such
    as '.example.com') and letter case are ignored.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with 'domain' and 'site'.

    Returns:
        bool: True for a first-party domain, False otherwise (including when
        the site is empty).
    """
    domain = row['domain'].lstrip('.').lower()
    site = row['site'].lstrip('.').lower()
    if not site:
        # An empty site would otherwise match every domain.
        return False
    return domain == site or domain.endswith('.' + site)
# Flag the rows whose cookie domain matches the visited site, then drop them so
# that only third-party opt-out cookies remain.
is_first_party = filtered_out.apply(match_domain_site, axis=1)
third_filtered_out = filtered_out[~is_first_party].reset_index(drop=True)
print(f"Number of detected opt-out cookie: {len(third_filtered_out)}")
third_filtered_out

Number of detected opt-out cookie: 18


Unnamed: 0,domain,name,value,site
0,.infolinks.com,R1USERCOOKIE,OPTOUT,abc7.com
1,.infolinks.com,R1USERCOOKIE,OPTOUT,breitbart.com
2,.servenobid.com,pid_321,OPTOUT,complex.com
3,.lijit.com,_ljtrtb_56,OPTOUT,dictionary.com
4,.salesforce.com,optimizelyOptOut,true,documentforce.com
5,www.ebsco.com,_an_uid,0,ebscohost.com
6,.salesforce.com,optimizelyOptOut,true,exacttarget.com
7,.fastly.com,mutiny.defaultOptOut,true,fastly.net
8,.quantserve.com,qoo,OPT_OUT,health.com
9,.media.net,data-r1,OPTOUT~~6,health.com


In [15]:
# TODO: automatically merge with exact match
# Number of distinct sites on which a third-party opt-out cookie was detected.
print(
    "Num opt-out cookie-using websites:",
    third_filtered_out['site'].nunique(),
)


Num opt-out cookie-using websites: 15


In [13]:
# Measure how many of the sample opt-out cookies the predict() heuristic
# recognises, and list the ones it misses.
opt_cookies = json.loads((get_data_dir('2021-04-29') / 'combine_opt_cookies.json').read_text())
print("Number of sample opt-out cookies:", len(opt_cookies))

# Classify every sample exactly once; the same results feed both the coverage
# figure and the list of misses below.
predicted = [predict({'name': opt_cookie[0], 'value': str(opt_cookie[1])}) for opt_cookie in opt_cookies]

# Guard the ratio so that an empty sample file reports 0.00 instead of raising
# ZeroDivisionError.
coverage = Counter(predicted)[True] / len(opt_cookies) * 100 if opt_cookies else 0.0
print(f"Coverage of predict opt-out: {coverage:.2f}")

print("Uncovered prediction:")
for opt_cookie, is_covered in zip(opt_cookies, predicted):
    if not is_covered:
        print(opt_cookie)

Number of sample opt-out cookies: 217
Coverage of predict opt-out: 80.65
Uncovered prediction:
['JEB2', 'NOID']
['atdses', 'O']
['user_disabled_profile', '1']
['ymoo', 'true']
['CookieConsent', '-1']
['BKIgnore', '1']
['EROO', 'O']
['TapAd_TS', '0']
['opout', 'true']
['na_id', 'ignore']
['EVO5_OPT', '1']
['nout', '1']
['OT', '1']
['CMO', '2']
['DNP', '"1"']
['allowCookies', 'false']
['rooc', '1']
['_rlopt', 'out']
['out', '1']
['outcd', '1']
['wa', '!_dont_track_you']
['ad-privacy', '1']
['rud', 'h4siaaaaaaaaaopispypcin3dw0bagzwkcekaaaa']
['nuggstopp', 'true']
['disableClient', 'true']
['daa', '1']
['digitrust.v1.identity', 'eyjpzci6iiisinzlcnnpb24iojisinbyb2r1y2vyijoimunyc2rvtkfvniisimtlexyiojasinbyaxzhy3kionsib3b0b3v0ijp0cnvlfx0%3d']
['wt_cdbeid', '1']
['33x_nc', '33Across+Optout']
['vlybyid', 'ont']
['%2emookie1%2ecom/%2f/1/o', '0/cookie']
['Consent', '{"status":"OptOut","notified":[]}']
['catAccCookies', '1']
['flashtalkingad1', '"optout=1"']
['coo', '1']
['AJOO', '1']
['ENFORCE_PR