In [1]:
from multiprocessing import Pool
from pathlib import Path

from tqdm import tqdm
tqdm.pandas()
import pandas as pd
import dask.dataframe as dd

from consent.consistency.cookie_pref_match import cookie_pref_match
from consent.consistency.util import SCAN_DIRS, FIG_DIR, SCAN_ROOT_DIR
from consent.data.pref_menu_scan.cookie_pref_reader import read_cookie_prefs_in_scans
from consent.data.pref_menu_scan.log_reader import read_logs_in_scans
from consent.data.pref_menu_scan.cookie_decl_reader import read_cookie_decls_in_scans
from consent.data.pref_menu_scan.postrej_cookie_reader import read_postrej_sent_cookies_in_scans
from consent.data.pref_menu_scan.cat_pref_reader import read_cat_prefs_in_dirs
from consent.util.default_path import get_data_dir
from ooutil.type_util import hashabledict
from ooutil.url_util import get_suffixed_domain

# Data directory returned for the '2021-08-12' key, plus the three scan batch
# directories located directly beneath it.
scan_root_dir = get_data_dir('2021-08-12')
scan_dirs = [
    scan_root_dir / batch_name
    for batch_name in (
        'pref_menu_scan_0k_10k',
        'pref_menu_scan_10k_20k',
        'pref_menu_scan_20k_30k',
    )
]
# Fail fast when any of the batch directories is missing on disk.
assert not [scan_dir for scan_dir in scan_dirs if not scan_dir.exists()]

# Figure directory under the user's home; it must already exist as a directory.
fig_dir = Path.home().joinpath('local_projects/consent/paper/sp22/figures')
assert fig_dir.exists() and fig_dir.is_dir()

In [2]:
# Read in cookie transfer: load the sent-cookie table, going through a parquet cache.
import sys
import importlib

# Re-execute the already-imported reader module, then re-bind the reader function
# from the reloaded module so the call below uses the freshly loaded definition.
importlib.reload(sys.modules['consent.data.pref_menu_scan.postrej_cookie_reader'])
from consent.data.pref_menu_scan.postrej_cookie_reader import read_postrej_sent_cookies_in_scans

# `overwrite=True` bypasses an existing cache file; a falsy `cookies_cache_file`
# skips both reading from and writing to the cache.
overwrite = False
cookies_cache_file = SCAN_ROOT_DIR / 'raw_postrej_sent_cookies.parquet'

if overwrite or not cookies_cache_file or not cookies_cache_file.exists():
    # Cache unusable or bypassed: read from the scan directories and refresh the cache.
    sent_cookies = read_postrej_sent_cookies_in_scans(SCAN_DIRS)
    if cookies_cache_file:
        sent_cookies.to_parquet(cookies_cache_file)
        print(f"Written to {cookies_cache_file}")
else:
    sent_cookies = pd.read_parquet(cookies_cache_file)

print(f"Number sent cookies read: {len(sent_cookies):,d}")
sent_cookies.head(3)

Number sent cookies read: 5,196,675


Unnamed: 0,domain,expires,httpOnly,name,path,sameSite,secure,value,request_url,site,load_start_time,load_end_time,page_url
0,.apachefriends.org,1629067000.0,False,_gid,/,,False,GA1.2.720720591.1628980892,https://www.apachefriends.org/images/flags/fr-...,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html
1,.apachefriends.org,1660517000.0,False,OptanonAlertBoxClosed,/,Lax,False,2021-08-14T22:42:17.927Z,https://www.apachefriends.org/javascripts/all-...,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html
2,.apachefriends.org,1660517000.0,False,OptanonConsent,/,Lax,False,isIABGlobal=false&datestamp=Sat+Aug+14+2021+22...,https://www.apachefriends.org/images/flags/fr-...,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html


In [7]:
# Try out drop_duplicates() on a small sample: the first 1000 rows of `sent_cookies`.
samples = sent_cookies.iloc[:1000]
samples.drop_duplicates().head()

Unnamed: 0,domain,expires,httpOnly,name,path,sameSite,secure,value,request_url,site,load_start_time,load_end_time,page_url
0,.apachefriends.org,1629067000.0,False,_gid,/,,False,GA1.2.720720591.1628980892,https://www.apachefriends.org/images/flags/fr-...,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html
1,.apachefriends.org,1660517000.0,False,OptanonAlertBoxClosed,/,Lax,False,2021-08-14T22:42:17.927Z,https://www.apachefriends.org/javascripts/all-...,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html
2,.apachefriends.org,1660517000.0,False,OptanonConsent,/,Lax,False,isIABGlobal=false&datestamp=Sat+Aug+14+2021+22...,https://www.apachefriends.org/images/flags/fr-...,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html
3,.apachefriends.org,1629067000.0,False,_gid,/,,False,GA1.2.720720591.1628980892,https://www.apachefriends.org/images/flags/hu-...,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html
4,.apachefriends.org,1660517000.0,False,OptanonAlertBoxClosed,/,Lax,False,2021-08-14T22:42:17.927Z,https://www.apachefriends.org/index.html,apachefriends.org,1628981000.0,1628981000.0,https://www.apachefriends.org/index.html


In [17]:
# Build a Dask DataFrame with 32 partitions from the sample rows that remain
# after dropna() removes every row containing a missing value.
samples_ddf = dd.from_pandas(
    samples.dropna(),
    npartitions=32,
)
# Show the lazy frame's structure (column dtypes and partition divisions).
samples_ddf

Unnamed: 0_level_0,domain,expires,httpOnly,name,path,sameSite,secure,value,request_url,site,load_start_time,load_end_time,page_url
npartitions=32,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,object,float64,bool,object,object,object,bool,object,object,object,float64,float64,object
32,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,...,...,...,...,...,...,...,...,...,...,...,...,...
999,...,...,...,...,...,...,...,...,...,...,...,...,...


In [22]:
def afunc(row):
    """Flag a record whose ``domain`` value contains the substring ``'apache'``.

    Args:
        row: Any record indexable as ``row['domain']`` (e.g. a DataFrame row).

    Returns:
        ``1`` if ``'apache'`` occurs in ``row['domain']``, otherwise ``0``.
    """
    # The membership test yields a bool; int() maps True/False to 1/0.
    return int('apache' in row['domain'])
# Evaluate `afunc` on every row of the Dask frame and attach the result to the
# sample as a `count` column (values are matched to rows by index).
#
# `assign` builds a new frame instead of writing the column into `samples` in place:
# `samples` was obtained by slicing `sent_cookies`, and setting a column directly on
# such a slice triggers pandas' SettingWithCopyWarning. Re-running this cell simply
# recomputes and replaces the `count` column.
#
# NOTE(review): `samples_ddf` was built from `samples.dropna()`, so any row dropped
# there has no computed value and would receive NaN in `count` — confirm intended.
samples = samples.assign(
    count=samples_ddf.apply(afunc, axis=1, meta=('x', 'int64')).compute()
)
samples

Unnamed: 0,domain,expires,httpOnly,name,path,sameSite,secure,value,request_url,site,load_start_time,load_end_time,page_url,count
0,.apachefriends.org,1.629067e+09,False,_gid,/,,False,GA1.2.720720591.1628980892,https://www.apachefriends.org/images/flags/fr-...,apachefriends.org,1.628981e+09,1.628981e+09,https://www.apachefriends.org/index.html,1
1,.apachefriends.org,1.660517e+09,False,OptanonAlertBoxClosed,/,Lax,False,2021-08-14T22:42:17.927Z,https://www.apachefriends.org/javascripts/all-...,apachefriends.org,1.628981e+09,1.628981e+09,https://www.apachefriends.org/index.html,1
2,.apachefriends.org,1.660517e+09,False,OptanonConsent,/,Lax,False,isIABGlobal=false&datestamp=Sat+Aug+14+2021+22...,https://www.apachefriends.org/images/flags/fr-...,apachefriends.org,1.628981e+09,1.628981e+09,https://www.apachefriends.org/index.html,1
3,.apachefriends.org,1.629067e+09,False,_gid,/,,False,GA1.2.720720591.1628980892,https://www.apachefriends.org/images/flags/hu-...,apachefriends.org,1.628981e+09,1.628981e+09,https://www.apachefriends.org/index.html,1
4,.apachefriends.org,1.660517e+09,False,OptanonAlertBoxClosed,/,Lax,False,2021-08-14T22:42:17.927Z,https://www.apachefriends.org/index.html,apachefriends.org,1.628981e+09,1.628981e+09,https://www.apachefriends.org/index.html,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,.appdynamics.com,1.691883e+09,False,AMCV_B8D07FF4520E94C10A490D4C%40AdobeOrg,/,,False,281789898%7CMCIDTS%7C18852%7CMCMID%7C185840794...,https://www.appdynamics.com/c/r/appdynamics/pr...,appdynamics.com,1.628811e+09,1.628811e+09,https://www.appdynamics.com/product/end-user-m...,0
996,.appdynamics.com,1.628813e+09,False,_cs_mk,/,,False,0.648266833031006_1628811382587,https://www.appdynamics.com/etc/designs/cdc/fw...,appdynamics.com,1.628811e+09,1.628811e+09,https://www.appdynamics.com/product/end-user-m...,0
997,.appdynamics.com,-1.000000e+00,False,s_ptc,/,,False,%5B%5BB%5D%5D,https://www.appdynamics.com/etc/designs/r/appd...,appdynamics.com,1.628811e+09,1.628811e+09,https://www.appdynamics.com/product/end-user-m...,0
998,www.appdynamics.com,1.691883e+09,False,drift_aid,/,,True,98ccb210-ab51-4d1d-8def-d57f3591e4d9,https://www.appdynamics.com/etc/designs/r/appd...,appdynamics.com,1.628811e+09,1.628811e+09,https://www.appdynamics.com/product/end-user-m...,0
