In [1]:
from collections import defaultdict

from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from consent.consistency.util import get_scan_root_dir, get_scan_dirs, LOCATIONS
from ooutil.df_util import read_data_files
from ooutil.url_util import get_suffixed_domain

# Let pandas display up to 100 rows before truncating a rendered frame.
pd.set_option('display.max_rows', 100)

def process(location):
    """Summarise the consent violations detected at one scan location.

    Loads every ``cookie_prefs_*.parquet`` and ``all_complies_*.parquet`` file
    found directly under the location's scan root directory, collapses the
    flows to distinct ``(name, domain, path, site, comply)`` cookies, and
    counts cookies and sites for each ``comply`` value.

    Prints the number of sites with prefs, the number of sites with
    consistency results, the number of flows, and the number of cookies whose
    ``comply`` value is not ``'comply'``.

    Args:
        location: Location key handed to ``get_scan_dirs`` and
            ``get_scan_root_dir``; also stored in the ``Location`` column.

    Returns:
        pandas.DataFrame with the columns ``Violation Type``, ``# Cookies``,
        ``# Websites`` (formatted as ``"<pct>% (<sites>/<all sites>)"``) and
        ``Location``. There is one row per ``comply`` value other than
        ``'comply'``, ordered by number of affected sites, descending. The
        frame is empty when there is no such value.

    Raises:
        AssertionError: If the per-value cookie counts do not add up to the
            number of distinct cookies.
    """
    print(f"Process location {location}")
    # NOTE(review): the returned scan dirs are not used below; the call is kept
    # only in case it validates `location` -- confirm and drop it if it is pure.
    get_scan_dirs(location)
    scan_root_dir = get_scan_root_dir(location)

    cookie_prefs = read_data_files(list(scan_root_dir.glob('cookie_prefs_*.parquet')))
    all_complies = read_data_files(list(scan_root_dir.glob('all_complies_*.parquet')))

    nsites = all_complies.site.nunique()
    print(f'Number of sites with prefs: {cookie_prefs.site.nunique():,d}')
    print(f"Number of sites with consistency: {nsites:,d}")
    print(f'Number of flows analyzed: {len(all_complies):,d}')

    # all_complies has one row per flow; complies has one row per cookie.
    complies = all_complies[['name', 'domain', 'path', 'site', 'comply']].drop_duplicates()
    comply_counts = complies.comply.value_counts()
    # NOTE(review): value_counts() leaves out missing values by default, so this
    # check trips when `comply` is missing for some cookie. It does not by
    # itself catch a cookie listed with two different comply values (that
    # cookie is just counted once per value) -- confirm what was intended.
    assert len(complies) == comply_counts.sum(), "Some cookie have 2 different comply values"

    # A site counts towards a comply type as soon as one of its cookies has it.
    rows = [
        (comply_type, comply_group.site.nunique())
        for comply_type, comply_group in complies.groupby('comply')
    ]
    # Build and sort the frame once, after all groups are collected, so it
    # exists (with both columns) even when there are no groups at all.
    comply_sites = pd.DataFrame(rows, columns=['comply_type', 'num_sites']).sort_values(
        by=['num_sites'], ascending=False
    )
    comply_sites['num_sites_percent'] = comply_sites['num_sites'] / nsites * 100
    comply_sites['num_cookies'] = comply_sites['comply_type'].map(comply_counts)

    # Do not count comply/correct enforcement: only violations are reported.
    violations = comply_sites[comply_sites.comply_type != 'comply']

    n_cookies_analyzed = int(violations['num_cookies'].sum())
    print(f'Num cookies analyzed: {n_cookies_analyzed:,d}')

    latex = violations[['comply_type', 'num_cookies']].copy()
    # A plain list (instead of a row-wise apply) also works for an empty frame.
    latex['num_sites_combine'] = [
        f"{percent:.2f}% ({int(num_sites):,d}/{nsites:,d})"
        for percent, num_sites in zip(violations['num_sites_percent'], violations['num_sites'])
    ]
    latex = latex.rename(columns={'comply_type': 'Violation Type', 'num_sites_combine': '# Websites', 'num_cookies': '# Cookies'})
    latex = latex.replace({'omit': "Consent Omission", "incorrect": "Rejected Cookie Usage", "ambiguous": "Ambiguous Consent", "comply": "Correct Enforcement"})
    latex['Location'] = location

    return latex

In [2]:
# One violation summary frame per scan location, in LOCATIONS order.
dfs = [process(location) for location in LOCATIONS]

Process location capetown


100%|██████████| 1/1 [00:00<00:00,  4.25it/s]
100%|██████████| 1/1 [00:00<00:00, 34.41it/s]


Number of sites with prefs: 1,368
Number of sites with consistency: 1,361
Number of flows analyzed: 49,018
Num cookies analyzed: 37,106
Process location ireland


100%|██████████| 1/1 [00:00<00:00,  4.23it/s]
100%|██████████| 1/1 [00:00<00:00, 46.73it/s]


Number of sites with prefs: 1,444
Number of sites with consistency: 1,438
Number of flows analyzed: 29,694
Num cookies analyzed: 22,196
Process location london


100%|██████████| 1/1 [00:00<00:00,  5.43it/s]
100%|██████████| 1/1 [00:00<00:00, 41.86it/s]


Number of sites with prefs: 1,355
Number of sites with consistency: 1,350
Number of flows analyzed: 28,668
Num cookies analyzed: 21,507
Process location mi


100%|██████████| 1/1 [00:00<00:00,  5.19it/s]
100%|██████████| 1/1 [00:00<00:00, 28.36it/s]


Number of sites with prefs: 1,394
Number of sites with consistency: 1,391
Number of flows analyzed: 64,341
Num cookies analyzed: 51,163
Process location sf


100%|██████████| 1/1 [00:00<00:00,  4.15it/s]
100%|██████████| 1/1 [00:00<00:00, 30.78it/s]


Number of sites with prefs: 1,444
Number of sites with consistency: 1,439
Number of flows analyzed: 58,035
Num cookies analyzed: 48,557
Process location singapore


100%|██████████| 1/1 [00:00<00:00,  4.99it/s]
100%|██████████| 1/1 [00:00<00:00, 33.59it/s]


Number of sites with prefs: 1,330
Number of sites with consistency: 1,323
Number of flows analyzed: 49,172
Num cookies analyzed: 37,609
Process location sydney


100%|██████████| 1/1 [00:00<00:00,  5.08it/s]
100%|██████████| 1/1 [00:00<00:00, 32.41it/s]


Number of sites with prefs: 1,303
Number of sites with consistency: 1,300
Number of flows analyzed: 51,509
Num cookies analyzed: 39,507
Process location toronto


100%|██████████| 1/1 [00:00<00:00,  5.05it/s]
100%|██████████| 1/1 [00:00<00:00, 53.02it/s]


Number of sites with prefs: 1,351
Number of sites with consistency: 1,081
Number of flows analyzed: 23,753
Num cookies analyzed: 18,056


In [3]:
# Stack the per-location summaries and index them by (location, violation type).
all_complies = (
    pd.concat(dfs, ignore_index=True)
    .set_index(['Location', 'Violation Type'])
)
all_complies

Unnamed: 0_level_0,Unnamed: 1_level_0,# Cookies,# Websites
Location,Violation Type,Unnamed: 2_level_1,Unnamed: 3_level_1
capetown,Consent Omission,22938,"91.26% (1,242/1,361)"
capetown,Rejected Cookie Usage,14099,"78.03% (1,062/1,361)"
capetown,Ambiguous Consent,69,"4.26% (58/1,361)"
ireland,Consent Omission,12445,"88.60% (1,274/1,438)"
ireland,Rejected Cookie Usage,9696,"79.55% (1,144/1,438)"
ireland,Ambiguous Consent,55,"3.34% (48/1,438)"
london,Consent Omission,12187,"89.26% (1,205/1,350)"
london,Rejected Cookie Usage,9271,"81.48% (1,100/1,350)"
london,Ambiguous Consent,49,"3.11% (42/1,350)"
mi,Consent Omission,35203,"92.24% (1,283/1,391)"
