In [1]:
import pandas as pd


from consent.consistency.util import get_scan_dirs, get_scan_root_dir
from consent.util.default_path import get_data_dir

SCAN_DIRS = get_scan_dirs('eu')
SCAN_ROOT_DIR = get_scan_root_dir('eu')
suffix = '_' + '0k_200k'

In [24]:
cookie_prefs_files = SCAN_ROOT_DIR/'cookie_prefs_0k_200k.parquet'
cookie_prefs = pd.read_parquet(cookie_prefs_files) # pd.read_parquet(cookie_prefs_file)
cookie_prefs.head()

Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name,consent
0,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
1,OptanonAlertBoxClosed,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
2,OptanonConsent,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
3,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True
4,we_referring_domain,www.wework.com,14 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json,True


In [2]:
# TODO: add some flags, this is from pref_category_notebook.ipynb
data_dir = get_data_dir('2022-05-30') # '2022-01-14')
cookie_decls_file = data_dir / f'cookie_decls{suffix}.parquet'
raw_cookie_decls = pd.read_parquet(cookie_decls_file)
# raw_cookie_decls = read_cookie_decls_in_scans(SCAN_DIRS)
print(f"Num cookie declarations: {len(raw_cookie_decls):,d}")
raw_cookie_decls.head()

Num cookie declarations: 827,875


Unnamed: 0,name,domain,duration,category_id,category,consent_mode,site,lib_name,pattern_name
0,__we_bucket_id,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json
1,OptanonAlertBoxClosed,wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json
2,OptanonConsent,www.wework.com,365 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json
3,__we_request_id,www.wework.com,Session,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json
4,we_referring_domain,www.wework.com,14 days,C0001,Strictly Necessary Cookies,always active,wework.com,onetrust,en.json


In [3]:
# Need to read this to report only the sites that have been analyzed
all_complies_file = SCAN_ROOT_DIR / f'all_complies{suffix}.parquet'
all_complies = pd.read_parquet(all_complies_file)
all_complies.head()

Unnamed: 0,name,domain,path,site,comply
0,OptanonAlertBoxClosed,.www.wework.com,/,wework.com,comply
1,ajs_user_id,.wework.com,/,wework.com,incorrect
2,we_referring_domain,www.wework.com,/,wework.com,comply
3,OptanonConsent,.www.wework.com,/,wework.com,comply
4,__we_bucket_id,www.wework.com,/,wework.com,comply


In [4]:
site_to_lib = {}
for row in raw_cookie_decls[['site', 'lib_name']].drop_duplicates().itertuples():
    site_to_lib[row.site] = row.lib_name
all_complies['lib_name'] = all_complies['site'].map(site_to_lib)

In [5]:
site_libs = all_complies[['site', 'lib_name']].drop_duplicates()

In [6]:
lib_counts = site_libs['lib_name'].value_counts().to_dict()
lib_counts

{'onetrust': 5221, 'cookiebot': 776, 'termly': 53}

In [31]:
site_to_libname = {row['site']: row['lib_name'] for row in cookie_prefs[['site', 'lib_name']].drop_duplicates().to_dict('records')}
all_complies['lib_name'] = all_complies.site.map(site_to_libname)
all_complies_libs = all_complies.drop_duplicates(subset='site').lib_name.value_counts()
print(all_complies_libs.sum())
# TODO: why this difference??
lib_counts = all_complies_libs
all_complies_libs

6050


onetrust     5219
cookiebot     778
termly         53
Name: lib_name, dtype: int64

In [32]:
inconsist = all_complies[all_complies.comply == 'incorrect'][['site', 'lib_name']].drop_duplicates()
inconsist.head()

Unnamed: 0,site,lib_name
1,wework.com,onetrust
14,southernliving.com,onetrust
59,health.com,onetrust
76,justpremium.com,onetrust
94,menlosecurity.com,onetrust


In [37]:
inconsist_libs = inconsist['lib_name'].value_counts().to_frame().reset_index().rename(columns={'lib_name': 'count', 'index': 'lib_name'})
inconsist_libs['normalized'] = inconsist_libs.apply(lambda row: row['count'] / lib_counts[row['lib_name']], axis=1)
inconsist_libs['count_percent'] = inconsist_libs.apply(lambda row: f"{row['count']:,d} ({row['normalized']*100:.2f}%)", axis=1)
print("Inconsistent libraries (normalized by number of library instances)")
latex_df = inconsist_libs[['lib_name', 'count_percent']].copy()
latex_df.rename(columns={'lib_name': 'CMP', 'count_percent': '# Violations'}, inplace=True)
latex_df.replace({'onetrust': '\onetrust', 'cookiebot': '\cookiebot', 'termly': r'\termly'}, inplace=True)
print(latex_df.style.hide(axis='index').format_index(escape='latex', axis=1).format(escape='latex').to_latex(hrules=True))

Inconsistent libraries (normalized by number of library instances)
\begin{tabular}{ll}
\toprule
CMP & \# Violations \\
\midrule
\textbackslash onetrust & 4,370 (83.73\%) \\
\textbackslash cookiebot & 553 (71.08\%) \\
\textbackslash termly & 50 (94.34\%) \\
\bottomrule
\end{tabular}

