In [1]:
from itertools import chain
from pathlib import Path
from collections import defaultdict

import pandas as pd

from consent.util.default_path import get_data_dir

# Annotation folders for the "brian" label set, all located under the user's
# home directory. The order of this list is kept as-is because it fixes the
# row order of any table built by iterating over it.
brian_anno_dirs = [
    Path.home() / relative_dir
    for relative_dir in (
        'Dropbox/projects/consent/shared_files/consentchk/brian_2021-07-31_cookieset_top_5k/second_half/brian/us',
        'Dropbox/projects/consent/shared_files/consentchk/brian_2021-07-31_cookieset_top_5k/first_half/brian/us',
        'Dropbox/projects/consent/shared_files/consentchk/brian_2021-07-28_sample_500_5k_10k/first_half/brian/us',
        'Dropbox/projects/consent/shared_files/consentchk/brian_2021-07-28_sample_500_5k_10k/second_half/brian/us',
    )
]
# Fail early if any of the folders is missing on this machine.
assert all(map(Path.exists, brian_anno_dirs))

In [2]:
def get_sites(label_dir):
    """Return the stems of all regular files found recursively under ``label_dir``.

    Entries whose stem ends with ``_bottom`` are skipped, as are directories.
    The result follows the order in which ``Path.glob`` yields the entries.
    """
    sites = []
    for entry in label_dir.glob('**/*'):
        stem = entry.stem
        # NOTE(review): '_bottom' files look like secondary captures of a site
        # that is already represented by another file -- confirm.
        if stem.endswith('_bottom'):
            continue
        if entry.is_file():
            sites.append(stem)
    return sites

def get_labels(anno_dir):
    """Collect one row per annotated site from a two-level label directory tree.

    The expected layout is ``anno_dir/<high_label>/<low_label>/...files...``.
    Every site returned by ``get_sites`` for a ``<low_label>`` folder yields a
    row with:

    * ``label`` -- ``"<high_label>_<low_label>"``
    * ``site``  -- the site name as returned by ``get_sites``
    * ``path``  -- ``"<anno_dir name>/<high_label>/<low_label>"``

    Returns:
        pandas.DataFrame with the columns ``label``, ``site`` and ``path``.
        The columns are present even when no site is found, so that callers
        can still merge on ``site`` instead of hitting a ``KeyError``.
    """
    columns = ['label', 'site', 'path']
    data = []
    for high_label_dir in anno_dir.glob('*'):
        for low_label_dir in high_label_dir.glob('*'):
            label = f"{high_label_dir.name}_{low_label_dir.name}"
            # Identical for every site in this folder, so build it only once.
            path = f"{anno_dir.name}/{high_label_dir.name}/{low_label_dir.name}"
            data.extend(
                {'label': label, 'site': site, 'path': path}
                for site in get_sites(low_label_dir)
            )
    return pd.DataFrame(data, columns=columns)

# Stack the label tables of all "brian" annotation folders into one frame.
brian_labels = pd.concat([get_labels(adir) for adir in brian_anno_dirs])
# This is a row count; it equals the number of sites only when no site
# appears in more than one folder.
print("Num sites:", len(brian_labels))
brian_labels.head()

Num sites: 994


Unnamed: 0,label,site,path
0,no_setting_no_banner,tasteofhome.com,us/no_setting/no_banner
1,no_setting_no_banner,scholastic.com,us/no_setting/no_banner
2,no_setting_no_banner,yorku.ca,us/no_setting/no_banner
3,no_setting_no_banner,perl.com,us/no_setting/no_banner
4,no_setting_no_banner,smallseotools.com,us/no_setting/no_banner


In [3]:
# print(brian_labels.site.unique())

In [4]:
# Annotation folders for the "duc" label set, resolved through get_data_dir.
# Previously used sources, kept for reference:
#   get_data_dir('2021-07-28/sample_500_5k_10k'), get_data_dir('2021-07-31') / 'cookieset_top_5k', get_data_dir('2021-08-03') / 'sample_45_10k'
duc_anno_dirs = [
    get_data_dir(relative_dir)
    for relative_dir in (
        '2021-11-26/merge_pref_btn_dataset/07_28_5k_10k',
        '2021-11-26/merge_pref_btn_dataset/07_31_top_5k',
    )
]

# Fail early if any of the folders is missing.
for adir in duc_anno_dirs:
    assert adir.is_dir()

# Stack the per-folder label tables into one frame.
duc_labels = pd.concat([get_labels(adir) for adir in duc_anno_dirs])
print("Num sites:", len(duc_labels))
duc_labels.head()

Num sites: 1000


Unnamed: 0,label,site,path
0,cookie_setting_banner_with_setting,campaignlive.co.uk,07_28_5k_10k/cookie_setting/banner_with_setting
1,cookie_setting_banner_with_setting,tomtom.com,07_28_5k_10k/cookie_setting/banner_with_setting
2,cookie_setting_banner_with_setting,hyundai.com,07_28_5k_10k/cookie_setting/banner_with_setting
3,cookie_setting_banner_with_setting,uniregistry.com,07_28_5k_10k/cookie_setting/banner_with_setting
4,cookie_setting_banner_with_setting,highcharts.com,07_28_5k_10k/cookie_setting/banner_with_setting


In [5]:
# assert set(duc_labels.label.unique()) == set(brian_labels.label.unique())

In [6]:
# Left-join on site with brian's rows as the base: every brian row is kept and
# the duc columns are NaN for sites that duc did not annotate.
iaa_labels = pd.merge(
    brian_labels,
    duc_labels,
    how='left',
    on='site',
    suffixes=['_brian', '_duc'],
)
iaa_labels.head()

Unnamed: 0,label_brian,site,path_brian,label_duc,path_duc
0,no_setting_no_banner,tasteofhome.com,us/no_setting/no_banner,no_setting_no_banner,07_31_top_5k/no_setting/no_banner
1,no_setting_no_banner,scholastic.com,us/no_setting/no_banner,no_setting_no_banner,07_31_top_5k/no_setting/no_banner
2,no_setting_no_banner,yorku.ca,us/no_setting/no_banner,no_setting_no_banner,07_31_top_5k/no_setting/no_banner
3,no_setting_no_banner,perl.com,us/no_setting/no_banner,no_setting_no_banner,07_31_top_5k/no_setting/no_banner
4,no_setting_no_banner,smallseotools.com,us/no_setting/no_banner,no_setting_banner_no_choice,07_31_top_5k/no_setting/banner_no_choice


In [7]:
# all-in-one comparison
# Rebuild duc's label table and left-join brian's labels onto it, so every
# duc row is kept and the brian columns are NaN where brian has no annotation.
for adir in duc_anno_dirs:
    assert adir.is_dir()
duc_labels = pd.concat([get_labels(adir) for adir in duc_anno_dirs])
# Alternative join direction, kept for reference:
# iaa_labels = brian_labels.merge(duc_labels, how='left', on='site', suffixes=['_brian', '_duc'])
iaa_labels = pd.merge(
    duc_labels,
    brian_labels,
    how='left',
    on='site',
    suffixes=['_duc', '_brian'],
)

# Sites to exclude from the comparison (currently none). Earlier notes:
# To add Adchoice; 'careerbuilder.com', 'foodnetwork.com',  # form-filling requests
# 'fox8.com', 'freep.com', 'adcolony.com', 'bostonherald.com', 'lolesports.com', 'bodybuilding.com', 'commonsensemedia.org', 'formula1.com', 'hbomax.com', 'cntraveler.com', 'churchofjesuschrist.org', 'glassdoor.com', 'deseret.com', 'core.ac.uk', 'instyle.com', 'dictionary.com', 'androidpolice.com', 'licdn.com', 'goodhousekeeping.com', 'allure.com', # these contain cookie setting annotation
# 'avocet.io'  # contain cookie setting
fixing_sites = []
iaa_labels = iaa_labels[~iaa_labels.site.isin(fixing_sites)]

# Rows where the two annotators disagree. A missing brian label (NaN) compares
# as different, so sites annotated only by duc are listed here as well.
diff = (
    iaa_labels
    .loc[lambda frame: frame.label_brian != frame.label_duc]
    .sort_values(by=['label_brian', 'label_duc'])
    .reset_index(drop=True)
)
# Optional narrowing filters, kept for reference:
# diff = diff.dropna()
# diff = diff[diff.label_duc.str.startswith('cookie_setting')].reset_index(drop=True)
# diff = diff[diff.label_brian.str.startswith('cookie_setting')].reset_index(drop=True)
# print(diff.site.tolist())
print("Total sites", iaa_labels.site.nunique())
print("Num diff sites:", len(diff))
diff # diff.head()

Total sites 1000
Num diff sites: 43


Unnamed: 0,label_duc,site,path_duc,label_brian,path_brian
0,cookie_setting_banner_no_choice,adcolony.com,07_31_top_5k/cookie_setting/banner_no_choice,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting
1,cookie_setting_no_banner,avocet.io,07_28_5k_10k/cookie_setting/no_banner,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting
2,cookie_setting_no_banner,freep.com,07_31_top_5k/cookie_setting/no_banner,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting
3,cookie_setting_no_banner,fox8.com,07_31_top_5k/cookie_setting/no_banner,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting
4,no_setting_banner_no_choice,adswizz.com,07_28_5k_10k/no_setting/banner_no_choice,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting
5,cookie_setting_banner_no_choice,bostonherald.com,07_31_top_5k/cookie_setting/banner_no_choice,cookie_setting_no_banner,us/cookie_setting/no_banner
6,cookie_setting_banner_no_choice,lolesports.com,07_31_top_5k/cookie_setting/banner_no_choice,cookie_setting_no_banner,us/cookie_setting/no_banner
7,cookie_setting_banner_with_setting,bodybuilding.com,07_31_top_5k/cookie_setting/banner_with_setting,cookie_setting_no_banner,us/cookie_setting/no_banner
8,cookie_setting_banner_with_setting,commonsensemedia.org,07_31_top_5k/cookie_setting/banner_with_setting,cookie_setting_no_banner,us/cookie_setting/no_banner
9,no_setting_banner_no_choice,spreadshirt.com,07_31_top_5k/no_setting/banner_no_choice,no_setting_banner_binary_choice,us/no_setting/banner_binary_choice


In [8]:
# Show the merged row(s) for one specific site.
iaa_labels.loc[iaa_labels['site'] == 'adswizz.com']

Unnamed: 0,label_duc,site,path_duc,label_brian,path_brian
84,no_setting_banner_no_choice,adswizz.com,07_28_5k_10k/no_setting/banner_no_choice,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting


In [9]:
def get_common_labels(row, duc_preferred_sites=('adswizz.com',)):
    """Pick the label to use for one merged row of the two annotation sets.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            ``label_duc``, ``label_brian`` and ``site``.
        duc_preferred_sites: sites for which duc's label always wins.

    Returns:
        duc's label when it starts with ``cookie_setting``, when the site is in
        ``duc_preferred_sites``, or when brian's label is missing; otherwise
        brian's label. When duc's label itself is missing, brian's label is
        returned.
    """
    duc_label = row['label_duc']
    # A left merge with brian's rows as the base leaves label_duc as NaN for
    # sites duc did not annotate; NaN has no .startswith, so handle it first.
    if pd.isna(duc_label):
        return row['label_brian']
    if (
        duc_label.startswith('cookie_setting')
        or row['site'] in duc_preferred_sites
        or pd.isna(row['label_brian'])
    ):
        return duc_label
    return row['label_brian']
# Resolve one label per site, then split it into its two components:
# the first two '_'-separated tokens form `label`, the remainder `banner_type`.
iaa_labels['label_common'] = iaa_labels.apply(get_common_labels, axis=1)
label_parts = iaa_labels['label_common'].map(lambda common: common.split('_', 2))
iaa_labels['label'] = label_parts.map(lambda parts: '_'.join(parts[:2]))
iaa_labels['banner_type'] = label_parts.map(lambda parts: parts[2])
iaa_labels.head()

Unnamed: 0,label_duc,site,path_duc,label_brian,path_brian,label_common,label,banner_type
0,cookie_setting_banner_with_setting,campaignlive.co.uk,07_28_5k_10k/cookie_setting/banner_with_setting,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting,cookie_setting_banner_with_setting,cookie_setting,banner_with_setting
1,cookie_setting_banner_with_setting,tomtom.com,07_28_5k_10k/cookie_setting/banner_with_setting,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting,cookie_setting_banner_with_setting,cookie_setting,banner_with_setting
2,cookie_setting_banner_with_setting,hyundai.com,07_28_5k_10k/cookie_setting/banner_with_setting,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting,cookie_setting_banner_with_setting,cookie_setting,banner_with_setting
3,cookie_setting_banner_with_setting,uniregistry.com,07_28_5k_10k/cookie_setting/banner_with_setting,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting,cookie_setting_banner_with_setting,cookie_setting,banner_with_setting
4,cookie_setting_banner_with_setting,highcharts.com,07_28_5k_10k/cookie_setting/banner_with_setting,cookie_setting_banner_with_setting,us/cookie_setting/banner_with_setting,cookie_setting_banner_with_setting,cookie_setting,banner_with_setting


In [10]:
from consent.data.cookie_setting import CookieSetting

# Sites labelled 'cookie_setting' in the annotations must be exactly the sites
# returned by CookieSetting.get_cookie_settings.
csites = set(iaa_labels.loc[iaa_labels['label'] == 'cookie_setting', 'site'])
asites = set(CookieSetting.get_cookie_settings(nocache=True, pref_btn_only=False).site)

# Report both directions of the mismatch before the assertion fires.
if not (csites == asites):
    print("Missing in cookie_settings.yml", csites - asites)
    print("Missing in annotations", asites - csites)
assert csites == asites, "Annotations is different from dataset."

In [11]:
# Display names used in the summary table.
banner_type_map = {'banner_no_choice': 'Notification only', 'banner_with_setting': 'With a setting button', 'banner_binary_choice': 'Binary choice', 'no_banner': 'No Banner', 'total': "Total"}
label_map = {'cookie_setting': 'Has cookie settings', 'no_setting': "No cookie settings"}

# Count the rows for each (label, banner_type) pair.
df = iaa_labels.groupby(['label', 'banner_type']).size().reset_index().rename(columns={0: 'num_banners'})
# df = iaa_labels.copy()
df['banner_type'] = df['banner_type'].map(banner_type_map)
df['label'] = df['label'].map(label_map)
df = df[['label', 'banner_type', 'num_banners']]
df = df.rename(columns={'label':'Website Type', 'banner_type': 'Banner Type', 'num_banners': '# Websites'})
# df = df.set_index(['Website Type', 'Banner Type'])

# Printed before the per-type "Total" rows exist, so nothing is double-counted.
print('total sites:', df['# Websites'].sum())

# Build one "Total" row per website type and add them in a single concat.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
total_rows = []
for website_type in label_map.values():
    total = df.loc[df['Website Type'] == website_type, '# Websites'].sum()
    print(website_type, total)
    total_rows.append({'Website Type': website_type, 'Banner Type': 'Total', '# Websites': total})
df = pd.concat([df, pd.DataFrame(total_rows)], ignore_index=True)
df


total sites: 1000
Has cookie settings 192
No cookie settings 808


Unnamed: 0,Website Type,Banner Type,# Websites
0,Has cookie settings,Notification only,3
1,Has cookie settings,With a setting button,121
2,Has cookie settings,No Banner,68
3,No cookie settings,Binary choice,15
4,No cookie settings,Notification only,135
5,No cookie settings,No Banner,658
6,Has cookie settings,Total,192
7,No cookie settings,Total,808


In [12]:
# Index the summary by (Website Type, Banner Type), sorted by index.
gdf = df.groupby(['Website Type', 'Banner Type']).sum().sort_index()

# Grand total = sum of the two per-type "Total" rows.
grand_total = gdf.loc[('Has cookie settings', 'Total')] + gdf.loc[('No cookie settings', 'Total')]

# Append the grand-total row under the key ('Total', ''). DataFrame.append was
# removed in pandas 2.0, so the row is built as a one-row frame that carries
# the same index level names and is concatenated instead.
grand_total_row = pd.DataFrame(
    [grand_total.to_numpy()],
    columns=grand_total.index,
    index=pd.MultiIndex.from_tuples([('Total', '')], names=gdf.index.names),
)
gdf = pd.concat([gdf, grand_total_row])

# Keep only rows with at least one website.
gdf = gdf[gdf['# Websites'] > 0]
gdf

Unnamed: 0_level_0,Unnamed: 1_level_0,# Websites
Website Type,Banner Type,Unnamed: 2_level_1
Has cookie settings,No Banner,68
Has cookie settings,Notification only,3
Has cookie settings,Total,192
Has cookie settings,With a setting button,121
No cookie settings,Binary choice,15
No cookie settings,No Banner,658
No cookie settings,Notification only,135
No cookie settings,Total,808
Total,,1000


In [13]:
# Render the summary table as LaTeX, merging repeated first-level index
# entries into \multirow cells, and print the source text.
latex_table = gdf.to_latex(multirow=True)
print(latex_table)

\begin{tabular}{llr}
\toprule
      &       &  \# Websites \\
Website Type & Banner Type &             \\
\midrule
\multirow{4}{*}{Has cookie settings} & No Banner &          68 \\
      & Notification only &           3 \\
      & Total &         192 \\
      & With a setting button &         121 \\
\cline{1-3}
\multirow{4}{*}{No cookie settings} & Binary choice &          15 \\
      & No Banner &         658 \\
      & Notification only &         135 \\
      & \multirow{2}{*}{Total} &         808 \\
\cline{1-3}
Total &       &        1000 \\
\bottomrule
\end{tabular}

