In [1]:
import json
import numpy as np
import pandas as pd
from os import listdir, path
from collections import defaultdict, Counter

In [2]:
asset_root = 'whotracksme/data/assets'
regions = set()
months = set()
file_names = set()
asset_paths = defaultdict(list)

In [3]:
for d in listdir(asset_root):
    if path.isdir(path.join(asset_root, d)):
        months.add(d)
months

{'2017-05',
 '2017-06',
 '2017-07',
 '2017-08',
 '2017-09',
 '2017-10',
 '2017-11',
 '2017-12',
 '2018-01',
 '2018-02',
 '2018-03',
 '2018-04',
 '2018-05',
 '2018-06',
 '2018-07',
 '2018-08',
 '2018-09'}

In [4]:
for month in months:
    month_path = path.join(asset_root, month)
    for d in listdir(month_path):
        if path.isdir(path.join(month_path, d)):
            regions.add(d)
regions

{'de', 'eu', 'fr', 'global', 'us'}

In [5]:
for month in months:
    for region in regions:
        end_path = path.join(asset_root, month, region)
        if path.isdir(end_path):
            for file in listdir(end_path):
                file_names.add(file)
file_names

{'companies.csv',
 'domains.csv',
 'sites.csv',
 'sites_trackers.csv',
 'trackers.csv'}

In [6]:
asset_types = ['companies', 'domains', 'sites', 'sites_trackers', 'trackers']
for month in months:
    for region in regions:
        end_path = path.join(asset_root, month, region)
        if path.isdir(end_path):
            for file in listdir(end_path):
                for asset_type in asset_types:
                    if asset_type in file:
                        asset_paths[asset_type].append(path.join(end_path, file))
for asset_type in asset_types:
    print(asset_type, len(asset_paths[asset_type]))

companies 42
domains 42
sites 84
sites_trackers 42
trackers 84


In [7]:
for asset_type in asset_types:
    file_lengths = []
    for asset_path in asset_paths[asset_type]:
        num_lines = sum(1 for line in open(asset_path))
        file_lengths.append(num_lines)
    print(asset_path, f'{sum(file_lengths):,}')

whotracksme/data/assets/2018-03/eu/companies.csv 21,015
whotracksme/data/assets/2018-03/eu/domains.csv 37,606
whotracksme/data/assets/2018-03/eu/sites.csv 1,632,769
whotracksme/data/assets/2018-03/eu/sites_trackers.csv 1,573,799
whotracksme/data/assets/2018-03/eu/trackers.csv 1,599,747


But actually, for our purposes, we want domains: https://github.com/cliqz-oss/whotracks.me/issues/125

In [8]:
domains_df = pd.concat([
    pd.read_csv(file, parse_dates=['month'])
    for file in asset_paths['domains']
])
domains_df.head()

Unnamed: 0,bad_qs,beacon,companies,content_length,cookies,country,font,has_blocking,host_tld,hosts,...,requests,requests_failed,requests_tracking,script,site_reach,site_reach_rank,stylesheet,tracked,trackers,xhr
0,0.324662,0.079969,1.0,13726.92,0.002,global,0.0,0.0,google-analytics.com,1.0,...,4.231643,0.318025,0.952961,0.983787,0.749662,1,1.680424e-07,0.32581,1.0,0.002529
1,0.013944,0.0028,1.0,1365321.0,0.956131,global,7.631806e-08,0.0,google.com,1.0,...,7.78297,0.647582,7.490663,0.644835,0.553807,5,0.04151588,0.956558,1.0,0.119335
2,0.081903,5.5e-05,1.0,54979.46,0.894208,global,5.040725e-06,0.0,doubleclick.net,1.0,...,7.878777,0.407675,7.813173,0.529167,0.558613,4,7.268649e-05,0.895765,1.0,0.08695
3,0.000671,0.0,1.0,228892.8,0.002573,global,9.322723e-05,0.0,googleapis.com,1.0,...,5.070778,0.615104,0.046123,0.488916,0.706873,2,0.7194772,0.003242,1.25111,0.029339
4,0.000739,0.021324,1.0,109216.2,0.271632,global,0.4240526,0.0,gstatic.com,1.0,...,9.226768,0.505585,2.643849,0.546439,0.597873,3,0.06435847,0.272065,1.0,0.024668


In [9]:
bad_hosts = domains_df[domains_df.bad_qs > 0.1].host_tld
print(bad_hosts.nunique())

292


In [10]:
acceptable_hosts = domains_df[domains_df.bad_qs <= 0.1].host_tld
print(acceptable_hosts.nunique())

2094


In [11]:
thresholds = [0.0001, 0.0004, 0.0007, 0.001, 0.004, 0.007, 0.01, 0.04, 0.07, 0.1, 0.4, 0.7, 1]
bad_hosts = {}
acceptable_hosts = {}
for threshold in thresholds:
    bad = domains_df[domains_df.bad_qs > threshold].host_tld
    bad_hosts[threshold] = list(bad.unique())
    print(threshold, bad.nunique())
    acceptable = domains_df[domains_df.bad_qs <= threshold].host_tld
    acceptable_hosts[threshold] = list(acceptable.unique())

0.0001 1411
0.0004 1329
0.0007 1286
0.001 1248
0.004 1084
0.007 988
0.01 908
0.04 521
0.07 361
0.1 292
0.4 76
0.7 17
1 0


In [12]:
with open('../../mozilla/overscripted-clustering/tests/whotracksme/fingerprinting_trackers.json', 'w') as f:
    f.write(json.dumps(bad_hosts))
    
with open('../../mozilla/overscripted-clustering/tests/whotracksme/not_fingerprinting_trackers.json', 'w') as f:
    f.write(json.dumps(acceptable_hosts))