In [2]:
import json
import os
from dolma.taggers.url import BaseDomainTagger
from typing import NamedTuple

In [3]:
class Matched(NamedTuple):
    """A domain from a URL report that was also found among the Wikidata URLs."""

    # Name of the report the domain came from (derived from the report file name).
    source: str
    # Domain string exactly as it appears in the report's "domains" entries.
    url: str
    # Count paired with the domain in the report.
    # NOTE: on instances this field shadows the inherited tuple.count() method.
    count: int
    # Wikidata metadata for the domain: a dict with "label" and "description" keys.
    wiki: dict

In [4]:
# Input locations. Each one can be overridden through an environment variable so
# the notebook can run on a machine other than the author's; when the variable is
# unset or empty, the original local path is used.
url_reports = (
    os.environ.get("URL_REPORTS_DIR")
    or "/Users/lucas/Documents/v1_6url-reports/v1_6-reports"
)
wikidata = (
    os.environ.get("WIKIDATA_DIR")
    or "/Users/lucas/Documents/wikidata/wikidata-20220208"
)

In [5]:
# Load every JSON report found in `url_reports`, keyed by file name
# (the ".json" extension is kept in the key).
reports = {}
# os.listdir() returns entries in arbitrary order; sorting keeps the dict's
# insertion order -- and every result derived from it -- stable across runs.
for report in sorted(os.listdir(url_reports)):
    if not report.endswith(".json"):
        continue
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(os.path.join(url_reports, report), encoding="utf-8") as f:
        reports[report] = json.load(f)

In [20]:
# Map every cleaned URL of a Wikidata entry to that entry's label and description.
wikidata_urls = {}
# Number of entries that could not be processed; inspect it after running the cell
# to see how much data the best-effort parsing below dropped.
wikidata_skipped = 0

# Sorted so that, when two concepts yield the same URL, the one that wins
# (the last one written) does not depend on the arbitrary os.listdir() order.
for concept in sorted(os.listdir(wikidata)):
    concept_dir = os.path.join(wikidata, concept)
    if not os.path.isdir(concept_dir):
        continue
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(os.path.join(concept_dir, 'response.json'), encoding="utf-8") as f:
        for data in json.load(f):
            try:
                for url in BaseDomainTagger.clean_url(data['url']):
                    wikidata_urls[url] = {
                        "label": data["itemLabel"], "description": data.get("description", "")
                    }
            except Exception:
                # Still best-effort: a malformed entry is skipped rather than
                # aborting the load. Catching Exception (not a bare except) lets
                # KeyboardInterrupt / SystemExit propagate, and the counter keeps
                # the skips from being completely silent.
                wikidata_skipped += 1

In [7]:
# Collect every report domain that is present in `wikidata_urls`, either under its
# exact string or under its shortened form.
matched = []
for name, report in reports.items():
    # Remove only a trailing ".json". str.strip('.json') would instead strip any of
    # the characters ".", "j", "s", "o", "n" from BOTH ends of the name, mangling
    # report names that start or end with one of those letters.
    source = name[:-len('.json')] if name.endswith('.json') else name
    for url, count in report['domains']:
        # First two dot-separated labels of the domain.
        # NOTE(review): for "www.example.com" this yields "www.example"; if the
        # registrable domain was intended this should probably be [-2:] -- confirm.
        base_url = '.'.join(url.strip('/').split('.')[:2])

        # Prefer the exact match, fall back to the shortened form. Indexing
        # wikidata_urls[url] unconditionally raised KeyError whenever only
        # base_url was present in the mapping.
        wiki = wikidata_urls.get(url)
        if wiki is None:
            wiki = wikidata_urls.get(base_url)
        if wiki is not None:
            matched.append(Matched(source, url, count, wiki))


In [8]:
# Print each matched domain once, highest count first, then the number of
# distinct domains printed. `seen` ends up holding every printed domain.
seen = set()
ranked_matches = sorted(matched, key=lambda entry: entry.count, reverse=True)
for hit in ranked_matches:
    if hit.url not in seen:
        # The first occurrence in descending order carries the largest count.
        seen.add(hit.url)
        print(f"{hit.count:5}\t{hit.source}\t{hit.url}\t{hit.wiki['description']}")

print(len(seen))

28891	isd_nsfw_abp_v1	www.bustle.com	magazine targeted to women
26912	isd_nsfw_abp_v1	www.fanfiction.net	US fanfiction website
23418	brave_nsfw_abp_v1	www.match.com	dating website
21587	blocklist_firebog_nsfw_v1	www.urbandictionary.com	crowdsourced online dictionary of slang terms
19722	isd_nsfw_abp_v1	www.refinery29.com	American digital media and entertainment company
17016	isd_nsfw_abp_v1	uproxx.com	entertainment website
15691	domain_blocklist_phishing_v1	www.amazon.com	American multinational technology company
14364	brave_nsfw_abp_v1	www.cosmopolitan.com	American fashion and culture magazine
13645	isd_nsfw_abp_v1	www.elitedaily.com	American news website
12759	isd_nsfw_abp_v1	booklikes.com	
12550	blocklist_project_vice_v1	www.drugs.com	online pharmaceutical encyclopedia
12442	isd_nsfw_abp_v1	nymag.com	online newspaper
10609	blocklist_project_ads_v1	www.webmd.com	website about medicine and health
 9749	blocklist_project_crime_v1	www.angelfire.com	website hosting service
 9511	brave_ns

In [16]:
# For every domain that was not printed as a match, keep the largest count it
# has in any report, then list the domains whose count is at least 1000.
agg = {}
for report in reports.values():
    for domain, hits in report["domains"]:
        if domain in seen:
            continue
        agg[domain] = max(hits, agg.get(domain, 0))

by_count_desc = sorted(agg.items(), key=lambda pair: pair[1], reverse=True)
for domain, hits in by_count_desc:
    if hits < 1000:
        continue
    print(f"{hits:5}\thttps://{domain}")

46855	https://s3-us-west-1.amazonaws.com
26994	https://community.adobe.com
26212	https://s3.amazonaws.com
23652	https://m.fanfiction.net
18194	https://www.appbrain.com
18091	https://www.instantcheckmate.com
17399	https://travel.travelocity.com
16395	https://www.fimfiction.net
16171	https://thoughtcatalog.com
15858	https://article.wn.com
14872	https://www.easycounter.com
14690	https://intl2.match.com
14049	https://hypestat.com
13442	https://www.appannie.com
13320	https://www4.match.com
13301	https://www.mamma.com
13204	https://en.academic.ru
12252	https://www.etonline.com
12250	https://asylums.insanejournal.com
12207	https://www.newsbreak.com
10052	https://www.thehollywoodgossip.com
 9832	https://www.phonearena.com
 9612	https://boards.greenhouse.io
 9466	https://www.123helpme.com
 9443	https://www.simplyhired.com
 9393	https://modthesims.info
 8926	https://gateway.ipfs.io
 8711	https://www.winsite.com
 8443	https://www.adlandpro.com
 8005	https://www.adsoftheworld.com
 7981	https://goy