## Third party ADP library prevalence analysis
- Find the number of sites that embed a given script/endpoint

In [19]:
# Script domains or URL fragments to look for in the crawled request URLs.
# If a library's backend lives on a different domain, list it separately.
ADP_URLS = [
    "e.fomo.com",
    "beeketing.com",
    "taggstar.com",
    # A path may be included along with the domain.
    "d10lpsik1i8c69.cloudfront.net/TEST.JS",
]


In [20]:
import json
import sqlite3
import pandas as pd
import json
from collections import defaultdict
from urlparse import urlparse
from os.path import expanduser

# Widen pandas' display limits: both the maximum column width and the
# maximum number of rows shown are raised to 500.
for _display_option in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(_display_option, 500)


In [21]:
def dump_as_json(obj, json_path):
    """Write `obj` to `json_path` as JSON, replacing any existing file.

    Args:
        obj: any JSON-serializable object.
        json_path: destination file path; opened in text write mode.
    """
    with open(json_path, 'w') as out_file:
        json.dump(obj, out_file)

def load_json_file(json_path):
    """Read the file at `json_path` and return its parsed JSON content."""
    with open(json_path) as source:
        parsed = json.load(source)
    return parsed


## TODO: Run using the correct DB paths

In [30]:
# Machine-specific absolute paths to the two crawl SQLite databases.
# Adjacent string literals are concatenated, so the values are unchanged.
ODIN_DB_PATH = (
    "/media/gacar/Data/dp/20190206-203758_segmentation_pilot/"
    "20190206-203758_segmentation_pilot.sqlite"
)
WEBTAP_DB_PATH = (
    "/media/gacar/Data/dp/20190206-205000_segmentation_pilot/"
    "20190206-205000_segmentation_pilot.sqlite"
)

In [25]:
def get_embedding_sites(url_substrings, db_path):
    """Map each URL substring to the hostnames of the sites that requested it.

    Every row of `http_requests` is joined to its `site_visits` row, and a
    request counts as a match when a substring occurs anywhere in the request
    URL (plain, case-sensitive `in` test).

    Args:
        url_substrings: list of script domains / URL fragments to look for.
            It is iterated once per request row, so it must be re-iterable.
        db_path: path to the crawl SQLite database containing the
            `http_requests` and `site_visits` tables.

    Returns:
        A `defaultdict(set)` mapping each substring that matched at least one
        request to the set of hostnames (parsed from `site_url`) of the
        visited sites. Substrings without a match are not present as keys.
    """
    # Only the two columns that are actually used are fetched; in particular
    # this avoids pulling every request's post_body into Python.
    query = """SELECT sv.site_url, r.url
                FROM http_requests as r LEFT JOIN site_visits as sv
                ON sv.visit_id = r.visit_id"""
    adp_sites = defaultdict(set)
    con = sqlite3.connect(db_path)
    try:
        con.row_factory = sqlite3.Row
        for row in con.execute(query):
            request_url = row['url']
            site_url = row['site_url']
            # The LEFT JOIN yields a NULL site_url for requests without a
            # matching visit, and url itself may be NULL. Such rows cannot be
            # attributed to a site, so skip them instead of failing.
            if request_url is None or site_url is None:
                continue
            for js_url in url_substrings:
                if js_url in request_url:
                    adp_sites[js_url].add(urlparse(site_url).hostname)
    finally:
        # Release the database handle even if the query or parsing fails.
        con.close()

    return adp_sites

def get_prevalence_counts(url_substrings, db_path, db_path_2=None):
    """Count the distinct sites that requested each URL substring.

    Args:
        url_substrings: list of script domains / URL fragments to look for.
        db_path: path to the first crawl SQLite database.
        db_path_2: optional path to a second crawl database. When given, the
            site sets from both databases are unioned per substring, so a
            site present in both is counted once.

    Returns:
        A tuple `(adp_prevalence, adp_sites_db)`: a dict mapping each matched
        substring to its number of distinct site hostnames, and the mapping
        of substring -> set of hostnames those counts were derived from.
    """
    adp_sites_db = get_embedding_sites(url_substrings, db_path)
    if db_path_2:
        adp_sites_db_2 = get_embedding_sites(url_substrings, db_path_2)
        # .items() instead of the Python 2-only .iteritems() keeps this
        # runnable on both Python 2 and Python 3.
        for js_url, sites in adp_sites_db_2.items():
            # setdefault works whether or not the first mapping is a
            # defaultdict, so substrings seen only in the second DB are kept.
            adp_sites_db.setdefault(js_url, set()).update(sites)
    adp_prevalence = {url: len(sites) for url, sites in adp_sites_db.items()}
    return adp_prevalence, adp_sites_db

## Compute prevalence using two DBs (e.g. odin and webtap)

In [28]:
# Merge the results of both crawl databases and save the per-script counts.
adp_prevalence, adp_sites_dict = get_prevalence_counts(ADP_URLS, ODIN_DB_PATH, WEBTAP_DB_PATH)
dump_as_json(adp_prevalence, "adp-third-party-lib-prevalence-odin-webtap.json")
# .items() and a single formatted print() argument produce the same output on
# Python 2 and Python 3 (.iteritems() and the print statement are 2-only).
for js_url, site_count in adp_prevalence.items():
    print("%s %s" % (js_url, site_count))

beeketing.com 3
taggstar.com 2


## Compute prevalence using one DB (census 1-million crawl DB)

In [29]:
# Crawl database located under the current user's home directory.
ONE_MILLION_DB = expanduser("~/20190202-151238_countdown_detection_crawl/20190202-151238_countdown_detection_crawl.sqlite")

# Single-database run: no second DB path is passed, so nothing is merged.
adp_prevalence, adp_sites_dict = get_prevalence_counts(ADP_URLS, ONE_MILLION_DB)
dump_as_json(adp_prevalence, "adp-third-party-lib-prevalence-one-million-sites.json")
# .items() and a single formatted print() argument produce the same output on
# Python 2 and Python 3 (.iteritems() and the print statement are 2-only).
for js_url, site_count in adp_prevalence.items():
    print("%s %s" % (js_url, site_count))