## Third party ADP library prevalence analysis
- Find the number of sites that embed a given script/endpoint

In [1]:
# For third-party scripts: we use script domains (PS+1) to measure the prevalence.
# For WordPress and Magento plugins served from first-party sites, we match the URL path.

# Maps a provider name to the pattern used to detect it in crawl requests.
#   - A plain value is compared for equality against the domain of the request
#     URL (see get_embedding_sites_by_domain).
#   - A value starting with "#" is a plugin served from the first-party site:
#     the text after the "#" is matched as a substring of the request URL.
ADP_PROVIDER_DOMAINS = {
    "Fomo": "fomo.com",
    "Beeketing": "beeketing.com",
    "Recently": "appifiny.io",
    "Fera": "fera.ai",
    "Vitals": "getvitals.io",
    "Nice (Shopify plugin)": "goldendev.win",
    "LeanConvert": "lc-api.net",
    "Taggstar": "taggstar.com",
    "Insider": "useinsider.com",
    "FreshRelevance": "dn1i8v75r669j.cloudfront.net",
    "Qubit": "goqubit.com",
    "Bunting": "bunting.com",
    "ConvertCart": "convertcart.com",
    "Proof": "useproof.com",
    "Convertize": "convertize.io",
    "Credibly": "credibly.io",
    "DynamicYield": "dynamicyield.com",
    "Bizzy": "pxu-recent-sales-apps.s3.amazonaws.com",
    "Exponea": "exponea.com",
    # The comma after this entry was missing, which made the whole literal a
    # syntax error.
    "Yieldify": "yieldify.com",
    # plugins
    "Amasty (Magento plugin)": "#amwhatsup/block/getlastactivity",
    "Boost (Wordpress plugin)": "#plugins/boost/public/js/boost",
    "Woocommerce Notification (Woocommerce plugin)": "#plugins/woocommerce-notification",
}


In [2]:
import re
import json
import sqlite3
import pandas as pd
import json
from collections import defaultdict
from urlparse import urlparse
from os.path import expanduser
from extract_tp_to_site_mapping import get_tld_or_host
pd.set_option("display.max_colwidth",500)
pd.set_option("display.max_rows",500)


In [3]:
def dump_as_json(obj, json_path):
    """Write `obj` to `json_path` as JSON, replacing any existing file content."""
    with open(json_path, 'w') as out_handle:
        json.dump(obj, fp=out_handle)

def load_json_file(json_path):
    """Read the file at `json_path` and return its parsed JSON content."""
    with open(json_path) as in_handle:
        raw_text = in_handle.read()
        return json.loads(raw_text)


In [11]:
def get_embedding_sites_by_regex(endpoint_regexes, db_path):
    """Find the sites whose requests match each endpoint regex.

    Args:
        endpoint_regexes: dict mapping an endpoint name to a regular
            expression. Each regex is searched in the request URL with
            everything up to and including the last "://" removed.
        db_path: path to a crawl SQLite database that has `http_requests`
            (visit_id, url) and `site_visits` (visit_id, site_url) tables.

    Returns:
        defaultdict(set) mapping endpoint name -> set of hostnames of the
        visited sites (`site_url`) that issued a matching request. Endpoints
        with no match have no key.
    """
    # Compile once up front instead of re-resolving every pattern per row.
    compiled_regexes = [(endpoint_name, re.compile(endpoint_regex))
                        for endpoint_name, endpoint_regex in endpoint_regexes.items()]
    # Only the columns that are actually read are selected; the unused
    # method/post_body columns are no longer fetched.
    query = """SELECT sv.visit_id, sv.site_url, r.url FROM
                http_requests as r LEFT JOIN site_visits as sv
                ON sv.visit_id = r.visit_id
                """
    adp_sites = defaultdict(set)
    con = sqlite3.connect(db_path)
    try:
        con.row_factory = sqlite3.Row
        for row in con.execute(query):
            # The scheme is stripped once per row so anchored regexes start at
            # the host.
            url_without_scheme = row['url'].split("://")[-1]
            for endpoint_name, endpoint_regex in compiled_regexes:
                if endpoint_regex.search(url_without_scheme):
                    host = urlparse(row['site_url']).hostname
                    adp_sites[endpoint_name].add(host)
    finally:
        # Release the database handle even if matching fails part-way.
        con.close()

    return adp_sites


def get_embedding_sites(url_substrings, db_path):
    """Find the sites whose requests contain each given URL substring.

    Args:
        url_substrings: iterable of strings; each one is tested with `in`
            against the full request URL.
        db_path: path to a crawl SQLite database that has `http_requests`
            (visit_id, url) and `site_visits` (visit_id, site_url) tables.

    Returns:
        defaultdict(set) mapping each matched substring -> set of hostnames of
        the visited sites (`site_url`) that issued a matching request.
        Substrings with no match have no key.
    """
    # Materialize once so a one-shot iterable (e.g. a generator) is still
    # checked against every row, not just the first.
    substrings = tuple(url_substrings)
    # Only the columns that are actually read are selected; the unused
    # method/post_body columns are no longer fetched.
    query = """SELECT sv.visit_id, sv.site_url, r.url FROM
                http_requests as r LEFT JOIN site_visits as sv
                ON sv.visit_id = r.visit_id"""
    adp_sites = defaultdict(set)
    con = sqlite3.connect(db_path)
    try:
        con.row_factory = sqlite3.Row
        for row in con.execute(query):
            request_url = row['url']
            for js_url in substrings:
                if js_url in request_url:
                    host = urlparse(row['site_url']).hostname
                    adp_sites[js_url].add(host)
    finally:
        # Release the database handle even if matching fails part-way.
        con.close()

    return adp_sites

def get_embedding_sites_by_domain(endpoint_patterns, db_path):
    """Find the sites that embed each endpoint, matching by domain or URL substring.

    Args:
        endpoint_patterns: dict mapping an endpoint name to a pattern.
            A pattern starting with "#" is matched, with the "#" characters
            stripped, as a substring of the request URL. Any other pattern is
            compared for equality with `get_tld_or_host(request_url)`
            (presumably the PS+1 domain of the request; confirm against
            extract_tp_to_site_mapping).
        db_path: path to a crawl SQLite database that has `http_requests`
            (visit_id, url) and `site_visits` (visit_id, site_url) tables.

    Returns:
        defaultdict(set) mapping endpoint name -> set of hostnames of the
        visited sites (`site_url`) that issued a matching request. Endpoints
        with no match have no key.
    """
    # Classify the patterns once instead of re-checking the "#" prefix for
    # every row. The dict's iteration order is kept.
    matchers = []
    for endpoint_name, endpoint_pattern in endpoint_patterns.items():
        is_url_substring = endpoint_pattern.startswith("#")
        if is_url_substring:
            endpoint_pattern = endpoint_pattern.strip("#")
        matchers.append((endpoint_name, is_url_substring, endpoint_pattern))

    query = """SELECT sv.visit_id, sv.site_url, r.url FROM
                http_requests as r LEFT JOIN site_visits as sv
                ON sv.visit_id = r.visit_id
                """
    adp_sites = defaultdict(set)
    con = sqlite3.connect(db_path)
    try:
        con.row_factory = sqlite3.Row
        for row in con.execute(query):
            request_url = row['url']
            # The request domain does not depend on the pattern, so it is
            # computed at most once per row, and only if a domain pattern
            # actually needs it.
            req_tld = None
            req_tld_known = False
            for endpoint_name, is_url_substring, endpoint_pattern in matchers:
                if is_url_substring:
                    matched = endpoint_pattern in request_url
                else:
                    if not req_tld_known:
                        req_tld = get_tld_or_host(request_url)
                        req_tld_known = True
                    matched = req_tld == endpoint_pattern
                if matched:
                    host = urlparse(row['site_url']).hostname
                    adp_sites[endpoint_name].add(host)
    finally:
        # Release the database handle even if matching fails part-way.
        con.close()

    return adp_sites

def get_prevalence_counts(endpoint_patterns, db_path, db_path_2=None):
    """Count, per endpoint, the distinct sites that embed it.

    Sites are collected with get_embedding_sites_by_domain from `db_path`;
    when `db_path_2` is given, the sites found there are merged in (set union)
    before counting.

    Returns:
        A tuple `(adp_prevalence, adp_sites_db)`: a dict mapping endpoint
        name -> number of sites, and the mapping endpoint name -> set of site
        hostnames that the counts were taken from.
    """
    adp_sites_db = get_embedding_sites_by_domain(endpoint_patterns, db_path)
    if db_path_2:
        second_crawl_sites = get_embedding_sites_by_domain(endpoint_patterns, db_path_2)
        for endpoint_name in second_crawl_sites:
            adp_sites_db[endpoint_name].update(second_crawl_sites[endpoint_name])
    adp_prevalence = dict(
        (endpoint_name, len(adp_sites_db[endpoint_name]))
        for endpoint_name in adp_sites_db
    )
    return adp_prevalence, adp_sites_db

## Compute third party prevalence using data from the checkout crawls

In [10]:
# SQLite databases produced by the two checkout crawls (Odin and Webtap).
ODIN_DB_PATH, WEBTAP_DB_PATH = (
    "/mnt/10tb4/dark-patterns-databases/odin/odin.sqlite",
    "/mnt/10tb4/dark-patterns-databases/webtap/webtap.sqlite",
)

In [12]:
# Merge the embedding sites found in both checkout crawls, then persist the
# per-provider site counts.
adp_prevalence, adp_sites_dict = get_prevalence_counts(
    ADP_PROVIDER_DOMAINS,
    ODIN_DB_PATH,
    db_path_2=WEBTAP_DB_PATH,
)
dump_as_json(
    adp_prevalence,
    "adp-third-party-lib-prevalence-odin-webtap-regex.json",
)

In [13]:
# List the providers from most to least prevalent.
for endpoint_name, site_count in sorted(
        adp_prevalence.items(), key=lambda name_and_count: name_and_count[1], reverse=True):
    print("%s %s" % (endpoint_name, site_count))

Beeketing 406
DynamicYield 114
Yieldify 111
Fomo 91
FreshRelevance 86
Insider 52
Bizzy 33
ConvertCart 31
Taggstar 27
Qubit 25
Exponea 18
Recently 14
Proof 11
Fera 11
Nice (Shopify plugin) 10
Woocommerce Notification (Woocommerce plugin) 10
Bunting 5
Credibly 4
Convertize 3
LeanConvert 2
Amasty (Magento plugin) 1
Boost (Wordpress plugin) 1


## Compute prevalence using data from Princeton Web Census 1-million Site Crawl
- We used the November 2018 crawl

In [6]:
# The path is absolute (no "~"), so expanduser returns it unchanged.
ONE_MILLION_DB = expanduser(
    "/mnt/10tb2/census-release-normalized/stateless/2018-11_1m_stateless/2018-11_1m_stateless_census_crawl.sqlite"
)

# Same analysis as for the checkout crawls, on a single database. This
# rebinds adp_prevalence / adp_sites_dict to the one-million-site results.
adp_prevalence, adp_sites_dict = get_prevalence_counts(
    ADP_PROVIDER_DOMAINS,
    ONE_MILLION_DB,
)
dump_as_json(
    adp_prevalence,
    "adp-third-party-lib-prevalence-one-million-sites.json",
)
# List the providers from most to least prevalent.
for endpoint_name, site_count in sorted(
        adp_prevalence.items(), key=lambda name_and_count: name_and_count[1], reverse=True):
    print("%s %s" % (endpoint_name, site_count))


Beeketing 4151
Fomo 663
Proof 508
Insider 484
DynamicYield 416
Yieldify 323
Bizzy 213
FreshRelevance 208
Exponea 180
Fera 132
Nice (Shopify plugin) 80
Qubit 73
Credibly 67
Recently 66
ConvertCart 62
Woocommerce Notification (Woocommerce plugin) 61
Convertize 58
Bunting 17
Taggstar 4
Boost (Wordpress plugin) 3
Vitals 1
