# Find 3rd parties common to sites with ADPs
High level idea: compare the prevalence of TPs on sites with ADPs to sites without ADPs
- comparison will be performed per ADP type; e.g. compare the prevalence of TPs on social proof sites to non-social proof sites (may also include timer sites and stock sites)
- sites with ADPs are read from the long term monitoring URLs (TODO: remove timer false positives)

for each ADP:
- build a frequency dict of TPs from sites with this specific ADP
  - exclude images, CSS, fonts etc. (see filtering by content_policy_type below)
  - dict will look like {e.fomo.com -> 5} (5 is the number of distinct sites this TP is embedded on)
- build a frequency dict of TPs from sites that don't include this specific ADP
  for each TP:
  - compare the prevalence across ADP sites vs non-ADP sites (for this specific ADP)
  - list TPs that only appear on ADP sites and don't appear on the non-ADP sites
  - list TPs that are 2x more common on ADP sites compared to non-ADP sites (2 is an arbitrary threshold that we can tweak)

In [3]:
import sqlite3
import pandas as pd
import re
import json
import os
from glob import glob
from collections import defaultdict
from os.path import basename, abspath, join, expanduser
from urlparse import urlparse
from extract_tp_to_site_mapping import is_third_party

pd.options.display.max_colwidth = 450
pd.options.display.html.use_mathjax = False

In [4]:
def get_adp_urls():
    """Read the monitored page URLs for each ADP type.

    Every path matched by ``../../data/monitoring/urls/*`` is checked by its
    base name. Only the three known files are opened; each of their lines,
    stripped of trailing whitespace, is kept as one page URL. Any other entry
    in that directory (including subdirectories) is skipped without being
    opened.

    Returns:
        A ``(timer_urls, soc_proof_urls, stock_urls)`` tuple of lists.
    """
    timer_urls = []
    soc_proof_urls = []
    stock_urls = []
    urls_by_file_name = {
        "stock_notifications.txt": stock_urls,
        "social_proofs.txt": soc_proof_urls,
        # TODO: timer URLs likely include false positives, we may want to use the
        # list of sites with manipulative and deceptive timers.
        # NOTE(review): timers.csv is read line by line like the .txt files, so a
        # whole CSV row becomes one "URL" - confirm the file has a single column.
        "timers.csv": timer_urls,
    }
    # this dir should be in the repo
    for url_file in glob("../../data/monitoring/urls/*"):
        target_urls = urls_by_file_name.get(basename(url_file))
        if target_urls is None:
            continue
        # The context manager closes the handle even if reading fails.
        with open(url_file) as f:
            target_urls.extend(line.rstrip() for line in f)
    return timer_urls, soc_proof_urls, stock_urls



In [None]:
"""
mapping from Steve Englehardt (TODO: add to the acknowledgements)

content_policy_type_map = {
  '1': 'other',
  '2': 'script',
  '3': 'image',
  '4': 'stylesheet',
  '5': 'object',
  '6': 'document',
  '7': 'subdocument',
  '8': 'refresh',
  '9': 'xbl',
  '10': 'ping',
  '11': 'xhr',
  '12': 'object_subrequest',
  '13': 'dtd',
  '14': 'font',
  '15': 'media',
  '16': 'websocket',
  '17': 'csp_report',
  '18': 'xslt',
  '19': 'beacon',
  '20': 'fetch',
  '21': 'imageset',
  '22': 'web_manifest'
}
"""
def read_tp_requests(db_path):
    """Return the third-party requests recorded in a crawl database.

    Reads the rows of ``http_requests`` whose ``content_policy_type`` is one
    of 1, 2, 6, 7, 11, 16 or 20, left-joined with ``site_visits`` on
    ``visit_id`` to attach the visited ``site_url``. Each row is then passed
    to ``is_third_party(url, site_url)``; the truthiness of the first element
    of its result is stored in an ``is_third_party`` column.

    Args:
        db_path: path of the SQLite crawl database to read.

    Returns:
        A DataFrame with the columns ``url``, ``content_policy_type``,
        ``site_url``, ``visit_id`` and ``is_third_party``, restricted to the
        rows flagged as third party.
    """
    con = sqlite3.connect(db_path)
    try:
        df = pd.read_sql("""
    SELECT re.url, re.content_policy_type, sv.site_url, sv.visit_id
      FROM http_requests AS re
        LEFT JOIN site_visits AS sv ON re.visit_id = sv.visit_id
        WHERE re.content_policy_type in (1, 2, 6, 7, 11, 16, 20)
           """, con)
    finally:
        # sqlite3 connections are not closed by garbage-free code paths or by
        # their own context manager, so release the handle explicitly.
        con.close()
    # Build the mask as an explicit boolean Series: unlike a row-wise
    # DataFrame.apply, this also works when the query returns no rows.
    # NOTE(review): the LEFT JOIN can yield a NULL site_url for requests
    # without a matching visit - confirm is_third_party copes with None.
    third_party_mask = pd.Series(
        [bool(is_third_party(url, site_url)[0])
         for url, site_url in zip(df["url"], df["site_url"])],
        index=df.index, dtype=bool)
    df['is_third_party'] = third_party_mask
    return df[third_party_mask]


In [42]:
# When TESTING is set, two extra product pages are added to the social proof
# URL list and the crawl databases are looked up under the home directory;
# otherwise the databases are read from the fixed /media/gacar/Data/dp paths.
TESTING = False

timer_urls, soc_proof_urls, stock_urls = get_adp_urls()

if not TESTING:
    ODIN_DB_PATH = "/media/gacar/Data/dp/20190206-203758_segmentation_pilot/20190206-203758_segmentation_pilot.sqlite"
    WEBTAP_DB_PATH = "/media/gacar/Data/dp/20190206-205000_segmentation_pilot/20190206-205000_segmentation_pilot.sqlite"
else:
    soc_proof_urls.extend([
        "https://www.holabirdsports.com/collections/brand-new-babolat-sfx3/products/babolat-sfx3-all-court-womens-white-silver",
        "https://www.6ku.com/products/6ku-8-speed-city-bike-women",
    ])

    ODIN_DB_PATH = expanduser("~/20190202-151238_countdown_detection_crawl/20190202-151238_countdown_detection_crawl.sqlite")
    WEBTAP_DB_PATH = expanduser("~/20190204-011001_segmentation_pilot/20190204-011001_segmentation_pilot.sqlite")



In [43]:
# Combined list of every ADP page URL (not referenced again in this cell).
all_adp_urls = timer_urls + soc_proof_urls + stock_urls

# Load the third-party requests of both crawls; the formatted prints emit
# the same text under Python 2 and Python 3.
odin_df = read_tp_requests(ODIN_DB_PATH)
print("Finished reading reqs from ODIN %d" % len(odin_df))
webtap_df = read_tp_requests(WEBTAP_DB_PATH)
print("Finished reading reqs from WEBTAP %d" % len(webtap_df))

# NOTE(review): the concatenated frame keeps each crawl's own row labels, so
# index values can repeat between the two halves.
df = pd.concat([odin_df, webtap_df])
# we remove query, hash and the scheme from the URL
df['base_url'] = df['url'].map(lambda x: x.split("?")[0].split("#")[0].split("://")[-1])
df['site_hostname'] = df['site_url'].map(lambda x: urlparse(x).hostname)
# Flag the rows whose visited page is one of the monitored ADP URLs. isin()
# does a hashed membership test instead of scanning the URL list per row.
df['stock'] = df['site_url'].isin(stock_urls)
df['social_proof'] = df['site_url'].isin(soc_proof_urls)
df['timer'] = df['site_url'].isin(timer_urls)
df.to_pickle("tp_reqs.pickle")
df.head(3)

Unnamed: 0,url,content_policy_type,site_url,visit_id,is_third_party,base_url,site_hostname,stock,social_proof,timer
20,https://cdn.cquotient.com/js/v2/gretel.min.js,2,https://www.mackenzie-childs.com/patience-brewster-ruby-red-wine-girl-tea-towel/08-30943.html,4,True,cdn.cquotient.com/js/v2/gretel.min.js,www.mackenzie-childs.com,False,False,False
26,https://cdn.shopify.com/s/files/1/1042/2862/t/94/assets/app.js?9898202562766866050,2,https://slickcaseofficial.com/collections/custom-macbook-case/products/custom-macbook-case-grassplat-greenery,1,True,cdn.shopify.com/s/files/1/1042/2862/t/94/assets/app.js,slickcaseofficial.com,False,False,False
27,https://cdn.shopify.com/s/assets/storefront/express_buttons-c5e5c7645d98c4cf4c1a29cdabc90ab45a62044dd11e47d8c671a71188cd89dc.js,2,https://slickcaseofficial.com/collections/custom-macbook-case/products/custom-macbook-case-grassplat-greenery,1,True,cdn.shopify.com/s/assets/storefront/express_buttons-c5e5c7645d98c4cf4c1a29cdabc90ab45a62044dd11e47d8c671a71188cd89dc.js,slickcaseofficial.com,False,False,False


In [44]:
# Print how many request rows carry each ADP flag (True) and how many do not.
for adp_flag in ("stock", "social_proof", "timer"):
    print(df[adp_flag].value_counts())

False    131091
True       5220
Name: stock, dtype: int64
False    136183
True        128
Name: social_proof, dtype: int64
False    130995
True       5316
Name: timer, dtype: int64


In [45]:
# For every third-party base URL requested from social proof pages, count the
# distinct site hostnames it was requested from.
soc_proof_tps = (
    df[df.social_proof]
    .groupby('base_url')
    .agg({'site_hostname': lambda hostnames: len(set(hostnames))})
    .reset_index()
    .rename(columns={"site_hostname": "distinct_sites"})
)
# Keep only the base URLs seen on at least two distinct sites, most common first.
soc_proof_tps = soc_proof_tps[soc_proof_tps.distinct_sites >= 2].sort_values(
    "distinct_sites", ascending=False)
# base_url -> number of distinct social proof sites
soc_proof_tps_prevalence = soc_proof_tps.set_index('base_url')['distinct_sites'].to_dict()



In [46]:
# Same distinct-hostname count per third-party base URL, computed over the
# requests that were NOT made from social proof pages (no minimum applied).
non_soc_proof_tps = (
    df[~df.social_proof]
    .groupby('base_url')
    .agg({'site_hostname': lambda hostnames: len(set(hostnames))})
    .reset_index()
    .rename(columns={"site_hostname": "distinct_sites"})
)
# base_url -> number of distinct non-social-proof sites
non_soc_proof_tps_prevalence = non_soc_proof_tps.set_index('base_url')['distinct_sites'].to_dict()



In [48]:
# A third party is reported when it is at least this many times as prevalent
# on social proof sites as on the other sites (arbitrary, tweakable).
MIN_PREVALENCE_RATIO = 2

# NOTE(review): both prevalences are raw distinct-site counts, not fractions
# of each group's size - confirm that comparing raw counts is intended.
for base_url, prevalence in soc_proof_tps_prevalence.items():
    if prevalence < 2:
        continue
    other_prevalence = non_soc_proof_tps_prevalence.get(base_url)
    if other_prevalence is None:
        print("%s only in soc proof sites" % base_url)
        continue
    # Compare by multiplication: the previous floor division discarded the
    # fractional part of the ratio, so only third parties that were at least
    # 3x as common could ever be reported.
    if prevalence >= MIN_PREVALENCE_RATIO * other_prevalence:
        print("%s twice more likely to be present in soc proof sites" % base_url)

In [49]:
# Display the social proof third-party table; the recorded output of this
# cell shows only the column header, i.e. the table had no rows in that run.
soc_proof_tps

Unnamed: 0,base_url,distinct_sites
