In [1]:
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import re
import s3fs
import six
from pyarrow.filesystem import S3FSWrapper

In [2]:
# import some analysis utilities from https://github.com/mozilla/openwpm-utils
import sys
import utils.openwpm_utils.domain as du
import utils.openwpm_utils.analysis as au

In [3]:
# for data stored in S3
BUCKET = 'openwpm-crawls'
CRAWL_DIR = 'openwpm-crawl'
# The doubled '%%s' is left behind as a literal '%s' after this first
# interpolation, so dataset() can substitute the table name later.
BUCKET_URI = '%s/%s/visits/%%s' % (BUCKET, CRAWL_DIR)
# Filesystem handle that dataset() passes to pq.ParquetDataset.
fs = s3fs.S3FileSystem()

def dataset(table_name):
    """Open the Parquet dataset for ``table_name`` using the S3 settings.

    The table name is substituted into ``BUCKET_URI`` and the resulting path
    is opened through the module-level ``fs`` filesystem object.
    """
    table_uri = BUCKET_URI % table_name
    return pq.ParquetDataset(table_uri, filesystem=fs, metadata_nthreads=4)

In [4]:
# or for local data
# NOTE: this cell rebinds CRAWL_DIR, and the dataset() defined right after it
# replaces the earlier S3-backed definition, so whichever cell ran last wins.
CRAWL_DIR = 'openwpm-crawl'
# '%%s' leaves a literal '%s' slot that dataset() fills with the table name.
LOCAL_BUCKET_DATA_URI = '../deployment/local/local-crawl-results/data/%s/visits/%%s' % (CRAWL_DIR)

def dataset(table_name):
    """Open the Parquet dataset for ``table_name`` from the local results path.

    The table name is substituted into ``LOCAL_BUCKET_DATA_URI``; no explicit
    filesystem is passed.
    """
    local_table_path = LOCAL_BUCKET_DATA_URI % table_name
    return pq.ParquetDataset(local_table_path)

# HTTP Requests

In [5]:
# Load the data
# Reads the 'http_requests' table through dataset() and converts it to a
# pandas DataFrame; `reqs` is the frame every HTTP-request cell below uses.
table_name = 'http_requests'
reqs = dataset(table_name).read_pandas().to_pandas()

### Add some additional columns to help with analysis

In [6]:
# Derive the public suffix + 1 (PS+1) for each URL column of interest.
reqs['url_ps1'] = reqs['url'].apply(du.get_ps_plus_1)
# For these columns, None entries are passed through unchanged instead of
# being handed to get_ps_plus_1.
for url_col, ps1_col in (
    ('top_level_url', 'top_ps1'),
    ('loading_href', 'loading_ps1'),
):
    reqs[ps1_col] = reqs[url_col].apply(
        lambda url: None if url is None else du.get_ps_plus_1(url)
    )

In [7]:
# Extract the script URLs referenced by each request's call stack.
reqs['stack_scripts'] = reqs['req_call_stack'].apply(
    au.get_script_urls_from_call_stack_as_set
)
# Collapse each row's script URLs to the set of their PS+1 values.
reqs['stack_ps1s'] = reqs['stack_scripts'].apply(
    lambda script_urls: {du.get_ps_plus_1(script_url) for script_url in script_urls}
)

In [8]:
# Number of distinct top-level URLs in `reqs`; used as the denominator for the
# per-domain prevalence ratios computed in the following cells.
total_sites = reqs['top_level_url'].nunique()

### How many sites is doubleclick.net loaded on?

In [9]:
# Fraction of distinct top-level URLs that issued at least one request whose
# PS+1 is doubleclick.net.
dc_reqs = reqs.loc[reqs['url_ps1'] == 'doubleclick.net']
dc_reqs['top_level_url'].nunique() / float(total_sites)

0.0

In [10]:
# Fraction of distinct top-level URLs that issued at least one request whose
# PS+1 is google-analytics.com.
ga_reqs = reqs.loc[reqs['url_ps1'] == 'google-analytics.com']
ga_reqs['top_level_url'].nunique() / float(total_sites)

0.2

### What domains does doubleclick.net load other resources from?

In [11]:
# Keep only the requests for which a call stack was recorded.
has_call_stack = reqs['req_call_stack'].notnull()
reqs_with_call_stacks = reqs[has_call_stack]

In [12]:
# Display the raw call stacks of the requests that have one.
reqs_with_call_stacks['req_call_stack']

Series([], Name: req_call_stack, dtype: object)

In [13]:
# Display the per-request sets of PS+1 values derived from those call stacks.
reqs_with_call_stacks['stack_ps1s']

Series([], Name: stack_ps1s, dtype: object)

In [14]:
# Requests whose call stack includes a script served from doubleclick.net.
# The membership test goes through .apply(), which can yield a non-boolean
# (object) Series when the frame has no rows; pandas then treats the key as a
# list of column labels instead of a row mask and drops every column. Casting
# to bool guarantees the key is always a row mask.
involves_doubleclick = reqs_with_call_stacks.stack_ps1s.apply(
    lambda x: 'doubleclick.net' in x
).astype(bool)
dc_stack_reqs = reqs_with_call_stacks[involves_doubleclick]

# With a real boolean mask the filtered frame keeps its columns, so grouping
# is safe even when nothing matches (the result is just an empty Series).
dc_stack_reqs.groupby('url_ps1').top_level_url.count().sort_values(ascending=False)

# JavaScript Calls

In [15]:
# Load the data
# Reads the 'javascript' table through dataset() and converts it to a pandas
# DataFrame; `js` is the frame every JavaScript-call cell below uses.
table_name = 'javascript'
js = dataset(table_name).read_pandas().to_pandas()

In [16]:
# Derive the public suffix + 1 (PS+1) for each URL column of interest.
# None entries are passed through unchanged instead of being handed to
# get_ps_plus_1.
for url_col, ps1_col in (
    ('script_url', 'script_ps1'),
    ('top_level_url', 'top_ps1'),
    ('document_url', 'document_ps1'),
):
    js[ps1_col] = js[url_col].apply(
        lambda url: None if url is None else du.get_ps_plus_1(url)
    )

#### Canvas font fingerprinting

In [17]:
# Pattern for splitting a font shorthand string into its parts.
# NOTE(review): the description below is read off the pattern's keywords and
# the variable name; confirm against the CSS `font` shorthand grammar.
# Capture groups, in order:
#   1: italic | oblique
#   2: small-caps
#   3: bold | bolder | lighter | a multiple of 100 (100-900)
#   4: size (keyword, or number followed by a unit / '%')
#   5: optional value after a '/' (normal, or number followed by a unit / '%')
#   6: the trailing remainder of the string -- presumably the font family
font_shorthand = re.compile(r"^\s*(?=(?:(?:[-a-z]+\s*){0,2}(italic|oblique))?)(?=(?:(?:[-a-z]+\s*){0,2}(small-caps))?)(?=(?:(?:[-a-z]+\s*){0,2}(bold(?:er)?|lighter|[1-9]00))?)(?:(?:normal|\1|\2|\3)\s*){0,3}((?:xx?-)?(?:small|large)|medium|smaller|larger|[.\d]+(?:\%|in|[cem]m|ex|p[ctx]))(?:\s*\/\s*(normal|[.\d]+(?:\%|in|[cem]m|ex|p[ctx])))?\s*([-_\{\}\(\)\&!\',\*\.\"\sa-zA-Z0-9]+?)\s*$")

In [18]:
# Show every recorded call whose symbol starts with CanvasRenderingContext2D.
is_canvas_call = js['symbol'].str.startswith('CanvasRenderingContext2D')
js[is_canvas_call]

Unnamed: 0,incognito,crawl_id,visit_id,extension_session_uuid,event_ordinal,page_scoped_event_ordinal,window_id,tab_id,frame_id,script_url,...,call_stack,symbol,operation,value,arguments,time_stamp,instance_id,script_ps1,top_ps1,document_ps1
4,0,-388674837,3778318194027651,6e690347-1eef-428b-8901-bb8f61030139,9,4,1,1,0,https://citp.princeton.edu/,...,,CanvasRenderingContext2D.textBaseline,set,top,,2019-07-15T05:09:29.729Z,-2022130726,princeton.edu,princeton.edu,princeton.edu
5,0,-388674837,3778318194027651,6e690347-1eef-428b-8901-bb8f61030139,10,5,1,1,0,https://citp.princeton.edu/,...,,CanvasRenderingContext2D.font,set,600 32px Arial,,2019-07-15T05:09:29.730Z,-2022130726,princeton.edu,princeton.edu,princeton.edu
8,0,-388674837,3778318194027651,6e690347-1eef-428b-8901-bb8f61030139,13,8,1,1,0,https://citp.princeton.edu/,...,,CanvasRenderingContext2D.fillText,call,,"[""🇺🇳"",0,0]",2019-07-15T05:09:29.733Z,-2022130726,princeton.edu,princeton.edu,princeton.edu
12,0,-388674837,3778318194027651,6e690347-1eef-428b-8901-bb8f61030139,17,12,1,1,0,https://citp.princeton.edu/,...,,CanvasRenderingContext2D.fillText,call,,"[""🇺​🇳"",0,0]",2019-07-15T05:09:29.739Z,-2022130726,princeton.edu,princeton.edu,princeton.edu
16,0,-388674837,3778318194027651,6e690347-1eef-428b-8901-bb8f61030139,21,16,1,1,0,https://citp.princeton.edu/,...,,CanvasRenderingContext2D.fillText,call,,"[""🏴󠁧󠁢󠁥󠁮󠁧󠁿"",0,0]",2019-07-15T05:09:29.741Z,-2022130726,princeton.edu,princeton.edu,princeton.edu
20,0,-388674837,3778318194027651,6e690347-1eef-428b-8901-bb8f61030139,25,20,1,1,0,https://citp.princeton.edu/,...,,CanvasRenderingContext2D.fillText,call,,"[""🏴​󠁧​󠁢​󠁥​󠁮​󠁧​󠁿"",0,0]",2019-07-15T05:09:29.742Z,-2022130726,princeton.edu,princeton.edu,princeton.edu
22,0,-388674837,3778318194027651,6e690347-1eef-428b-8901-bb8f61030139,27,22,1,1,0,https://citp.princeton.edu/,...,,CanvasRenderingContext2D.textBaseline,set,top,,2019-07-15T05:09:29.745Z,-2022130726,princeton.edu,princeton.edu,princeton.edu
23,0,-388674837,3778318194027651,6e690347-1eef-428b-8901-bb8f61030139,28,23,1,1,0,https://citp.princeton.edu/,...,,CanvasRenderingContext2D.font,set,600 32px Arial,,2019-07-15T05:09:29.746Z,-2022130726,princeton.edu,princeton.edu,princeton.edu
26,0,-388674837,3778318194027651,6e690347-1eef-428b-8901-bb8f61030139,31,26,1,1,0,https://citp.princeton.edu/,...,,CanvasRenderingContext2D.fillText,call,,"[""👨🏾‍🤝‍👨🏼"",0,0]",2019-07-15T05:09:29.748Z,-2022130726,princeton.edu,princeton.edu,princeton.edu
30,0,-388674837,3778318194027651,6e690347-1eef-428b-8901-bb8f61030139,35,30,1,1,0,https://citp.princeton.edu/,...,,CanvasRenderingContext2D.fillText,call,,"[""👨🏾​🤝​👨🏼"",0,0]",2019-07-15T05:09:29.751Z,-2022130726,princeton.edu,princeton.edu,princeton.edu


In [19]:
# Count measureText calls per script PS+1, restricted to scripts whose PS+1
# differs from the top-level page's, most frequent first.
js.loc[
    (js['symbol'] == 'CanvasRenderingContext2D.measureText')
    & (js['script_ps1'] != js['top_ps1'])
].groupby('script_ps1')['top_ps1'].count().sort_values(ascending=False)

Series([], Name: top_ps1, dtype: int64)

In [20]:
def _first_call_argument(raw_arguments):
    """Return the first argument of a recorded JavaScript call.

    ``raw_arguments`` is the JSON text stored in the ``arguments`` column. It
    may decode to an object keyed by position ("0", "1", ...) or to a plain
    array (rows in this crawl look like ``["...",0,0]``); both layouts are
    handled. Returns None when an array-form call recorded no arguments.
    """
    parsed = json.loads(raw_arguments)
    if isinstance(parsed, dict):
        return parsed["0"]
    # Array layout: indexing with "0" would raise TypeError, so use position 0.
    return parsed[0] if parsed else None


# Distinct strings passed to measureText by admicro.vn scripts running on
# pages whose PS+1 differs from the script's.
js[
    (js.symbol == 'CanvasRenderingContext2D.measureText') &
    (js.script_ps1 != js.top_ps1) &
    (js.script_ps1 == 'admicro.vn')
].arguments.apply(_first_call_argument).unique()

array([], dtype=object)

In [21]:
def _font_shorthand_group6(font_value):
    """Return capture group 6 of ``font_shorthand`` for ``font_value``.

    Returns None when the value does not match the pattern, instead of raising
    AttributeError from calling ``.group`` on a failed match.
    """
    match = font_shorthand.match(font_value)
    return match.group(6) if match else None


# Distinct group-6 values (the trailing part of each font string) set on
# CanvasRenderingContext2D.font by admicro.vn scripts running on pages whose
# PS+1 differs from the script's.
js[
    (js.symbol == 'CanvasRenderingContext2D.font') &
    (js.script_ps1 != js.top_ps1) &
    (js.script_ps1 == 'admicro.vn')
].value.apply(_font_shorthand_group6).unique()

array([], dtype=object)