### Load and take a quick look at the data with pandas, Dask and Spark


* [Pandas](#Pandas)
* [Dask](#Dask)
* [Spark](#Spark)

In [1]:
import os

import tldextract

# Directory that holds the extracted dataset; edit this before running the notebook.
DATA_DIR = 'path to where you have extracted the data'
# Join with os.path.join so the path is valid whether or not DATA_DIR ends with
# a path separator (plain string concatenation produced '<dir>sample' when the
# trailing separator was missing).
PARQUET_FILE = os.path.join(DATA_DIR, 'sample')  # I ran this with sample data*

def extract_domain(url):
    """Use tldextract to return the base domain from a url.

    The result is the extracted ``domain`` and ``suffix`` joined with a dot
    (e.g. ``'example.co.uk'``). When one of the two parts is empty it is
    left out, so a host without a public suffix yields ``'localhost'``
    rather than ``'localhost.'``.

    Parameters
    ----------
    url : str
        The URL (or host name) to reduce to its base domain.

    Returns
    -------
    str
        The base domain, or the sentinel ``'ERROR'`` if extraction raised.
    """
    try:
        extracted = tldextract.extract(url)
        # Skip empty parts so a missing suffix does not leave a trailing dot.
        parts = (extracted.domain, extracted.suffix)
        return '.'.join(part for part in parts if part)
    except Exception:
        # Deliberate best-effort: this is applied element-wise to whole
        # columns, so a single bad value must not abort the run.
        return 'ERROR'

<small>*This can also be run on the full dataset, but beware: pandas may use up all of your RAM.</small>


# Pandas

http://pandas.pydata.org/

In [2]:
import pandas as pd

In [3]:
# Read the whole parquet dataset into an in-memory pandas DataFrame.
df = pd.read_parquet(path=PARQUET_FILE, engine='pyarrow')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,argument_0,argument_1,argument_2,argument_3,argument_4,argument_5,argument_6,argument_7,argument_8,arguments,...,script_line,script_loc_eval,script_url,symbol,time_stamp,value,value_1000,value_len,valid,errors
0,,,,,,,,,,{},...,57,,https://staticxx.facebook.com/connect/xd_arbit...,window.name,2017-12-16 02:54:10.079,fb_xdm_frame_https,fb_xdm_frame_https,18,True,
1,,,,,,,,,,{},...,57,,https://staticxx.facebook.com/connect/xd_arbit...,window.name,2017-12-16 02:54:10.080,fb_xdm_frame_https,fb_xdm_frame_https,18,True,
2,,,,,,,,,,{},...,57,,https://staticxx.facebook.com/connect/xd_arbit...,window.document.cookie,2017-12-16 02:54:10.086,,,0,True,
3,,,,,,,,,,{},...,49,,https://staticxx.facebook.com/connect/xd_arbit...,window.navigator.userAgent,2017-12-16 02:54:10.088,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68,True,
4,,,,,,,,,,{},...,25,,https://ajax.googleapis.com/ajax/libs/webfont/...,window.navigator.userAgent,2017-12-16 07:12:07.104,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68,True,


In [4]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['argument_0', 'argument_1', 'argument_2', 'argument_3', 'argument_4',
       'argument_5', 'argument_6', 'argument_7', 'argument_8', 'arguments',
       'arguments_n_keys', 'call_id', 'call_stack', 'crawl_id', 'file_name',
       'func_name', 'in_iframe', 'location', 'operation', 'script_col',
       'script_line', 'script_loc_eval', 'script_url', 'symbol', 'time_stamp',
       'value', 'value_1000', 'value_len', 'valid', 'errors'],
      dtype='object')

In [5]:
# Derive the base domain of the visited page and of the calling script,
# stored as two new columns on df.
df['location_domain'] = df['location'].apply(extract_domain)
df['script_domain'] = df['script_url'].apply(extract_domain)

In [6]:
# Five most frequent page domains, by row count.
df['location_domain'].value_counts().head(n=5)

gap.com               736
officeworks.com.au    708
ufc.ca                518
doubleclick.net       517
disqus.com            479
Name: location_domain, dtype: int64

In [7]:
# Ten most frequent script domains, by row count.
df['script_domain'].value_counts().head(n=10)

google-analytics.com    1317
optimizely.com           573
yandex.ru                422
baidu.com                416
doubleclick.net          407
cloudfront.net           385
moatads.com              287
disquscdn.com            273
fbcdn.net                256
gap.com                  233
Name: script_domain, dtype: int64

Which domains load scripts from both google-analytics.com and yandex.ru?

In [8]:
# Rows whose calling script is served from each of the two domains.
google_analytics = df.loc[df['script_domain'] == 'google-analytics.com']
yandex = df.loc[df['script_domain'] == 'yandex.ru']

In [9]:
# Build the yandex.ru page domains once, as a set, so each membership test is
# a constant-time lookup instead of rebuilding a list on every iteration.
yandex_location_domains = set(yandex.location_domain.unique())

# Walk the google-analytics page domains in first-seen order and print the
# ones that also appear for yandex.ru.
for location_domain in google_analytics.location_domain.unique():
    if location_domain in yandex_location_domains:
        print(location_domain)

vjav.com
newchic.com
zona.mobi


# Dask

http://dask.pydata.org/

In [11]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

In [12]:
# Open the same parquet dataset as a dask DataFrame.
ddf = dd.read_parquet(path=PARQUET_FILE, engine='pyarrow')
# Preview the first five rows.
ddf.head(n=5)

Unnamed: 0,argument_0,argument_1,argument_2,argument_3,argument_4,argument_5,argument_6,argument_7,argument_8,arguments,...,script_line,script_loc_eval,script_url,symbol,time_stamp,value,value_1000,value_len,valid,errors
0,,,,,,,,,,{},...,57,,https://staticxx.facebook.com/connect/xd_arbit...,window.name,2017-12-16 02:54:10.079,fb_xdm_frame_https,fb_xdm_frame_https,18,True,
1,,,,,,,,,,{},...,57,,https://staticxx.facebook.com/connect/xd_arbit...,window.name,2017-12-16 02:54:10.080,fb_xdm_frame_https,fb_xdm_frame_https,18,True,
2,,,,,,,,,,{},...,57,,https://staticxx.facebook.com/connect/xd_arbit...,window.document.cookie,2017-12-16 02:54:10.086,,,0,True,
3,,,,,,,,,,{},...,49,,https://staticxx.facebook.com/connect/xd_arbit...,window.navigator.userAgent,2017-12-16 02:54:10.088,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68,True,
4,,,,,,,,,,{},...,25,,https://ajax.googleapis.com/ajax/libs/webfont/...,window.navigator.userAgent,2017-12-16 07:12:07.104,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68,True,


In [13]:
# Show the column labels of the dask DataFrame.
ddf.columns

Index(['argument_0', 'argument_1', 'argument_2', 'argument_3', 'argument_4',
       'argument_5', 'argument_6', 'argument_7', 'argument_8', 'arguments',
       'arguments_n_keys', 'call_id', 'call_stack', 'crawl_id', 'file_name',
       'func_name', 'in_iframe', 'location', 'operation', 'script_col',
       'script_line', 'script_loc_eval', 'script_url', 'symbol', 'time_stamp',
       'value', 'value_1000', 'value_len', 'valid', 'errors'],
      dtype='object')

What text are scripts writing to the canvas?

In [14]:
# Add the base-domain columns to the dask frame. `meta` declares the name and
# dtype of the Series that apply() returns.
for source_col, domain_col in (('location', 'location_domain'),
                               ('script_url', 'script_domain')):
    ddf[domain_col] = ddf[source_col].apply(extract_domain, meta=('x', 'str'))

In [15]:
# Boolean mask selecting the canvas fillText calls.
is_fill_text = ddf['symbol'] == 'CanvasRenderingContext2D.fillText'

# Materialise the filtered rows, showing a progress bar while dask computes.
with ProgressBar():
    fillTexts = ddf[is_fill_text].compute()

[########################################] | 100% Completed |  0.4s


In [16]:
# Frequency of each text value passed as the first fillText argument
fillTexts['argument_0'].value_counts().to_frame()

Unnamed: 0,argument_0
ð,8
ð§ââï¸,2
ð§ââï¸,2
45,2
38,2
ðºð³,2
"ClientJS,org <canvas> 1.0",2
ðºâð³,2
!H71JCaj)]# 1@#,1
Soft Ruddy Foothold 2,1


In [17]:
# Number of fillText calls per (page domain, script domain, text) combination
grouping_keys = ['location_domain', 'script_domain', 'argument_0']
fillTexts.groupby(grouping_keys).size().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
location_domain,script_domain,argument_0,Unnamed: 3_level_1
bongacams.com,bongacams.com,ðºâð³,1
bongacams.com,bongacams.com,ðºð³,1
bongacams.com,bongacams.com,ð§ââï¸,1
bongacams.com,bongacams.com,ð§ââï¸,1
facebook.com,fbcdn.net,38,2
facebook.com,fbcdn.net,45,2
facebook.com,fbcdn.net,ð,8
gap.com,gap.com,!H71JCaj)]# 1@#,1
gap.com,gap.com,Soft Ruddy Foothold 2,1
syracuse.edu,syracuse.edu,ðºâð³,1


# Spark

https://spark.apache.org/docs/latest/api/python/pyspark.html

We use [findspark](https://github.com/minrk/findspark) to set up Spark.

In [18]:
import findspark

# findspark.init() is called before the pyspark imports below.
# Prefer the SPARK_HOME environment variable when it is set; otherwise fall
# back to the path below.
findspark.init(os.environ.get('SPARK_HOME', '/opt/spark'))  # Adjust for the location where you installed spark

from pyspark import SparkContext
from pyspark.sql import SparkSession

# getOrCreate() reuses an already-running session, so re-running this cell in
# a live kernel does not fail the way constructing a second SparkContext does.
spark = SparkSession.builder.appName("Overscripted").getOrCreate()
# Keep `sc` available for any code that uses the SparkContext directly.
sc = spark.sparkContext

### Read in parquet and display a row

In [19]:
# Reuse PARQUET_FILE so the pandas, dask and Spark sections all read the same
# dataset from one path definition.
sdf = spark.read.parquet(PARQUET_FILE)
# Print a single record with one field per line and without truncating values.
sdf.show(1, vertical=True, truncate=False)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------
 argument_0       |                                                                                                                                           
 argument_1       |                                                                                                                                           
 argument_2       |                                                                                                                                           
 argument_3       |                                                                                                                                           
 argument_4       |                                                                                                                                           
 argument_5       |                           

### Get the distinct symbols and show the highest counts

In [20]:
# Every distinct value of the `symbol` column, printed without truncation.
distinct_symbols = sdf.select('symbol').distinct()
distinct_symbols.show(truncate=False)

+-------------------------------------------------------------+
|symbol                                                       |
+-------------------------------------------------------------+
|window.navigator.appVersion                                  |
|window.navigator.product                                     |
|window.screen.colorDepth                                     |
|RTCPeerConnection.createOffer                                |
|RTCPeerConnection.localDescription                           |
|CanvasRenderingContext2D.font                                |
|HTMLCanvasElement.nodeType                                   |
|CanvasRenderingContext2D.createRadialGradient                |
|RTCPeerConnection.onicecandidate                             |
|window.navigator.buildID                                     |
|CanvasRenderingContext2D.scale                               |
|window.navigator.mimeTypes[application/futuresplash].suffixes|
|RTCPeerConnection.setLocalDescription  

In [21]:
# Count rows per symbol and list the most frequent symbols first.
symbol_counts = sdf.groupBy('symbol').count()
symbol_counts.orderBy('count', ascending=False).show(truncate=False)

+-----------------------------------------------------+-----+
|symbol                                               |count|
+-----------------------------------------------------+-----+
|window.document.cookie                               |3390 |
|window.navigator.userAgent                           |1797 |
|window.Storage.getItem                               |807  |
|window.localStorage                                  |442  |
|window.Storage.setItem                               |363  |
|window.navigator.plugins[Shockwave Flash].description|238  |
|window.sessionStorage                                |236  |
|window.Storage.removeItem                            |185  |
|window.name                                          |185  |
|window.screen.colorDepth                             |166  |
|window.navigator.appName                             |135  |
|window.navigator.platform                            |120  |
|window.navigator.language                            |96   |
|window.