In [6]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import urllib2
import datetime
import sys

from moztelemetry.spark import get_pings, get_pings_properties
from moztelemetry.dataset import Dataset
import moztelemetry.spark

%pylab inline

from operator import add

Populating the interactive namespace from numpy and matplotlib


In [2]:
## Show the default parallelism of the Spark context.
## NOTE(review): `sc` is never created in this notebook - presumably it is provided by the notebook environment; confirm.
sc.defaultParallelism

32

### Extract a working dataset

Collect nightly data from builds dated within a 2-week window.

In [3]:
def fmt_date(d):
    """ Render a date/datetime as a compact YYYYMMDD string. """
    return "{:%Y%m%d}".format(d)

## Dates bounding the time window to look at.
## Take a single timestamp so that both bounds are measured from the same instant
## (two separate now() calls could land on either side of midnight and skew the window).
_windowAnchor = datetime.datetime.now()
t1 = fmt_date(_windowAnchor - datetime.timedelta(days=16)) # go back 16 days
t2 = fmt_date(_windowAnchor - datetime.timedelta(days=2)) # go back 2 days
t1, t2

('20161115', '20161129')

In [12]:
## Collect only saved-session pings (which cover full browser sessions rather than the usual subsessions).
## Restrict to Firefox nightly pings whose build ID lies between the t1 and t2 bounds
## (a plain string comparison against the YYYYMMDD strings computed above).
## NOTE(review): if appBuildId values are longer than 8 characters (e.g. full build timestamps),
## a build made on date t2 compares greater than t2 and is left out - confirm the intended upper bound.

%time pings = Dataset.from_source("telemetry")\
                     .where(docType="saved_session")\
                     .where(appName="Firefox")\
                     .where(appUpdateChannel="nightly")\
                     .where(appBuildId=lambda b: b >= t1 and b <= t2)\
                     .records(sc)

CPU times: user 3.68 s, sys: 2.17 s, total: 5.85 s
Wall time: 38.8 s


In [14]:
def parseAddons(addons):
    """ Create a list of add-ons with elements of the form (ID, version).

        addons: mapping of add-on ID to that add-on's record (a dict that may carry
                a "version" entry). A missing mapping (None or empty) yields an empty list.

        Returns a list of (ID, version) tuples; version is None when the record has no
        "version" entry. No filtering happens here: every entry of the mapping is returned.
    """
    if not addons:
        return []
    ## items() rather than iteritems(), so this does not depend on dict.iteritems being available.
    return [(guid, info.get("version")) for guid, info in addons.items()]

def extract(ping):
    """ Extract relevant fields from each payload.

        Returns None when the ping has no payload or no environment section; otherwise a dict with:

        clientId: The ping's client ID (None if absent).
        sessionLength: Session length in seconds (-1 if not reported).
        shims: Reason why add-on shims were used, keyed by add-on ID (enumerated count of reason codes)
        cpowTime: Contiguous time spent by an add-on blocking the main loop by performing a blocking
                  cross-process call (microseconds, keyed by add-on ID).
        cpowForbidden: Number of times an add-on used CPOWs when it was marked as e10s compatible
                       (count, keyed by add-on ID).
        addons: List of (ID, version) for each entry of the environment's active add-ons.
    """
    payload = ping.get("payload") or {}
    env = ping.get("environment") or {}
    if not payload or not env:
        return None
    ## A section may be present in the ping but set to null; `or {}` treats that
    ## the same as a missing section instead of failing on None.get(...).
    keyed = payload.get("keyedHistograms") or {}
    info = payload.get("info") or {}
    activeAddons = (env.get("addons") or {}).get("activeAddons") or {}
    return {
        "clientId": ping.get("clientId", None),
        "sessionLength": info.get("sessionLength", -1), ## in seconds
        "shims": keyed.get("ADDON_SHIM_USAGE") or {},
        "cpowTime": keyed.get("PERF_MONITORING_SLOW_ADDON_CPOW_US") or {},
        "cpowForbidden": keyed.get("ADDON_FORBIDDEN_CPOW_USAGE") or {},
        "addons": parseAddons(activeAddons)
    }

def is_relevant(ping):
    """ Keep only records that were extracted successfully, list at least one add-on,
        and report a positive session length.
    """
    if not ping:
        return False
    if not ping["addons"]:
        return False
    return ping["sessionLength"] > 0

## Pull the fields of interest out of every ping, keep only the relevant sessions
## (those with add-ons and a valid session length), and persist the result.
bySession = (
    pings.map(extract)
         .filter(is_relevant)
         .persist(StorageLevel.MEMORY_AND_DISK_SER)
)

The `bySession` dataset has one record per client session which had enabled add-ons.

How many session pings are in the dataset?

In [15]:
## Count the session records; %time prints how long the count took.
%time bySession.count()

CPU times: user 28 ms, sys: 12 ms, total: 40 ms
Wall time: 7min 15s


912202

How many unique clients do these come from?

In [16]:
## Number of distinct client IDs across all session records.
(
    bySession
    .map(lambda session: session["clientId"])
    .distinct()
    .count()
)

54298

How many add-ons are represented in the dataset, and what are the top few?

In [17]:
## Pair every add-on GUID with each client that had it (versions ignored),
## drop repeated pairs, then tally the number of distinct clients per GUID.
addonCounts = (
    bySession
    .flatMap(lambda p: [(guid, p["clientId"]) for (guid, version) in p["addons"]])
    .distinct()
    .map(lambda pair: pair[0])
    .countByValue()
)
len(addonCounts)

7335

In [18]:
## Top 20 GUIDs by decreasing client count, ties broken by GUID.
sorted(addonCounts.items(), key = lambda item: (-item[1], item[0]))[:20]

[(u'aushelper@mozilla.org', 54201),
 (u'formautofill@mozilla.org', 54174),
 (u'flyweb@mozilla.org', 54088),
 (u'webcompat@mozilla.org', 54034),
 (u'e10srollout@mozilla.org', 53953),
 (u'firefox@getpocket.com', 53780),
 (u'presentation@mozilla.org', 47975),
 (u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', 10246),
 (u'uBlock0@raymondhill.net', 7011),
 (u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}', 3446),
 (u'{e4a8a97b-f2ed-450b-b12d-ee082ba24781}', 2617),
 (u'{DDC359D1-844A-42a7-9AA1-88A850A938A8}', 1940),
 (u'{73a6fe31-595d-460b-a920-fcc0f8843232}', 1747),
 (u'firebug@software.joehewitt.com', 1613),
 (u'{46551EC9-40F0-4e47-8E18-8E5CF550CFB8}', 1522),
 (u'firefox@mega.co.nz', 1452),
 (u'@testpilot-addon', 1414),
 (u'firefox@ghostery.com', 1308),
 (u'{b9bfaf1c-a63f-47cd-8b9a-29526ced9060}', 1228),
 (u'support@lastpass.com', 1209)]

Restrict computations to a set of add-ons (GUID/version pairs) that have enough data to draw reasonable measurements. We consider add-ons that are installed in at least 50 profiles, and have been active during a combined session time amounting to at least 1 hour in length.

In [19]:
## Get a count of unique profiles for each (GUID, version) pair.
addonInstalls = bySession.flatMap(lambda p: [(addon, p["clientId"]) for addon in p["addons"]])\
    .distinct()\
    .map(lambda (addon, clientId): (addon, 1))\
    .reduceByKey(add)

## Get total session time for each (GUID, version) pair.
addonTime = bySession.flatMap(lambda p: [(addon, p["sessionLength"]) for addon in p["addons"]])\
    .reduceByKey(add)

addonStats = addonInstalls.join(addonTime).\
    filter(lambda (addon, (nInstalls, totalTime)): nInstalls >= 50 and totalTime >= 3600)

How many add-ons (split by version) does this leave?

In [20]:
## Number of (GUID, version) pairs that passed the install-count and session-time thresholds.
addonStats.count()

533

What are the top (GUID, version) pairs (by installs)?

In [21]:
## Entries are ((GUID, version), (nInstalls, totalTime)); list the 20 with the most installs.
addonStats.sortBy(lambda entry: -entry[1][0]).take(20)

[((u'aushelper@mozilla.org', u'1.0'), (54201, 8491729555)),
 ((u'formautofill@mozilla.org', u'1.0'), (54174, 8484904045)),
 ((u'flyweb@mozilla.org', u'1.0.0'), (54088, 8471076088)),
 ((u'webcompat@mozilla.org', u'1.0'), (54034, 8471245627)),
 ((u'e10srollout@mozilla.org', u'1.6'), (53939, 8455384557)),
 ((u'firefox@getpocket.com', u'1.0.5'), (53778, 8419034304)),
 ((u'presentation@mozilla.org', u'1.0.0'), (47975, 6291918971)),
 ((u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', u'2.8.1'), (8273, 932456306)),
 ((u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', u'2.8.2'), (7735, 1004019367)),
 ((u'uBlock0@raymondhill.net', u'1.9.16'), (6624, 1325904284)),
 ((u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}', u'6.1.1'), (3316, 532710605)),
 ((u'{e4a8a97b-f2ed-450b-b12d-ee082ba24781}', u'3.9'), (2521, 655582571)),
 ((u'{DDC359D1-844A-42a7-9AA1-88A850A938A8}', u'3.0.8'), (1898, 405692279)),
 ((u'uBlock0@raymondhill.net', u'1.10.0'), (1686, 92988288)),
 ((u'firebug@software.joehewitt.com', u'2.0.18'), (1492, 27

### Shims

Count enabled add-on installs (ID and version), together with whether or not they were observed to use shims.

An add-on is counted as using shims if it has an entry in the `ADDON_SHIM_USAGE` keyed histogram for at least one client session (regardless of the values in the histogram). This histogram records shim usage occurrences by the [reason the shim was used](https://dxr.mozilla.org/mozilla-central/source/toolkit/components/addoncompat/CompatWarning.jsm#94).

In [22]:
## Summarize shim usage per add-on/client.
## Reduce multiple sessions observed for each client to
## a single entry of the form ((ID, version), clientID, usedShims).

def getShimData(d):
    """ For one session record, emit an (((GUID, version), clientID), usedShims) pair per add-on,
        where usedShims tells whether the add-on's GUID has an entry in the session's shim data.
    """
    pairs = []
    for addonv in d["addons"]:
        key = (addonv, d["clientId"])
        usedShims = addonv[0] in d["shims"]
        pairs.append((key, usedShims))
    return pairs

## Collapse all sessions of a client into one flag per (add-on, client):
## True as soon as any session showed shim usage. Then flatten the key.
shimUsageByClient = (
    bySession
    .flatMap(getShimData)
    .reduceByKey(lambda seenBefore, seenNow: seenBefore or seenNow)
    .map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
)

In [23]:
## Count the clients behind each (add-on, usedShims) combination.
## Output rows have the form ((ID, version), usedShims, # clients).

shimUsageCounts = (
    shimUsageByClient
    .map(lambda row: ((row[0], row[2]), 1))
    .reduceByKey(add)
    .map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
)

In [24]:
## Merge the rows of each (add-on, version) pair: the pair counts as having used shims
## if any of its rows does, and the client counts of its rows are added up.
## Output rows have the form
##  ((ID, version), <used shims in at least one client session>, overall # installations).

shimUsageByAddon = (
    shimUsageCounts
    .map(lambda row: (row[0], (row[1], row[2])))
    .reduceByKey(lambda left, right: (left[0] or right[0], left[1] + right[1]))
    .map(lambda kv: (kv[0], kv[1][0], kv[1][1]))
)

Sanity check: how many add-on (GUID, version) pairs do we have?

In [25]:
## Number of (GUID, version) rows in shimUsageByAddon, before restricting to the add-ons in addonStats.
shimUsageByAddon.count()

10007

Restrict the final shim usage dataset to the set of add-ons we are interested in.

In [26]:
## Re-key the rows by (GUID, version), join against addonStats to keep only the add-ons
## of interest, and rebuild ((GUID, version), usedShims, count) rows from the shim side of the join.
shimUsageFiltered = (
    shimUsageByAddon
    .map(lambda row: (row[0], (row[1], row[2])))
    .join(addonStats)
    .map(lambda kv: (kv[0],) + kv[1][0])
    .collect()
)

## Order by decreasing installation count (remaining fields break ties).
shimUsageFiltered.sort(key = lambda row: (-row[-1], row[:-1]))

How many add-ons are in the final dataset?

In [27]:
## Number of (GUID, version) rows left after joining with addonStats.
len(shimUsageFiltered)

533

How many of these used shims?

In [28]:
## Rows are ((GUID, version), usedShims, count); keep those whose usedShims flag is set.
shimUsageShimmed = [(addon, usedShims, count)
                    for (addon, usedShims, count) in shimUsageFiltered
                    if usedShims]
len(shimUsageShimmed)

225

Dump results to a JSON file that will be used in the HTML page.

The file is one big JSON array with elements of the form `[<numInstallations>, [<GUID>, <version>], <usedShims>]`.

In [29]:
def formatForJSON(d):
    """ Move the last element of the tuple to the front, keeping the others in order. """
    leading, last = d[:-1], d[-1]
    return (last,) + leading

## Reorder every row for the page, then write the whole list out as one JSON array.
shimUsageOutput = [formatForJSON(row) for row in shimUsageFiltered]

with open('shim-data.json', 'w') as f:
    json.dump(shimUsageOutput, f)

The shim usage data, ordered by decreasing installation count.

In [30]:
## Display the filtered shim usage rows, each of the form ((GUID, version), usedShims, count).
shimUsageFiltered

[((u'aushelper@mozilla.org', u'1.0'), False, 54201),
 ((u'formautofill@mozilla.org', u'1.0'), False, 54174),
 ((u'flyweb@mozilla.org', u'1.0.0'), False, 54088),
 ((u'webcompat@mozilla.org', u'1.0'), False, 54034),
 ((u'e10srollout@mozilla.org', u'1.6'), False, 53939),
 ((u'firefox@getpocket.com', u'1.0.5'), True, 53778),
 ((u'presentation@mozilla.org', u'1.0.0'), False, 47975),
 ((u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', u'2.8.1'), True, 8273),
 ((u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', u'2.8.2'), True, 7735),
 ((u'uBlock0@raymondhill.net', u'1.9.16'), True, 6624),
 ((u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}', u'6.1.1'), True, 3316),
 ((u'{e4a8a97b-f2ed-450b-b12d-ee082ba24781}', u'3.9'), True, 2521),
 ((u'{DDC359D1-844A-42a7-9AA1-88A850A938A8}', u'3.0.8'), False, 1898),
 ((u'uBlock0@raymondhill.net', u'1.10.0'), True, 1686),
 ((u'firebug@software.joehewitt.com', u'2.0.18'), False, 1492),
 ((u'{46551EC9-40F0-4e47-8E18-8E5CF550CFB8}', u'2.0.7'), False, 1468),
 ((u'firefox@ghostery.c

### CPOWs

CPOW usage is recorded in the `PERF_MONITORING_SLOW_ADDON_CPOW_US` histogram as time in microseconds spent by an add-on blocking the main loop using a CPOW.

Summarize CPOW usage for each enabled add-on (ID and version) by:

- average number of microseconds per CPOW blocking occurrence
- average number of blocking occurrences per hour of session time

Note that in many cases an add-on has an entry in the histogram, but it only has observations with the value 0. It appears that the histogram is automatically recorded at the same time as `PERF_MONITORING_SLOW_ADDON_JANK_US` by the [AddonWatcher](https://dxr.mozilla.org/mozilla-central/source/toolkit/components/perfmonitoring/AddonWatcher.jsm#128), and so may not have any CPOW blocking to report at that time. This is handled by dropping observations in the histograms' '0' bucket.

In [31]:
## Summarize CPOW usage per add-on, returning entries of the form
## ((GUID, version), { 
##    "totalTime" : <total session time in seconds with this add-on>,
##    "numOccurrences": <total number of times add-on CPOW blocked main loop>,
##    "totalCPOWTime": <total blocking time for add-on CPOWs>
## })

def getCPOWData(d):
    """ Build one ((GUID, version), summary) pair per add-on listed in the session record.

        Each summary holds the session length ("totalTime"), the number of recorded CPOW
        blocking occurrences ("numOccurrences") and the histogram sum ("totalCPOWTime").
        The last two stay at 0 unless the add-on has a histogram with a positive sum.
    """
    summaries = []
    for addonv in d["addons"]:
        sessionLength = d["sessionLength"]
        hist = d["cpowTime"].get(addonv[0])

        if hist and hist["sum"] > 0:
            ## Occurrences are the counts of every bucket except the "0" bucket.
            occurrences = sum([count for bucket, count in hist["values"].items() if bucket != "0"])
            blockedTime = hist["sum"]
        else:
            ## No histogram, or a histogram holding nothing but zeros: nothing to report.
            occurrences = 0
            blockedTime = 0

        summaries.append((addonv, {
            "totalTime": sessionLength,
            "numOccurrences": occurrences,
            "totalCPOWTime": blockedTime
        }))
    return summaries

def dictSum(a, b):
    """ Return a new dict holding, for every key of `a`, the sum of that key's values in `a` and `b`. """
    return {key: a[key] + b[key] for key in a}

## Expand each session into per-add-on records, then add the records up per (GUID, version) key.
cpowBySession = bySession.flatMap(getCPOWData).reduceByKey(dictSum)

Restrict the final CPOW dataset to the set of add-ons we are interested in.

In [32]:
## Join against addonStats to keep only the add-ons of interest,
## then drop the stats side of the join again.
cpowFiltered = (
    cpowBySession
    .join(addonStats)
    .map(lambda kv: (kv[0], kv[1][0]))
)

In [33]:
## Summarize add-on CPOW usage with:
## - hadCPOWBlocking: were there any CPOW blocking occurrences?
## - avgBlockingTime: the average blocking time spent per occurrence (truncated to the nearest microsecond)
## - occurrenceFreq: the average number of blocking occurrences per session hour.

def summaryCPOWTime(d):
    """ Summarize the accumulated CPOW totals of one add-on.

        d: dict with "totalTime" (session time in seconds), "numOccurrences"
           (number of blocking occurrences) and "totalCPOWTime" (total blocking time).

        Returns a dict with:
          hadCPOWBlocking: whether there was at least one blocking occurrence.
          avgBlockingTime: total blocking time per occurrence, truncated to a whole number
                           (0 when there was no blocking).
          occurrenceFreq: blocking occurrences per hour of session time (0 when there was no blocking).
    """
    sessionHours = float(d["totalTime"]) / 3600
    hadCPOWBlocking = d["numOccurrences"] > 0
    return {
        "hadCPOWBlocking": hadCPOWBlocking,
        ## Since these are microseconds anyway, truncate. `//` states the integer division
        ## explicitly instead of relying on `/` truncating for integer operands.
        "avgBlockingTime": d["totalCPOWTime"] // d["numOccurrences"] if hadCPOWBlocking else 0,
        "occurrenceFreq": float(d["numOccurrences"]) / sessionHours if hadCPOWBlocking else 0
    }

## Turn each add-on's accumulated totals into its summary dict and collect the results into a local list.
cpowSummary = cpowFiltered.mapValues(summaryCPOWTime).collect()

How many add-ons are left in the dataset?

In [34]:
## Number of (GUID, version) pairs that have a CPOW summary.
len(cpowSummary)

533

How many of these had blocking CPOWs?

In [35]:
## Entries are ((GUID, version), summary); keep those flagged as having had CPOW blocking.
cpowHadBlocking = [(addon, summary)
                   for (addon, summary) in cpowSummary
                   if summary["hadCPOWBlocking"]]
len(cpowHadBlocking)

268

Dump data for add-ons that had CPOW blocking to a JSON file that will be used in the HTML page.

The file is one big JSON array with elements of the form `[[<GUID>, <version>], <avgBlockingTime>, <occurrenceFreq>]`.

In [36]:
## Named differently from the shim formatter (formatForJSON) so that running this cell
## does not silently replace that function and break a re-run of the shim dump cell.
def formatCPOWForJSON(entry):
    """ Flatten one ((GUID, version), summary) entry into
        ((GUID, version), avgBlockingTime, occurrenceFreq) for the JSON output.
    """
    addon, summary = entry
    return (addon, summary["avgBlockingTime"], summary["occurrenceFreq"])

## The file holds only the add-ons that had CPOW blocking, so build it from
## cpowHadBlocking rather than from the full cpowSummary list.
## Order by decreasing average blocking time.
cpowOutput = sorted(map(formatCPOWForJSON, cpowHadBlocking), key = lambda row: -row[1])


with open('cpow-data.json', 'w') as f:
    json.dump(cpowOutput, f)

The blocking CPOW data, ordered by decreasing average blocking time.

In [68]:
## Entries are ((GUID, version), summary); largest average blocking time first.
sorted(cpowHadBlocking, key = lambda entry: entry[1]["avgBlockingTime"], reverse = True)

[((u'{888d99e7-e8b5-46a3-851e-1ec45da1e644}', u'45.0.0'),
  {'avgBlockingTime': 3042178L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 0.22416210313137333}),
 ((u'{53A03D43-5363-4669-8190-99061B2DEBA5}', u'1.5.14'),
  {'avgBlockingTime': 2354678,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 0.023873173240853442}),
 ((u'printedit@DW-dev', u'17.2'),
  {'avgBlockingTime': 543255,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 0.007755238384846456}),
 ((u'save-as-pdf-ff@pdfcrowd.com', u'1.5.1-signed.1-signed'),
  {'avgBlockingTime': 541518,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 0.0059135096849377514}),
 ((u'{e8deb9e5-5688-4655-838a-b7a121a9f16e}', u'48.2'),
  {'avgBlockingTime': 447617,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 2.6191022658962115}),
 ((u'vk@sergeykolosov.mp', u'0.3.9.5'),
  {'avgBlockingTime': 440312L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 17.187662224843848}),
 ((u'{cd617375-6743-4ee8-bac4-fbf10f35729e}', u'2.9.6'),
  {'avgBlockingTi

The blocking CPOW data, ordered by decreasing occurrence frequency.

In [69]:
## Entries are ((GUID, version), summary); highest occurrence frequency first.
sorted(cpowHadBlocking, key = lambda entry: entry[1]["occurrenceFreq"], reverse = True)

[((u'{4ED1F68A-5463-4931-9384-8FFF5ED91D92}', u'5.0.226.0'),
  {'avgBlockingTime': 172276L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 224.70736422914163}),
 ((u'{4ED1F68A-5463-4931-9384-8FFF5ED91D92}', u'5.0.248.0'),
  {'avgBlockingTime': 148177L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 215.10929652587956}),
 ((u'thumbnailZoom@dadler.github.com', u'4.0'),
  {'avgBlockingTime': 153362L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 89.86026368995248}),
 ((u'paulsaintuzb@gmail.com', u'8.2.1'),
  {'avgBlockingTime': 313362L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 58.33050616409258}),
 ((u'{19503e42-ca3c-4c27-b1e2-9cdb2170ee34}', u'1.5.6.13'),
  {'avgBlockingTime': 311666L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 53.30599978094404}),
 ((u'artur.dubovoy@gmail.com', u'13.2.4'),
  {'avgBlockingTime': 286156L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 50.589353902683094}),
 ((u'{73a6fe31-595d-460b-a920-fcc0f8843232}', u'2.9.0.14rc1'),
  {'avgBlockin

Finally, push the output files to S3.

In [37]:
import boto3
from boto3.s3.transfer import S3Transfer

## Destination of the published data files.
data_bucket = "telemetry-public-analysis-2"
s3path = "e10s-addon-perf-2/data"

client = boto3.client('s3', 'us-west-2')
transfer = S3Transfer(client)

## Each local file is uploaded under the key "<s3path>/<file name>".
for f in ("shim-data.json", "cpow-data.json"):
    transfer.upload_file(f, data_bucket, "/".join((s3path, f)))