In [1]:
import json, time, sys
import tarfile
import os
import urllib2
import re
from cStringIO import StringIO
from pprint import pprint
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client

%pylab inline

# Base URL under which this job's published output lives; the last cell of
# the notebook downloads '<data_url>/agg_data.tar.gz' from here.
data_url= 'https://analysis-output.telemetry.mozilla.org/beta_release_os_gfx/data'

# This dir is auto created when the script is scheduled
!mkdir -p output

## !!! EDIT THESE BEFORE RUNNING !!!
##
## These represent the number of uniform random samples of raw
## sessions we'll take, from each channel, as a fraction.
##
## When testing on small clusters, it is important to keep the
## fraction low. Using a cluster of size 4, with about 100,000
## pings, the job below took me about 10 minutes. The fraction
## was f=0.001. For iterative development, it is best to use even
## smaller fractions.
##
## As of Nov 2015, 1% (fraction=0.01) of sessions results in
## about 1,000,000 samples for Beta, and about 1/5th that for
## Release. After FHR is removed, the Release population will
## become much, much larger.
##
## NOTE(review): the fractions configured below (0.3) are far larger than
## the values discussed above -- confirm this is intended for the cluster
## the job runs on.
##
## Maps channel name -> options; only the 'fraction' option is read (by the
## loop that calls get_pings_for_channel).
channels = {'beta': {'fraction': 0.3}, 'release': {'fraction': 0.3}}

Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/bucket-whitelist.json). Assuming all histograms are acceptable.
Populating the interactive namespace from numpy and matplotlib




In [2]:
# Windows display names keyed by the (major, minor) integer pair that
# get_prettyname_win() parses out of the reported OS version string.
# The value is appended to 'Windows ' to build the pretty name; pairs that
# are not listed here are reported as 'Windows unknown'.
windows_map = { (3, 5): 'NT',
                (4, 0): '95',
                (4, 1): '98',
                (4, 10): '98',
                (4, 9): 'Me',
                (4, 90): 'Me',
                (5, 0): '2000',
                (5, 1): 'XP',
                (5, 2): 'XP 64',
                (6, 0): 'Vista',
                (6, 1): '7',
                (6, 2): '8',
                (6, 3): '8.1',
                (10, 0): '10' }

# Mac display names keyed by the integer major component of the version
# string reported for 'Darwin' systems (see get_prettyname_mac()).
# The value is appended to 'Mac OS X '; majors that are not listed here
# (anything above 15 or below 8) are reported as 'Mac OS X unknown'.
mac_map = { 15: '10.11.x El Capitan',
            14: '10.10.x Yosemite',
            13: '10.9.x',
            12: '10.8.x',
            11: '10.7.x',
            10: '10.6.x',
            9: '10.5.x',
            8: '10.4.x' }

def get_prettyname(name, number):
    """Return a human readable OS label for a reported OS name and version.

    The family is selected by the prefix of `name` ('Windows', 'Darwin' or
    'Linux') and the version string `number` is handed to the matching
    family-specific formatter.  Any other name yields 'Unknown'.
    """
    pretty = 'Unknown'
    if name.startswith('Windows'):
        pretty = get_prettyname_win(number)
    elif name.startswith('Darwin'):
        pretty = get_prettyname_mac(number)
    elif name.startswith('Linux'):
        pretty = get_prettyname_linux(number)
    return pretty

    
def get_prettyname_win(number):
    """Translate a Windows version string into a label such as 'Windows 7'.

    Only the first two dot-separated components are considered.  A
    non-numeric first component gives 'Windows unknown'; a missing or
    non-numeric second component is treated as minor version 0.  The
    (major, minor) pair is looked up in `windows_map`, falling back to
    'unknown' when the pair is not listed.
    """
    parts = number.split('.')
    if not parts or not parts[0].isdigit():
        return 'Windows unknown'

    major = int(parts[0])
    has_minor = len(parts) >= 2 and parts[1].isdigit()
    minor = int(parts[1]) if has_minor else 0
    label = windows_map.get((major, minor), 'unknown')
    return 'Windows %s' % label


def get_prettyname_mac(number):
    """Translate a Darwin version string into a label such as 'Mac OS X 10.9.x'.

    Only the first dot-separated component is used.  When it is numeric it
    is looked up in `mac_map` (falling back to 'unknown'); otherwise the
    result is 'Mac OS X unknown'.
    """
    parts = number.split('.')
    if not parts or not parts[0].isdigit():
        return 'Mac OS X unknown'

    label = mac_map.get(int(parts[0]), 'unknown')
    return 'Mac OS X %s' % label


def get_prettyname_linux(number):
    """Return the display name for Linux.

    `number` is ignored, so every Linux version is reported under the
    single label 'Linux'.
    """
    return 'Linux'


def get_base_os_name(name):
    """Map a reported OS name to its family label.

    Names starting with 'Windows', 'Darwin' or 'Linux' map to 'Windows',
    'Mac OS X' and 'Linux' respectively; everything else is 'Unknown'.
    """
    families = (
        ('Windows', 'Windows'),
        ('Darwin', 'Mac OS X'),
        ('Linux', 'Linux'),
    )
    for prefix, family in families:
        if name.startswith(prefix):
            return family
    return 'Unknown'

In [3]:
class pciids(object):
    """Lazily loaded lookup table of PCI vendor and device names.

    The table is filled on first use from three sources, in this order:
    the pci.ids file of the pciutils project, the CSV report of
    pcidatabase.com (only for entries pci.ids does not have), and a short
    list of hand-maintained entries which override both.
    """

    # vendorID -> { 'name': vendorName, 'devices': { deviceID: deviceName } }
    __pciids = {}

    @staticmethod
    def __fetch(url):
        """Download `url` and return its body, always closing the response."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    @staticmethod
    def __get_ids():
        """Return the lookup table, loading it on first use.

        If any source fails to load, the partially filled table is discarded
        before the error propagates; otherwise the non-empty partial table
        would be mistaken for a complete one by every later call.
        """
        if not pciids.__pciids:
            try:
                pciids.__get_data_from_pciids()
                pciids.__get_data_from_pcidatabase()
                pciids.__get_data_byhand()
            except Exception:
                pciids.__pciids.clear()
                raise
        return pciids.__pciids

    
    @staticmethod
    def __get_data_from_pciids():
        """Load vendors and devices from the pciutils pci.ids file."""
        url = 'https://raw.githubusercontent.com/pciutils/pciids/master/pci.ids'
        data = pciids.__fetch(url).split('\n')
        
        # Syntax:
        # vendor  vendor_name
        #	device  device_name				<-- single tab
        #		subvendor subdevice  subsystem_name	<-- two tabs
        devices = None # device table of the vendor currently being read
        for line in data:
            if line and not line.startswith('#'):
                if not line.startswith('\t'): # vendor  vendor_name
                    vendorID, sep, vendorName = line.strip().partition('  ')
                    if not sep:
                        # Malformed vendor line: skip it, and do not attach
                        # the devices that follow to the previous vendor.
                        devices = None
                        continue
                    devices = {}
                    pciids.__pciids[vendorID] = { 'name': vendorName, 'devices': devices }
                elif not line.startswith('\t\t'): # device  device_name
                    deviceID, sep, deviceName = line.strip().partition('  ')
                    if sep and devices is not None:
                        devices[deviceID] = deviceName
            elif line and line == '# List of known device classes, subclasses and programming interfaces':
                # Everything after this marker describes device classes.
                break

            
    @staticmethod
    def __get_data_from_pcidatabase():
        """Add entries from pcidatabase.com that pci.ids did not provide."""
        url = 'http://pcidatabase.com/reports.php?type=csv'
        data = pciids.__fetch(url)

        #"0x0033","0x002F","Paradyne Corp.","MAIAM","Spitfire VGA Accelerator"
        pat = re.compile('\"0x([0-9a-fA-F]{4})\",\"0x([0-9a-fA-F]{4})\",\"([^\"]*)\",\"([^\"]*)\",\"([^\"]*)\"')

        for line in data.split('\n'):
            if not line:
                continue
            for m in pat.finditer(line):
                # IDs are lower-cased to match the keys read from pci.ids.
                vendorID = m.group(1).lower()
                deviceID = m.group(2).lower()
                vendorName = m.group(3)
                # Prefer the second description, then the first, then a
                # placeholder built from the vendor name.
                desc = m.group(5) or m.group(4) or '%s Unknown' % vendorName
                if vendorID in pciids.__pciids:
                    devices = pciids.__pciids[vendorID]['devices']
                    if deviceID not in devices:
                        devices[deviceID] = desc
                else:
                    pciids.__pciids[vendorID] = { 'name': vendorName, 'devices': { deviceID: desc } }


    @staticmethod
    def __get_data_byhand():
        """Apply the hand-maintained device names, overriding downloaded ones.

        The vendor name listed here is used only when the vendor is absent
        from the downloaded tables, so a missing vendor no longer raises
        KeyError; an existing vendor entry keeps its downloaded name.
        """
        overrides = (
            # vendorID, fallback vendor name, deviceID, device name
            ('8086', 'Intel Corporation', '22b1', 'Intel(R) HD Graphics'),
            ('8086', 'Intel Corporation', '0d22', 'Intel(R) Iris(TM) Pro Graphics 5200'),
            ('8086', 'Intel Corporation', '22b0', 'Intel(R) HD Graphics'),
            ('8086', 'Intel Corporation', '4102', 'Intel(R) Graphics Media Accelerator 600'),
            ('1022', 'Advanced Micro Devices, Inc. [AMD]', '68f9', 'AMD Radeon Graphics Processor'),
            ('1002', 'Advanced Micro Devices, Inc. [AMD/ATI]', '6920', 'AMD RADEON R9 M395X'),
            ('1002', 'Advanced Micro Devices, Inc. [AMD/ATI]', '714f', 'RV505'),
            ('10de', 'NVIDIA Corporation', '0a6b', 'NVIDIA 9400 GT 512 MB BIOS'),
            ('1414', 'Microsoft Corporation', '02c1', 'Microsoft RemoteFX Graphics Device - WDDM'),
            ('1414', 'Microsoft Corporation', '008c', 'Microsoft Basic Render Driver'),
            ('1414', 'Microsoft Corporation', 'fefe', '0xfefe'),
        )
        for vendorID, vendorName, deviceID, desc in overrides:
            vendor = pciids.__pciids.setdefault(vendorID, { 'name': vendorName, 'devices': {} })
            vendor['devices'][deviceID] = desc


    @staticmethod
    def get_vendor_name(vendorID):
        """Return the vendor name for a normalized vendor ID, or 'Unknown'."""
        vendor = pciids.__get_ids().get(vendorID, None)
        if vendor:
            return vendor['name']
        return 'Unknown'

    
    @staticmethod
    def get_pci_desc(vendorID, deviceID):
        """Return the device description for normalized vendor/device IDs.

        Falls back to '<vendor name> - Unknown' when only the vendor is
        known, and to 'Unknown' when the vendor is not known either.
        """
        vendor = pciids.__get_ids().get(vendorID, None)
        if not vendor:
            return 'Unknown'
        desc = vendor['devices'].get(deviceID, None)
        if desc:
            return desc
        return '%s - Unknown' % vendor['name']


    @staticmethod
    def mk_id(pciid):
        """Normalize a '0x'-prefixed ID into the 4-character table key.

        The prefix is removed; shorter values are left-padded with '0' and
        spaces inside a 4-character value are replaced by '0'.  Values
        without the '0x' prefix, or with more than 4 characters after it,
        give ''.
        """
        if not pciid.startswith('0x'):
            return ''
        pciid = pciid[2:]
        l = len(pciid)
        if l > 4:
            return ''
        if l < 4:
            return '0' * (4 - l) + pciid
        return pciid.replace(' ', '0')

In [4]:
# Disable logging, since this eats a ton of diskspace.
def quiet_logs(sc):
    """Raise the level of the "org" and "akka" log4j loggers to ERROR.

    The log4j package is reached through `sc._jvm`.
    """
    log4j = sc._jvm.org.apache.log4j
    for logger_name in ("org", "akka"):
        log4j.LogManager.getLogger(logger_name).setLevel(log4j.Level.ERROR)
# `sc` is not defined anywhere in this notebook; presumably it is the
# SparkContext injected by the notebook kernel -- confirm.
quiet_logs(sc)

In [5]:
# Constants.
# Slash-separated property paths requested from get_pings_properties() in
# get_pings_for_channel(); the same strings are used as keys to read the
# values back from each ping (see get_os() and get_gfx()).
GfxAdaptersKey = 'environment/system/gfx/adapters'
SystemOsKey = 'environment/system/os'
SubmissionDate = 'meta/submissionDate'

In [6]:
def fmt_date(d):
    """Format a date-like object as a compact 'YYYYMMDD' string."""
    compact_format = "%Y%m%d"
    return d.strftime(compact_format)


# By default a single day is processed: the day two days before the moment
# the notebook runs (start and end of the range are the same date).
# NOTE(review): `datetime` is never imported explicitly in this notebook;
# presumably the name is provided by `%pylab inline` -- confirm, or add
# `import datetime` to the import cell.
today = datetime.datetime.now()
start_date = fmt_date(today - datetime.timedelta(2))
end_date = start_date
# Uncomment (and edit) to process an explicit date range instead:
#start_date = '20160411'
#end_date = '20160422'

def get_pings_for_channel(channel, fraction):
    """Fetch the sampled Firefox v4 pings of one channel.

    Pings submitted between the module-level `start_date` and `end_date`
    are requested for `channel`, sampled with `fraction`.  Each ping is
    reduced to the client id, graphics adapters, OS and submission date
    properties, one ping is kept per client, and `.cache()` is called on
    the result before it is returned.
    """
    wanted_properties = [
        'clientId',
        GfxAdaptersKey,
        SystemOsKey,
        SubmissionDate
    ]

    raw_pings = get_pings(
        sc,
        app='Firefox',
        schema='v4',
        submission_date=(start_date, end_date),
        channel=channel,
        fraction=fraction
    )
    trimmed = get_pings_properties(raw_pings, wanted_properties)
    return get_one_ping_per_client(trimmed).cache()

In [7]:
# channel name -> result of get_pings_for_channel() for that channel,
# using the sampling fraction configured in `channels`.
pings = {}
for channel, v in channels.iteritems():
    pings[channel] = get_pings_for_channel(channel, v['fraction'])

In [9]:
# Sanity check of the sample sizes.  The channel names are hard-coded, so
# both 'beta' and 'release' must be present in `channels`.
print('Found {0} beta pings.'.format(pings['beta'].count()))
print('Found {0} release pings.'.format(pings['release'].count()))

Found 7544 beta pings.
Found 107 release pings.


In [10]:
def add_1dict(x, y):
    """Add the values of `y` into `x`, key by key, and return `x`.

    `x` is modified in place: keys present in both get `x[key] + y[key]`,
    keys only present in `y` are copied over.
    """
    for key, amount in y.iteritems():
        if key in x:
            x[key] = x[key] + amount
        else:
            x[key] = amount
    return x

def add_2dict(x, y):
    """Merge the two-level count dict `y` into `x` and return `x`.

    `x` is modified in place: for keys present in both, the inner dicts are
    combined with add_1dict(); inner dicts only present in `y` are stored
    in `x` as they are (not copied).
    """
    for outer_key, inner in y.iteritems():
        if outer_key in x:
            x[outer_key] = add_1dict(x[outer_key], inner)
        else:
            x[outer_key] = inner
    return x

def get_os(p):
    """Map a ping to an `(os name, {os version: 1})` pair.

    A missing OS section, name or version is reported as 'Unknown' instead
    of raising: one incomplete ping would otherwise abort the whole job,
    either here or later when the name/version are parsed as strings.
    """
    unk = 'Unknown'
    # Named os_info (not `os`) so the `os` module is not shadowed.
    os_info = p[SystemOsKey] or {}
    name = os_info.get('name') or unk
    version = os_info.get('version') or unk
    return (name, { version: 1 })

def get_gfx(p):
    """Map a ping to `(vendorID, {deviceID: {driverVersion: 1}})`.

    Only the first adapter of the ping is considered.  The value always has
    the same nested-dict shape: a ping without adapters is counted as
    'Unknown'/'Unknown'/'Unknown'.  (Returning a bare string for that case
    breaks add_2dict() and improve_gfx(), which both call `.iteritems()` on
    the value.)  Fields that are absent or empty are reported as 'Unknown'
    as well, so that they can be passed to string helpers such as
    pciids.mk_id().
    """
    unk = 'Unknown'
    gfx = p[GfxAdaptersKey]
    adapter = (gfx[0] if gfx else None) or {}
    vendorID = adapter.get('vendorID') or unk
    deviceID = adapter.get('deviceID') or unk
    driverVersion = adapter.get('driverVersion') or unk
    return (vendorID, { deviceID: { driverVersion: 1 } })

def get_stats(pings, action, add):
    """Key every ping with `action`, merge values per key with `add`, and
    return the collected result of `pings.map(...).reduceByKey(...)`."""
    keyed = pings.map(action)
    merged = keyed.reduceByKey(add)
    return merged.collect()

def total(d):
    """Return the sum of all values of the dict `d` (0 when it is empty)."""
    return sum(d.itervalues())

def reduce_version(v):
    """Shorten a dotted version string to 'major.minor'.

    The string is returned unchanged unless it has at least two
    dot-separated components and the first two are both numeric.
    """
    parts = v.split('.')
    if len(parts) < 2:
        return v
    major, minor = parts[0], parts[1]
    if major.isdigit() and minor.isdigit():
        return '.'.join((major, minor))
    return v


def improve_os(oses):
    """Group raw per-OS version counts by OS family and pretty version name.

    `oses` is a sequence of `(os name, {version: count})` pairs, as produced
    by get_stats() with get_os().  The result has the shape

        { 'total': n,
          'oses': { family: { 'total': t, 'versions': { pretty name: count } } } }

    Several raw versions can share one pretty name (every Linux version is
    'Linux', every 6.1.x is 'Windows 7'); their counts are added together.
    """
    _oses = {}
    n = 0
    for entry in oses:
        os_name = entry[0]
        versions = entry[1]
        t = total(versions)

        base_name = get_base_os_name(os_name)
        if base_name in _oses:
            family = _oses[base_name]
            family['total'] += t
        else:
            family = { 'total': t, 'versions': {} }
            _oses[base_name] = family
        _versions = family['versions']

        for version, value in versions.iteritems():
            name = get_prettyname(os_name, version)
            # Membership has to be tested against the output dict
            # (_versions), not the raw input dict: testing the input made
            # later counts overwrite earlier ones instead of adding up.
            _versions[name] = _versions[name] + value if name in _versions else value
        
        n += t
        
    return { 'total': n, 'oses': _oses}

def improve_gfx(gfxs):
    """Group raw graphics counts by vendor name and device description.

    `gfxs` is a sequence of `(vendorID, {deviceID: {driver version: count}})`
    pairs.  IDs are normalized with pciids.mk_id() and translated to names
    through the pciids table; entries that end up with the same vendor name
    or device description are merged.  The result has the shape

        { 'total': n,
          'gfxs': { vendor: { 'total': t,
                              'devices': { device: { 'total': t,
                                                     'versions': { version: count } } } } } }
    """
    by_vendor = {}
    grand_total = 0
    for record in gfxs:
        vendorID = record[0]
        vendor_key = pciids.mk_id(vendorID)
        vendorName = pciids.get_vendor_name(vendor_key)
        vendor_entry = by_vendor.setdefault(vendorName, { 'total': 0, 'devices': {} })
        known_devices = vendor_entry['devices']

        vendor_count = 0
        for deviceID, versions in record[1].iteritems():
            count = total(versions)
            vendor_count += count

            deviceName = pciids.get_pci_desc(vendor_key, pciids.mk_id(deviceID))
            device_entry = known_devices.get(deviceName)
            if device_entry is None:
                device_entry = { 'total': count, 'versions': {} }
                known_devices[deviceName] = device_entry
            else:
                device_entry['total'] += count

            # Driver versions are kept as reported (reduce_version() is
            # deliberately not applied here).
            merged = device_entry['versions']
            for version, value in versions.iteritems():
                if version in merged:
                    merged[version] = merged[version] + value
                else:
                    merged[version] = value

        vendor_entry['total'] += vendor_count
        grand_total += vendor_count

    return { 'total': grand_total, 'gfxs': by_vendor}


In [11]:
# channel name -> OS statistics in the shape returned by improve_os().
os_data = { }
for channel, p in pings.iteritems():
    # Count versions per OS name, then group them by family / pretty name.
    data = get_stats(p, get_os, add_1dict)
    data = improve_os(data)
    os_data[channel] = data

In [12]:
# channel name -> graphics statistics in the shape returned by improve_gfx().
gfx_data = { }
for channel, p in pings.iteritems():
    # Count driver versions per vendor/device ID, then translate the IDs
    # to vendor and device names.
    data = get_stats(p, get_gfx, add_2dict)
    data = improve_gfx(data)
    gfx_data[channel] = data

In [23]:
# Result of this run, in the layout that aggregate() merges and that is
# serialized to agg_data.json.  Note that `data` is rebound here: the two
# loops above used the same name for their per-channel intermediate results.
data = { 'start_date': start_date, 'end_date': end_date, 'platforms': os_data, 'graphics': gfx_data }

In [71]:
#pprint(data['graphics'])

In [24]:
def aggregate(agg, data):
    """Merge the statistics of one run (`data`) into the aggregate `agg`.

    Both arguments use the layout built by this notebook:

        { 'end_date': ...,
          'platforms': { channel: { 'total': n, 'oses': { os: { 'total': n, 'versions': {...} } } } },
          'graphics':  { channel: { 'total': n, 'gfxs': { vendor: { 'total': n,
                             'devices': { device: { 'total': n, 'versions': {...} } } } } } } }

    `agg` is updated in place and nothing is returned.  Its 'end_date' is
    replaced by the one of `data`; totals and version counts are added.
    Entries that only exist in `data` are stored in `agg` as they are (not
    copied), and a channel that is missing from `agg` is created.
    """
    agg['end_date'] = data['end_date']
    
    # we aggregate the os
    for channel, data_os in data['platforms'].iteritems():
        agg_os = agg['platforms'].setdefault(channel, { 'total': 0, 'oses': {} })
        agg_os['total'] += data_os['total']
        agg_oses = agg_os['oses']
        for osName, os_entry in data_os['oses'].iteritems():
            if osName in agg_oses:
                agg_entry = agg_oses[osName]
                agg_entry['total'] += os_entry['total']
                add_1dict(agg_entry['versions'], os_entry['versions'])
            else:
                agg_oses[osName] = os_entry
                
    # we aggregate the gfxs
    for channel, data_gfx in data['graphics'].iteritems():
        agg_gfx = agg['graphics'].setdefault(channel, { 'total': 0, 'gfxs': {} })
        agg_gfx['total'] += data_gfx['total']
        agg_gfxs = agg_gfx['gfxs']
        for vendorName, vendor in data_gfx['gfxs'].iteritems():
            if vendorName not in agg_gfxs:
                agg_gfxs[vendorName] = vendor
                continue

            agg_vendor = agg_gfxs[vendorName]
            agg_vendor['total'] += vendor['total']
            agg_devices = agg_vendor['devices']
            for deviceName, device in vendor['devices'].iteritems():
                if deviceName in agg_devices:
                    agg_device = agg_devices[deviceName]
                    agg_device['total'] += device['total']
                    add_1dict(agg_device['versions'], device['versions'])
                else:
                    # A device seen for the first time belongs in the
                    # vendor's device table (it used to be written into
                    # whatever entry had been updated last).
                    agg_devices[deviceName] = device

#{'beta': {'gfxs': {'Advanced Micro Devices, Inc. [AMD/ATI]': {'devices': {'BeaverCreek [Radeon HD 6520G]': {'total': 1,
#                                                                                                            'versions': {u'8.901': 1}}

In [31]:
# Fetch the previously published aggregate, if there is one.
#
# Only a "not there" HTTP answer is treated as a first run.  Every other
# failure (timeout, DNS problem, server error, ...) is allowed to abort the
# job: swallowing it would publish an archive containing only the data of
# this run and silently drop the whole history.
# NOTE(review): 403 is accepted next to 404 because some static hosts answer
# 403 for a missing object -- confirm which status this host really sends.
try:
    response = urllib2.urlopen('%s/agg_data.tar.gz' % data_url)
    try:
        tgz = response.read()
    finally:
        response.close()
except urllib2.HTTPError as e:
    if e.code not in (403, 404):
        raise
    tgz = None

if tgz:
    # Read the single expected member straight from the archive rather than
    # unpacking whatever the download contains into the working directory.
    with tarfile.open(mode='r:gz', fileobj=StringIO(tgz)) as tar:
        agg_data = json.load(tar.extractfile('agg_data.json'))
    # Merges the result of this run into agg_data (in place).
    aggregate(agg_data, data)
else:
    # No previous aggregate: this run becomes the first one.
    agg_data = data

with open('agg_data.json', 'w') as Out:
    json.dump(agg_data, Out)

# Publish the updated aggregate and remove the temporary json file.
with tarfile.open('./output/agg_data.tar.gz', 'w:gz', compresslevel=9) as tar:
    tar.add('agg_data.json', arcname='agg_data.json')
os.remove('agg_data.json')