# main_summary example

In [1]:
## Read the main_summary v3 Parquet dataset from S3 into a Spark DataFrame
mainpingspq = sqlContext.read.format("parquet").load("s3://telemetry-parquet/main_summary/v3")

In [2]:
## Keep only Firefox pings on the 'release' channel that fall in sample_id '42'
## (a single sample_id bucket, used here as a 1% sample).
u1 = mainpingspq.filter(
    (mainpingspq.app_name == 'Firefox')
    & (mainpingspq.sample_id == '42')
    & (mainpingspq.normalized_channel == 'release')
)

In [3]:
## Check whether this DataFrame is local; the output below reports False
mainpingspq.isLocal()

False

In [None]:
## Take one example row from the filtered DataFrame u1
u1.take(1)

In [4]:
## Selecting only the columns you need is always a VERY good idea, e.g.
u2 = u1.select("app_name", "client_id", "subsession_start_date")

In [5]:
## Fetch a single sample row from the three-column selection u2
u2.take(1)

KeyboardInterrupt: 

In [5]:
## Inspect the Python type of u2
type(u2)
## Expected: pyspark.sql.dataframe.DataFrame

pyspark.sql.dataframe.DataFrame

In [6]:
## List the column names of u2
u2.columns
## Expected: ['app_name', 'client_id', 'subsession_start_date']

['app_name', 'client_id', 'subsession_start_date']

## Register Spark DF as a SQL table

In [3]:
## Register the Spark DataFrame u1 as a SQL table named "U1"
## so that it can be queried with SQL through sqlContext.sql(...)

sqlContext.registerDataFrameAsTable(df=u1, tableName="U1")

In [15]:
## Count the distinct client_ids in the registered table U1.
## NOTE: the app_name and sample_id conditions repeat filters that were already
## applied when u1 was built. This also rebinds u2, which previously held the
## three-column selection of u1.
u2 = sqlContext.sql("""
select
count(distinct client_id) as num_users
from U1
where app_name = 'Firefox'
and sample_id = '42'
""")

In [16]:
## Run the query and return its result as a list of Row objects
u2.collect()

[Row(num_users=6279237)]

In [17]:
## Distinct client counts per OS from table U1, keeping the 5 largest groups
## ("order by 1 desc" sorts on the first selected column, num_users).
u3 = sqlContext.sql("""
select count(distinct client_id) as num_users,
os
from U1
where app_name = 'Firefox'
and sample_id = '42'
group by os
order by 1 desc
limit 5
""")

In [18]:
## Run the per-OS query and return its rows as a list of Row objects
u3.collect()

[Row(num_users=5660534, os=u'Windows_NT'),
 Row(num_users=319862, os=u'Linux'),
 Row(num_users=298672, os=u'Darwin'),
 Row(num_users=798, os=u'Windows_95'),
 Row(num_users=246, os=u'Windows_98')]

In [6]:
## Same per-OS distinct-client count, expressed with the DataFrame API.
## There is no limit here, so show() also prints the rare OS values
## that fall below the top five.
(
    u1.select("client_id", "os")
    .distinct()
    .groupBy("os")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+------------------+-------+
|                os|  count|
+------------------+-------+
|        Windows_NT|5660534|
|             Linux| 319862|
|            Darwin| 298672|
|        Windows_95|    798|
|        Windows_98|    246|
|              null|      2|
|[object Generator]|      2|
|        Wiｮdows_NT|      1|
|      GNU/kFreeBSD|      1|
|   Windows_Unknown|      1|
|         10.4.0.20|      1|
+------------------+-------+



In [9]:
from pyspark.sql.functions import *