In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from user_agents import parse

In [2]:
# Build (or reuse) a local Spark session running on two worker threads.
# Spark properties live under the ``spark.`` prefix: the driver heap size is
# ``spark.driver.memory``. A bare ``driver.memory`` key is not a property Spark
# reads, so the 8g request would have had no effect.
# NOTE(review): the driver memory setting can only apply if no driver JVM is
# already running in this kernel when the session is created - confirm.
spark = (
    SparkSession.builder
    .appName('4_1')
    .master('local[2]')
    .config('spark.driver.memory', '8g')
    .getOrCreate()
)

In [3]:
# Location of the raw input, relative to the notebook's working directory.
DATASET_PATH = '../dataset1'

# Read the input as an RDD of text lines (one record per line).
rdd = spark.sparkContext.textFile(DATASET_PATH)

In [4]:
# Sanity check: number of records held in `rdd`.
rdd.count()

1977

In [5]:
# Peek at the first three records to see what the raw input looks like.
rdd.take(3)

[u'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
 u'Mozilla/5.0 (Linux; Android 7.0; SAMSUNG SM-G935F Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/5.4 Chrome/51.0.2704.106 Mobile Safari/537.36',
 u'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36']

In [6]:
def _user_agent_row(raw_string):
    """Parse one raw User-Agent string and flatten the result into a Row.

    The Row carries the browser, OS and device attributes exposed by the
    ``user_agents`` parser, plus its boolean classification flags.
    """
    ua = parse(raw_string)
    return Row(
        browser=ua.browser.family,
        browser_version=ua.browser.version_string,
        os=ua.os.family,
        os_version=ua.os.version_string,
        device=ua.device.family,
        device_brand=ua.device.brand,
        device_model=ua.device.model,
        is_mobile=ua.is_mobile,
        is_tablet=ua.is_tablet,
        is_touch_capable=ua.is_touch_capable,
        is_pc=ua.is_pc,
        is_bot=ua.is_bot,
    )


# One parsed Row per input record; the schema is inferred by toDF() and the
# resulting DataFrame is cached because the cells below query it repeatedly.
df = rdd.map(_user_agent_row).toDF().cache()
df.show(5)

+----------------+---------------+----------------+------------+------------+------+---------+-----+---------+----------------+-----------+----------+
|         browser|browser_version|          device|device_brand|device_model|is_bot|is_mobile|is_pc|is_tablet|is_touch_capable|         os|os_version|
+----------------+---------------+----------------+------------+------------+------+---------+-----+---------+----------------+-----------+----------+
|         Firefox|           54.0|           Other|        null|        null| false|    false| true|    false|           false| Windows 10|          |
|Samsung Internet|            5.4|Samsung SM-G935F|     Samsung|    SM-G935F| false|     true|false|    false|            true|    Android|       7.0|
|          Chrome|      59.0.3071|           Other|        null|        null| false|    false| true|    false|           false|Windows 8.1|          |
|          Chrome|      59.0.3071|           Other|        null|        null| false|    false|

In [7]:
# Browser families for records NOT flagged as bots, most frequent first,
# followed by the total number of such records.
non_bot_browsers = df.where('is_bot = false').select('browser')
non_bot_browsers.groupby('browser').count().sort(desc('count')).show()
print('total: {}'.format(non_bot_browsers.count()))

+--------------------+-----+
|             browser|count|
+--------------------+-----+
|              Chrome|  463|
|       Chrome Mobile|  398|
|       Mobile Safari|  251|
|            Facebook|  236|
|                  IE|  115|
|Mobile Safari UI/...|  107|
|    Samsung Internet|   39|
|             Firefox|   33|
|                Edge|   21|
|   Chrome Mobile iOS|   20|
|              Safari|   18|
|              okhttp|   12|
|             Android|   10|
|             Maxthon|    6|
|               Opera|    6|
|        Opera Mobile|    5|
|           CFNetwork|    2|
|              Puffin|    1|
|           Crosswalk|    1|
|           Googlebot|    1|
+--------------------+-----+

total: 1745


In [8]:
# Browser families for records flagged as bots, most frequent first,
# followed by the total number of such records.
bot_browsers = df.where('is_bot = true').select('browser')
bot_browsers.groupby('browser').count().sort(desc('count')).show()
print('total: {}'.format(bot_browsers.count()))

+-------------+-----+
|      browser|count|
+-------------+-----+
|    Googlebot|  218|
|    YandexBot|    7|
| Yahoo! Slurp|    4|
|AdsBot-Google|    2|
|       okhttp|    1|
+-------------+-----+

total: 232


In [9]:
# Five most common (browser, version) pairs among PC clients, restricted to a
# fixed set of browser families.
# NOTE(review): the list matches the five most frequent non-bot families shown
# earlier in this notebook - confirm that is the intended selection.
pc_browser_families = ['Chrome', 'Chrome Mobile', 'Mobile Safari', 'Facebook', 'IE']

# A plain groupBy replaces the earlier rollup(...).count().dropna(): rollup
# adds per-browser subtotal rows and a grand-total row (with null keys) that
# dropna() then threw away again. The trailing dropna() is kept so any group
# whose key is genuinely null is still discarded, exactly as before.
(
    df.where('is_pc = true')
    .where(col('browser').isin(pc_browser_families))
    .groupBy('browser', 'browser_version')
    .count()
    .dropna()
    .sort(desc('count'))
    .show(5)
)

+-------+---------------+-----+
|browser|browser_version|count|
+-------+---------------+-----+
| Chrome|      59.0.3071|  339|
|     IE|           11.0|  103|
| Chrome|      49.0.2623|   39|
| Chrome|      60.0.3112|   32|
| Chrome|      58.0.3029|    7|
+-------+---------------+-----+
only showing top 5 rows



In [10]:
# Five most common (browser, version) pairs among mobile clients, restricted
# to a fixed set of browser families.
# NOTE(review): the list matches the five most frequent non-bot families shown
# earlier in this notebook - confirm that is the intended selection.
mobile_browser_families = ['Chrome', 'Chrome Mobile', 'Mobile Safari', 'Facebook', 'IE']

# A plain groupBy replaces the earlier rollup(...).count().dropna(): rollup
# adds per-browser subtotal rows and a grand-total row (with null keys) that
# dropna() then threw away again. The trailing dropna() is kept so any group
# whose key is genuinely null is still discarded, exactly as before.
(
    df.where('is_mobile = true')
    .where(col('browser').isin(mobile_browser_families))
    .groupBy('browser', 'browser_version')
    .count()
    .dropna()
    .sort(desc('count'))
    .show(5)
)

+-------------+---------------+-----+
|      browser|browser_version|count|
+-------------+---------------+-----+
|Chrome Mobile|      59.0.3071|  252|
|Mobile Safari|           10.0|  143|
|     Facebook|        134.0.0|  133|
|Chrome Mobile|      58.0.3029|   35|
|Mobile Safari|         10.3.2|   26|
+-------------+---------------+-----+
only showing top 5 rows



In [14]:
# Ten most common device brands among mobile clients whose OS is Android.
android_mobile = df.where('is_mobile = true and os = "Android"')
android_brand_counts = android_mobile.groupby('device_brand').count()
android_brand_counts.sort(desc('count')).show(10)

+---------------+-----+
|   device_brand|count|
+---------------+-----+
|        Samsung|  224|
|           Asus|   85|
|Generic_Android|   84|
|            HTC|   82|
|         XiaoMi|   28|
|   SonyEricsson|   22|
|           Sony|   13|
|             LG|   12|
|           Oppo|    9|
|           vivo|    5|
+---------------+-----+
only showing top 10 rows



In [13]:
# Operating-system families seen on mobile clients, most frequent first
# (at most ten rows shown).
mobile_os_counts = df.where('is_mobile = true').groupby('os').count()
mobile_os_counts.sort(desc('count')).show(10)

+---------+-----+
|       os|count|
+---------+-----+
|  Android|  572|
|      iOS|  457|
|    Linux|    1|
|Windows 7|    1|
+---------+-----+



In [12]:
# Operating-system families seen on PC clients, most frequent first
# (at most ten rows shown).
pc_os_counts = df.where('is_pc = true').groupby('os').count()
pc_os_counts.sort(desc('count')).show(10)

+-----------+-----+
|         os|count|
+-----------+-----+
|  Windows 7|  367|
| Windows 10|  154|
| Windows XP|   52|
|Windows 8.1|   29|
|   Mac OS X|   26|
|      Linux|    8|
|  Windows 8|    6|
|     Ubuntu|    2|
+-----------+-----+

