In [1]:
import spark_setup
# Prepare the PySpark environment before spark_utils is imported.
# NOTE(review): presumably spark_utils relies on that environment at import
# time, which is why the import comes after the call — confirm.
spark_setup.setup_pyspark_env()
import spark_utils

In [2]:
%%time
# Obtain the SparkContext from the project helper. In the recorded run this
# took ~30 s of wall time and printed the Ambari and YARN application URLs.
sc = spark_utils.get_spark_context()

Ambari - http://10.0.1.21:8080
All Applications - http://10.0.1.23:8088/cluster
CPU times: user 20 ms, sys: 8 ms, total: 28 ms
Wall time: 29.6 s


In [3]:
import pandas as pd
from pyspark.sql import SparkSession

# Wrap the existing SparkContext in a SparkSession; `ss.read` is used below
# to load the CSV and Parquet data as DataFrames.
ss = SparkSession(sc)

In [4]:
from hdfs import InsecureClient
# WebHDFS client for the endpoint on cluster1:50070, acting as user 'hdfs'.
hdfs_client = InsecureClient("http://cluster1:50070", user='hdfs')

# Load data to HDFS

In [5]:
import time

def timeit(method):
    """Decorator that prints how long each call to ``method`` took.

    After the wrapped call returns, one line is printed containing the
    function name, the positional arguments, the keyword arguments and the
    elapsed wall-clock time in seconds (measured with ``time.time()``).
    The return value of ``method`` is passed through unchanged. If the call
    raises, the exception propagates and no timing line is printed.

    Args:
        method: the callable to time.

    Returns:
        A wrapper that accepts the same arguments as ``method`` and keeps its
        ``__name__`` and ``__doc__``.
    """
    # Local import so this cell does not depend on imports made elsewhere.
    import functools

    @functools.wraps(method)  # keep the decorated function's metadata
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('%r (%r, %r) %2.2f sec' %
              (method.__name__, args, kw, te - ts))
        return result
    return timed

In [None]:
# Recursively remove /task1 so the upload below starts from a clean target.
# NOTE(review): the recorded result `False` presumably means nothing existed
# at that path — confirm against the hdfs client documentation.
hdfs_client.delete("/task1", recursive=True)

False

In [None]:
%%time
import subprocess

@timeit
def unzip_to_hdfs(fn):
    """Stream one zip archive from /data into HDFS under /task1.

    ``unzip -p`` writes the archive contents to a pipe and
    ``hadoop fs -put -`` reads that pipe from stdin. The two commands are
    started without a shell, so the file name is never interpreted by one,
    and the exit status of *both* commands is checked. With a shell pipeline
    only the status of ``hadoop`` would be seen, and a failing ``unzip``
    would still leave an empty or partial file in HDFS.

    Args:
        fn: archive file name inside /data, e.g. "events.csv.zip". The HDFS
            target name is ``fn`` with a trailing ".zip" removed.

    Raises:
        subprocess.CalledProcessError: if ``unzip`` or ``hadoop`` exits with
            a non-zero status.
    """
    suffix = ".zip"
    # Strip only the trailing extension, not every ".zip" inside the name.
    fn_out = fn[:-len(suffix)] if fn.endswith(suffix) else fn

    unzip_cmd = ["unzip", "-p", "/data/{0}".format(fn)]
    put_cmd = ["hadoop", "fs", "-put", "-", "/task1/{0}".format(fn_out)]

    unzip = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
    try:
        # check_output raises CalledProcessError if hadoop fails.
        put_output = subprocess.check_output(put_cmd, stdin=unzip.stdout)
    finally:
        # Close our copy of the pipe before waiting, so unzip cannot block
        # forever on a pipe nobody reads if hadoop exited early.
        unzip.stdout.close()
        unzip_rc = unzip.wait()
    if unzip_rc != 0:
        raise subprocess.CalledProcessError(unzip_rc, " ".join(unzip_cmd))
    print(put_output)
    
# Archives to upload; unzip_to_hdfs looks each name up under /data.
fns = [
    "clicks_test.csv.zip",
    "clicks_train.csv.zip",
    "documents_categories.csv.zip",
    "documents_entities.csv.zip",
    "documents_meta.csv.zip",
    "documents_topics.csv.zip",
    "events.csv.zip",
    "page_views.csv.zip",
    "page_views_sample.csv.zip",
    "promoted_content.csv.zip",
    "sample_submission.csv.zip"
]

# Upload one archive at a time; the timeit decorator on unzip_to_hdfs prints
# one timing line per archive.
for fn in fns:
    unzip_to_hdfs(fn)


'unzip_to_hdfs' (('clicks_test.csv.zip',), {}) 7.98 sec

'unzip_to_hdfs' (('clicks_train.csv.zip',), {}) 21.13 sec

'unzip_to_hdfs' (('documents_categories.csv.zip',), {}) 3.34 sec

'unzip_to_hdfs' (('documents_entities.csv.zip',), {}) 6.14 sec

'unzip_to_hdfs' (('documents_meta.csv.zip',), {}) 2.91 sec

'unzip_to_hdfs' (('documents_topics.csv.zip',), {}) 6.78 sec

'unzip_to_hdfs' (('events.csv.zip',), {}) 23.32 sec

'unzip_to_hdfs' (('page_views.csv.zip',), {}) 1358.40 sec

'unzip_to_hdfs' (('page_views_sample.csv.zip',), {}) 8.06 sec

'unzip_to_hdfs' (('promoted_content.csv.zip',), {}) 2.25 sec

'unzip_to_hdfs' (('sample_submission.csv.zip',), {}) 7.14 sec
CPU times: user 168 ms, sys: 84 ms, total: 252 ms
Wall time: 24min 7s


In [None]:
# Summarised (-s), human-readable (-h) size of everything under /task1.
! hadoop fs -du -s -h /task1

92.9 G  /task1


In [None]:
# The files were written on the cluster1 node only, so HDFS needs to be rebalanced across the cluster.

In [None]:
# Raise the balancer bandwidth setting to 1000000000 before rebalancing.
# NOTE(review): the value is presumably in bytes per second — confirm against the hdfs dfsadmin documentation.
! hdfs dfsadmin -setBalancerBandwidth 1000000000

Balancer bandwidth is set to 1000000000


In [None]:
%%time
# Run the HDFS balancer with threshold 5; stdout and stderr both go to
# balancer.log. The recorded run took about 6 minutes.
! hdfs balancer -threshold 5 > balancer.log 2>&1

CPU times: user 6.48 s, sys: 2.22 s, total: 8.7 s
Wall time: 6min 21s


# Read example

In [None]:
# Load the page-views CSV, taking column names from the header row. No schema
# is supplied, so every column is read as a string (see the dtypes output
# below).
pvdf = ss.read.csv("/task1/page_views.csv", header=True)

In [None]:
# Column names and types of the CSV-backed frame.
pvdf.dtypes

[('uuid', 'string'),
 ('document_id', 'string'),
 ('timestamp', 'string'),
 ('platform', 'string'),
 ('geo_location', 'string'),
 ('traffic_source', 'string')]

In [None]:
# Preview the first 5 rows.
pvdf.show(5)

+--------------+-----------+---------+--------+------------+--------------+
|          uuid|document_id|timestamp|platform|geo_location|traffic_source|
+--------------+-----------+---------+--------+------------+--------------+
|1fd5f051fba643|        120| 31905835|       1|          RS|             2|
|8557aa9004be3b|        120| 32053104|       1|       VN>44|             2|
|c351b277a358f0|        120| 54013023|       1|       KR>12|             1|
|8205775c5387f9|        120| 44196592|       1|       IN>16|             2|
|9cb0ccd8458371|        120| 65817371|       1|   US>CA>807|             2|
+--------------+-----------+---------+--------+------------+--------------+
only showing top 5 rows



In [None]:
%%time
# Row count of the CSV-backed frame; the recorded run took about 9 minutes.
pvdf.count()

CPU times: user 76 ms, sys: 20 ms, total: 96 ms
Wall time: 9min 16s


2034275448

# Parquet is faster than CSV

In [None]:
%%time
# Persist the CSV-backed frame as Parquet so later queries can read it instead
# of the CSV. No save mode is passed.
# NOTE(review): re-running this cell presumably fails once the target path
# exists — confirm against the DataFrameWriter documentation.
pvdf.write.parquet("/task1/page_views.parquet")

In [31]:
# Summarised, human-readable size of the Parquet copy.
! hadoop fs -du -s -h /task1/page_views.parquet

47.3 G  /task1/page_views.parquet


In [26]:
%%time
# Open the Parquet copy of the page views as a second DataFrame, used below
# for the comparison against the CSV-backed `pvdf`.
pvdf2 = ss.read.parquet("/task1/page_views.parquet")

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 606 ms


In [28]:
%%time
# Rows per geo_location on the Parquet-backed frame, collected to the driver
# (about 30 s wall time in the recorded run).
pvdf2.groupBy("geo_location").count().collect()

CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 30 s


[Row(geo_location=u'ES>07', count=139257),
 Row(geo_location=u'US>MT>756', count=676540),
 Row(geo_location=u'LT', count=145441),
 Row(geo_location=u'IL>01', count=21174),
 Row(geo_location=u'DZ', count=141209),
 Row(geo_location=u'NL>10', count=58411),
 Row(geo_location=u'CO>02', count=274301),
 Row(geo_location=u'BG>50', count=11883),
 Row(geo_location=u'IE>16', count=106415),
 Row(geo_location=u'US>MS>673', count=849299),
 Row(geo_location=u'US>NY', count=420207),
 Row(geo_location=u'TH>46', count=70526),
 Row(geo_location=u'MA>57', count=28435),
 Row(geo_location=u'AE>05', count=11754),
 Row(geo_location=u'MM', count=5180),
 Row(geo_location=u'ZM>03', count=6435),
 Row(geo_location=u'CM>09', count=734),
 Row(geo_location=u'US>MT>764', count=1270),
 Row(geo_location=u'EC>03', count=14),
 Row(geo_location=u'DZ>42', count=5),
 Row(geo_location=u'US>FL>548', count=7719485),
 Row(geo_location=u'SE>26', count=1134528),
 Row(geo_location=u'BR>08', count=54749),
 Row(geo_location=u'BZ>01',

In [29]:
%%time
# The same aggregation on the CSV-backed frame, for comparison (about 9 min
# wall time in the recorded run).
pvdf.groupBy("geo_location").count().collect()

CPU times: user 84 ms, sys: 12 ms, total: 96 ms
Wall time: 9min 19s


[Row(geo_location=u'LT', count=145441),
 Row(geo_location=u'TH>46', count=70526),
 Row(geo_location=u'US>NY', count=420207),
 Row(geo_location=u'US>MT>756', count=676540),
 Row(geo_location=u'IE>16', count=106415),
 Row(geo_location=u'ES>07', count=139257),
 Row(geo_location=u'US>MS>673', count=849299),
 Row(geo_location=u'DZ', count=141209),
 Row(geo_location=u'IL>01', count=21174),
 Row(geo_location=u'NL>10', count=58411),
 Row(geo_location=u'MA>57', count=28435),
 Row(geo_location=u'CO>02', count=274301),
 Row(geo_location=u'AE>05', count=11754),
 Row(geo_location=u'BG>50', count=11883),
 Row(geo_location=u'ZM>03', count=6435),
 Row(geo_location=u'MM', count=5180),
 Row(geo_location=u'CM>09', count=734),
 Row(geo_location=u'US>MT>764', count=1270),
 Row(geo_location=u'EC>03', count=14),
 Row(geo_location=u'DZ>42', count=5),
 Row(geo_location=u'SE>26', count=1134528),
 Row(geo_location=u'EC>18', count=255995),
 Row(geo_location=u'US>FL>548', count=7719485),
 Row(geo_location=u'BZ>01'