In [1]:
# Prepare the PySpark environment through the local helper module before
# anything Spark-related is used.
import spark_setup
spark_setup.setup_pyspark_env()
# NOTE(review): presumably spark_utils depends on the environment prepared
# above, which would explain why it is imported only afterwards - confirm.
import spark_utils

In [2]:
%%time
# Obtain the SparkContext that every Spark cell below relies on
# (the recorded run took about 15 s of wall time).
sc = spark_utils.get_spark_context()

Ambari - http://10.0.1.21:8080
All Applications - http://10.0.1.23:8088/cluster
CPU times: user 12 ms, sys: 16 ms, total: 28 ms
Wall time: 15.1 s


In [11]:
# HDFS client used further down to delete output directories before they
# are rewritten.
# NOTE(review): the endpoint and user are hard-coded, and this cell's
# execution count (11) is out of sequence - re-check it under
# Restart & Run All.
from hdfs import InsecureClient
hdfs_client = InsecureClient("http://cluster1:50070", user='hdfs')

# Fake big data

In [4]:
import random
import string

In [5]:
import numpy as np

In [6]:
def generate_string(length):
    """Return `length` bytes of pseudo-random payload.

    The payload is the raw memory of float64 samples drawn with
    np.random.random (8 bytes per sample). Earlier variants - a constant
    "A" * length string and a per-character random.choice join - were
    commented out in favour of this one.

    Parameters
    ----------
    length : int
        Requested payload size in bytes.

    Returns
    -------
    str on Python 2, bytes on Python 3
        Exactly `length` bytes (empty for length <= 0).
    """
    # Round up to whole float64 values so that a length which is not a
    # multiple of 8 is not silently shortened; the slice trims the excess.
    # Floor division keeps the shape an int even under true division.
    n_doubles = max(0, -(-length // 8))
    # tobytes() is the supported spelling of the deprecated tostring().
    return np.random.random((n_doubles,)).tobytes()[:max(0, length)]

In [7]:
%%time
# Time the generation of a single 1 MiB payload and confirm its length.
_ = generate_string(1024*1024)
print len(_)

1048576
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.03 ms


In [8]:
import cPickle
# Time pickling one 1 MiB payload (this timing also includes generating it),
# then time unpickling the result.
%time dump = cPickle.dumps(generate_string(1024*1024))
%time _ = cPickle.loads(dump)

CPU times: user 60 ms, sys: 4 ms, total: 64 ms
Wall time: 66.1 ms
CPU times: user 12 ms, sys: 4 ms, total: 16 ms
Wall time: 17.5 ms


In [9]:
# Sizing of the synthetic data set: mapper emits
# N_KEYS * MB_PER_KEY / N_JOBS one-MiB records per job, i.e.
# N_KEYS * MB_PER_KEY MiB in total across the N_JOBS jobs.
# Keys are drawn at random from 1..N_KEYS.
N_KEYS = 10
MB_PER_KEY = 1000
# Number of input elements and of partitions handed to sc.parallelize.
N_JOBS = 100

def mapper(x):
    """Generate this job's share of (key, payload) records.

    The argument `x` (the element handed over by flatMap) is not used.
    Each call yields N_KEYS * MB_PER_KEY / N_JOBS records; every record
    pairs a key drawn uniformly from 1..N_KEYS (inclusive) with a fresh
    payload of 1024 * 1024 bytes from generate_string.
    """
    payload_size = 1024 * 1024
    # Integer constants: floor division matches the Python 2 `/` result.
    records_per_job = N_KEYS * MB_PER_KEY // N_JOBS
    for _ in xrange(records_per_job):
        record_key = random.randint(1, N_KEYS)
        yield record_key, generate_string(payload_size)

# N_JOBS elements spread over N_JOBS partitions; flatMap runs mapper on
# each element to produce the (key, payload) records.
# NOTE(review): the persist() cell below is commented out, so presumably
# every action regenerates different random data - confirm this is intended
# when comparing the pickle and Parquet runs.
rdd = (
    sc
    .parallelize(range(N_JOBS), N_JOBS)
    .flatMap(mapper)
)

In [None]:
# from pyspark import StorageLevel
# rdd.persist(StorageLevel.DISK_ONLY)

# Pickle and Python

In [14]:
# Remove the output of any previous run before writing the pickle file again.
hdfs_client.delete("/user/ubuntu/bigDataPickles", recursive=True)

True

In [15]:
# Earlier measurement: 3min 27s - 97.7 GB - 471 MB/sec.
# NOTE(review): the output recorded below (28.6 s wall time, 9.8 G on HDFS)
# comes from a smaller run; presumably the figures above were taken with a
# larger MB_PER_KEY - confirm or refresh them.
%time rdd.saveAsPickleFile("hdfs:///user/ubuntu/bigDataPickles")

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 28.6 s


In [16]:
# Report the total size of the pickle output on HDFS.
! hadoop fs -du -s -h hdfs:///user/ubuntu/bigDataPickles

9.8 G  hdfs:///user/ubuntu/bigDataPickles


In [25]:
%%time
# Read the pickled (key, payload) records back, group them by key and keep
# the largest payload of every key; the collected result is discarded.
_ = (
    sc.pickleFile("hdfs:///user/ubuntu/bigDataPickles")
    .groupByKey()
    .mapValues(max)
    .collect()
)

CPU times: user 36 ms, sys: 28 ms, total: 64 ms
Wall time: 51.4 s


# Spark SQL

In [18]:
import pandas as pd
from pyspark.sql import SparkSession, Row

# Wrap the existing SparkContext in a SparkSession for the DataFrame API.
ss = SparkSession(sc)

# Turn every (key, payload) pair of the RDD into a Row with columns
# `key` and `value`, and let Spark infer the schema.
# NOTE(review): the printed schema shows `value` inferred as string although
# the payload is raw bytes; this may be why the Parquet output (15.2 G) is
# larger than the pickle output (9.8 G) - confirm, and consider a binary
# column if the comparison is meant to be like-for-like.
df = ss.createDataFrame(
    rdd.map(lambda pair: Row(key=pair[0], value=pair[1]))
)

df.printSchema()

root
 |-- key: long (nullable = true)
 |-- value: string (nullable = true)



In [19]:
# Fair comparison: switch Parquet compression off before measuring the
# Parquet write/read against the pickle file.
ss.conf.set("spark.sql.parquet.compression.codec", "uncompressed")

In [20]:
# Remove the output of any previous run before writing the Parquet data again.
hdfs_client.delete("/user/ubuntu/bigData.parquet", recursive=True)

True

In [21]:
%%time
# Background reading on Parquet:
# http://events.linuxfoundation.org/sites/events/files/slides/ApacheCon%20BigData%20Europe%202016%20-%20Parquet%20in%20Practice%20%26%20Detail_0.pdf
# No format is passed to save(); NOTE(review): this relies on the session's
# default data source being Parquet - confirm for this cluster.
df.write.save("hdfs:///user/ubuntu/bigData.parquet")

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 46.5 s


In [22]:
# Report the total size of the Parquet output on HDFS.
! hadoop fs -du -s -h hdfs:///user/ubuntu/bigData.parquet

15.2 G  hdfs:///user/ubuntu/bigData.parquet


In [24]:
%%time
# DataFrame counterpart of the pickle/RDD aggregation: read the Parquet data
# back and compute the maximum `value` per `key`; the collected result is
# discarded.
_ = (
    ss.read.parquet("hdfs:///user/ubuntu/bigData.parquet")
    .groupby("key")
    .agg({"value": "max"})
    .collect()
)

CPU times: user 84 ms, sys: 108 ms, total: 192 ms
Wall time: 24.2 s


# Dumb local job

In [3]:
%%time
import numpy as np

def slow_mapper(x):
    """Return np.sum(x), deliberately recomputed 300 times.

    The repetition has no effect on the result; it only makes each call
    expensive enough to serve as a CPU-bound benchmark workload.
    """
    repeats = 300
    total = 0
    for _ in xrange(repeats):
        # Overwrites (does not accumulate): the final value is a single sum.
        total = np.sum(x)
    return total

# Local baseline: apply slow_mapper to each of the 100000 rows (1000 random
# floats per row) in this process.
# NOTE(review): despite its name, `count` receives the per-row results of
# slow_mapper, not a count.
count = map(slow_mapper, np.random.random((100000,1000)))

CPU times: user 1min 27s, sys: 348 ms, total: 1min 28s
Wall time: 1min 29s


# Dumb Spark job

In [4]:
%%time
# Same workload as the local baseline, distributed with Spark: one RDD
# element per row, slow_mapper applied to each, count() as the action that
# triggers the computation.
sc.parallelize(np.random.random((100000,1000)))\
    .map(slow_mapper)\
    .count()

CPU times: user 1.42 s, sys: 1.33 s, total: 2.75 s
Wall time: 22 s


100000

# Simple MapReduce

In [26]:
rdd = (
    sc
   .parallelize(["this is text", "text too"])
   .flatMap(lambda x: [(w, 1) for w in x.split()])
   .reduceByKey(lambda a, b: a + b))
print rdd
print rdd.collect()

PythonRDD[51] at RDD at PythonRDD.scala:48
[('text', 2), ('this', 1), ('too', 1), ('is', 1)]


# Broadcast + Accumulator example

In [27]:
# Read-only word -> id dictionary shared with the workers as a broadcast
# variable.
bc = sc.broadcast({"this": 0, "is": 1, "text": 2})
# Accumulator, starting at 0, that mapper increments for every word missing
# from the dictionary.
errors = sc.accumulator(0)

def mapper(x):
    """Yield (word_id, 1) for every known word of the line `x`.

    Each whitespace-separated word is looked up in the broadcast dictionary
    `bc`. A word that is not in it yields nothing and instead increments
    the `errors` accumulator.

    NOTE(review): this redefines `mapper`, replacing the big-data generator
    of the same name defined earlier in the notebook.
    """
    global errors
    for word in x.split():
        if word not in bc.value:
            errors += 1
            continue
        yield (bc.value[word], 1)

# Count occurrences per dictionary id; words missing from the broadcast
# dictionary are tallied in `errors` by mapper instead of being emitted.
rdd = (
    sc
   .parallelize(["this is text", "text too"])
   .flatMap(mapper)
   .reduceByKey(lambda a, b: a + b))
print rdd
print rdd.collect()
# The accumulator is read only after collect() has run the job.
print "errors:", errors.value

PythonRDD[57] at RDD at PythonRDD.scala:48
[(0, 1), (1, 1), (2, 2)]
errors: 1


# DataFrame API example

In [28]:
import pandas as pd
from pyspark.sql import SparkSession

# Wrap the existing SparkContext in a SparkSession for the DataFrame/SQL
# example (the Spark SQL section above creates `ss` the same way).
ss = SparkSession(sc)

In [29]:
# Small pandas frame: one row per observation, `cnt` holding a list of counts.
df = pd.DataFrame(
    {
        "name": ["cat", "cat", "dog"],
        "cnt": [[1, 1], [2], [1]],
    },
    columns=["name", "cnt"],
)

In [30]:
# Display the pandas frame before it is handed to Spark.
df

Unnamed: 0,name,cnt
0,cat,"[1, 1]"
1,cat,[2]
2,dog,[1]


In [31]:
# Convert the pandas frame into a Spark DataFrame; the schema is inferred.
sdf = ss.createDataFrame(df)

In [32]:
# Show the inferred schema (`cnt` comes out as an array of longs).
sdf.printSchema()

root
 |-- name: string (nullable = true)
 |-- cnt: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [33]:
# Expose the DataFrame to Spark SQL under the name "animals".
# createOrReplaceTempView is the replacement for registerTempTable, which
# has been deprecated since Spark 2.0.
sdf.createOrReplaceTempView("animals")

In [34]:
# Flatten every `cnt` array into one row per element with explode(), then
# sum the elements per animal name and return the result as a pandas frame.
ss.sql("""
select name, sum(cnt) as sum
from 
    (select name, explode(cnt) as cnt
     from animals)
group by name
""").toPandas()

Unnamed: 0,name,sum
0,dog,1
1,cat,4


# Kill workers

In [35]:
# Shut the SparkContext down; Spark cells cannot be re-run after this
# without obtaining a new context.
sc.stop()