In [1]:
import spark_setup
spark_setup.setup_pyspark_env()
import spark_utils

In [2]:
%%time
sc = spark_utils.get_spark_context()

Ambari - http://10.0.1.21:8080
All Applications - http://10.0.1.23:8088/cluster
CPU times: user 20 ms, sys: 4 ms, total: 24 ms
Wall time: 18 s


# Fake big data

In [3]:
import random
import string

In [4]:
import numpy as np

In [5]:
def generate_string(length):
    # return "A" * length
    # return ''.join(random.choice(string.letters) for _ in xrange(length))
    return np.random.random((length / 8,)).tostring()

In [6]:
%%time
_ = generate_string(1024*1024)
print len(_)

1048576
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 2.81 ms


In [7]:
import cPickle
%time dump = cPickle.dumps(generate_string(1024*1024))
%time _ = cPickle.loads(dump)

CPU times: user 60 ms, sys: 8 ms, total: 68 ms
Wall time: 71.3 ms
CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 17.8 ms


In [8]:
N_KEYS = 100
MB_PER_KEY = 1000
N_JOBS = 100

def mapper(x):
    for i in xrange(N_KEYS * MB_PER_KEY / N_JOBS):
        key = random.randint(1, N_KEYS)
        yield key, generate_string(1024 * 1024)

rdd = (
    sc
    .parallelize(range(N_JOBS), N_JOBS)
    .flatMap(mapper)
)

In [None]:
# from pyspark import StorageLevel
# rdd.persist(StorageLevel.DISK_ONLY)

# Pickle and Python

In [None]:
! hadoop fs -rm -r -skipTrash hdfs:///user/ubuntu/bigDataPickles

In [10]:
# 3min 27s - 97.7 GB - 471 MB/sec
%time rdd.saveAsPickleFile("hdfs:///user/ubuntu/bigDataPickles")

CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 3min 27s


In [11]:
! hadoop fs -du -s -h hdfs:///user/ubuntu/bigDataPickles

97.7 G  hdfs:///user/ubuntu/bigDataPickles


In [12]:
%%time
(
    sc.pickleFile("hdfs:///user/ubuntu/bigDataPickles")
    .groupByKey()
    .map(lambda (x, y): (x, max(y)))
    .collect()
)

CPU times: user 136 ms, sys: 108 ms, total: 244 ms
Wall time: 18min 1s


# Spark SQL

In [15]:
import pandas as pd
from pyspark.sql import SparkSession, Row

ss = (SparkSession
      .builder
      .appName("spark sql example")
      .getOrCreate())

df = ss.createDataFrame(
    rdd
    .map(lambda (x, y): Row(key=x, value=y))
)

df.printSchema()

root
 |-- key: long (nullable = true)
 |-- value: string (nullable = true)



In [18]:
# faster with uncompressed this time
ss.conf.set("spark.sql.parquet.compression.codec", "uncompressed")

In [20]:
! hadoop fs -rm -r -skipTrash hdfs:///user/ubuntu/bigData.parquet

Deleted hdfs:///user/ubuntu/bigData.parquet


In [None]:
%%time
# 5 min
# http://events.linuxfoundation.org/sites/events/files/slides/ApacheCon%20BigData%20Europe%202016%20-%20Parquet%20in%20Practice%20%26%20Detail_0.pdf
df.write.save("hdfs:///user/ubuntu/bigData.parquet")

In [22]:
! hadoop fs -du -s -h hdfs:///user/ubuntu/bigData.parquet

148.8 G  hdfs:///user/ubuntu/bigData.parquet
97.7 G  hdfs:///user/ubuntu/bigDataPickles


In [27]:
%%time
(
    ss.read.parquet("hdfs:///user/ubuntu/bigData.parquet")
    .groupby("key")
    .agg({"value": "max"})
    .collect()
)

CPU times: user 900 ms, sys: 308 ms, total: 1.21 s
Wall time: 16min


# Dumb local job

In [3]:
%%time
import numpy as np

def slow_mapper(x):
    s = 0
    for i in xrange(300):
        s = np.sum(x)
    return s

count = map(slow_mapper, np.random.random((100000,1000)))

CPU times: user 1min 27s, sys: 348 ms, total: 1min 28s
Wall time: 1min 29s


# Dumb Spark job

In [4]:
%%time
sc.parallelize(np.random.random((100000,1000)))\
    .map(slow_mapper)\
    .count()

CPU times: user 1.42 s, sys: 1.33 s, total: 2.75 s
Wall time: 22 s


100000

# Simple MapReduce

In [5]:
rdd = (
    sc
   .parallelize(["this is text", "text too"])
   .flatMap(lambda x: [(w, 1) for w in x.split()])
   .reduceByKey(lambda a, b: a + b))
print rdd
print rdd.collect()

PythonRDD[7] at RDD at PythonRDD.scala:48
[('text', 2), ('this', 1), ('too', 1), ('is', 1)]


# Broadcast + Accumulator example

In [6]:
bc = sc.broadcast({"this": 0, "is": 1, "text": 2})
errors = sc.accumulator(0)

def mapper(x):
    global errors
    for w in x.split():
        if w in bc.value:
            yield (bc.value[w], 1)
        else:
            errors += 1

rdd = (
    sc
   .parallelize(["this is text", "text too"])
   .flatMap(mapper)
   .reduceByKey(lambda a, b: a + b))
print rdd
print rdd.collect()
print "errors:", errors.value

PythonRDD[13] at RDD at PythonRDD.scala:48
[(0, 1), (1, 1), (2, 2)]
errors: 1


# DataFrame API example

In [10]:
import pandas as pd
from pyspark.sql import SparkSession

ss = (SparkSession
      .builder
      .appName("spark sql example")
      .getOrCreate())

In [8]:
df = pd.DataFrame(
    [["cat", [1, 1]], ["cat", [2]], ["dog", [1]]], 
    columns=["name", "cnt"])

In [9]:
df

Unnamed: 0,name,cnt
0,cat,"[1, 1]"
1,cat,[2]
2,dog,[1]


In [10]:
sdf = ss.createDataFrame(df)

In [11]:
sdf.printSchema()

root
 |-- name: string (nullable = true)
 |-- cnt: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [12]:
sdf.registerTempTable("animals")

In [13]:
ss.sql("""
select name, sum(cnt) as sum
from 
    (select name, explode(cnt) as cnt
     from animals)
group by name
""").toPandas()

Unnamed: 0,name,sum
0,dog,1
1,cat,4


# Kill workers

In [14]:
sc.stop()