In [1]:
# Prepare the PySpark environment first. spark_utils is imported only after
# setup_pyspark_env() has run -- presumably because it needs pyspark to be
# importable (TODO confirm); keep this statement order.
import spark_setup
spark_setup.setup_pyspark_env()
import spark_utils

In [2]:
%%time
# Obtain the SparkContext used by every Spark cell below (stopped in the last cell).
sc = spark_utils.get_spark_context()

Ambari - http://10.0.1.21:8080
All Applications - http://10.0.1.23:8088/cluster
CPU times: user 20 ms, sys: 8 ms, total: 28 ms
Wall time: 19.9 s


# Dumb local job (single-process baseline)

In [3]:
%%time
import numpy as np

def slow_mapper(x, repeats=300):
    """Return ``np.sum(x)`` after deliberately recomputing it ``repeats`` times.

    Every pass computes the same sum and overwrites the previous result, so
    the loop exists only to make each call expensive enough for the
    local-vs-Spark timing comparison.

    Parameters
    ----------
    x : array-like
        Values to sum (anything ``np.sum`` accepts).
    repeats : int, optional
        Number of redundant passes. Defaults to 300, the value that used to
        be hard-coded. With ``repeats <= 0`` no sum is computed.

    Returns
    -------
    The value of ``np.sum(x)``, or ``0`` when no pass ran.
    """
    s = 0
    # ``range`` instead of ``xrange``: equivalent here on Python 2 and also
    # valid on Python 3, where ``xrange`` no longer exists.
    for _ in range(repeats):
        s = np.sum(x)
    return s

# Run slow_mapper over every row of a random 100000 x 1000 matrix in this
# single process. list(...) forces the evaluation: it is a no-op copy on
# Python 2 (where map is already eager) and keeps the timing meaningful on
# Python 3, where map alone would be lazy and do no work.
# NOTE: despite its name, `count` holds the list of per-row sums.
count = list(map(slow_mapper, np.random.random((100000,1000))))

CPU times: user 1min 32s, sys: 216 ms, total: 1min 32s
Wall time: 1min 33s


# Dumb Spark job (same workload, run through Spark)

In [4]:
%%time
# Same workload as the local job: the rows of a random 100000 x 1000 matrix
# are handed to Spark, slow_mapper is applied to each row, and the number of
# results is returned as the cell's value.
(sc
 .parallelize(np.random.random((100000, 1000)))
 .map(slow_mapper)
 .count())

CPU times: user 1.39 s, sys: 1.18 s, total: 2.57 s
Wall time: 16.8 s


100000

# Simple MapReduce

In [5]:
# Word count: turn each line into (word, 1) pairs, then add up the 1s per word.
rdd = (
    sc
    .parallelize(["this is text", "text too"])
    .flatMap(lambda x: [(w, 1) for w in x.split()])
    .reduceByKey(lambda a, b: a + b))
# Single-argument print(...) produces the same output as the Python 2 print
# statement and is also valid Python 3.
print(rdd)            # repr of the RDD object, not its contents
print(rdd.collect())  # the (word, count) pairs brought back to the driver

PythonRDD[7] at RDD at PythonRDD.scala:48
[('text', 2), ('too', 1), ('is', 1), ('this', 1)]


# Broadcast + Accumulator example

In [6]:
# Word -> integer lookup table, wrapped in a broadcast variable; mapper() reads it via bc.value.
bc = sc.broadcast({"this": 0, "is": 1, "text": 2})
# Accumulator starting at 0; mapper() adds 1 for every word missing from the lookup table.
errors = sc.accumulator(0)

def mapper(x):
    """Yield ``(bc.value[word], 1)`` for each known word of the line ``x``.

    ``x`` is split on whitespace. Words present in the broadcast lookup
    table ``bc`` are emitted as ``(mapped_value, 1)`` pairs; every other
    word produces no pair and adds 1 to the ``errors`` accumulator.

    NOTE(review): per the Spark programming guide, accumulator updates made
    inside transformations can be applied more than once if a task is
    re-executed -- treat ``errors`` as a diagnostic counter.
    """
    for token in x.split():
        if token not in bc.value:
            # Unknown word: count it and emit nothing.
            errors.add(1)
            continue
        yield (bc.value[token], 1)

# Same word count as before, but keyed by the broadcast lookup value; words
# missing from the lookup are tallied in the `errors` accumulator by mapper().
rdd = (
    sc
    .parallelize(["this is text", "text too"])
    .flatMap(mapper)
    .reduceByKey(lambda a, b: a + b))
# Single-argument print(...) produces the same output as the Python 2 print
# statement and is also valid Python 3.
print(rdd)            # repr of the RDD object, not its contents
print(rdd.collect())
# Read the accumulator only after collect() has run the job.
print("errors: {}".format(errors.value))

PythonRDD[13] at RDD at PythonRDD.scala:48
[(0, 1), (1, 1), (2, 2)]
errors: 1


# DataFrame API example

In [7]:
import pandas as pd
from pyspark.sql import SparkSession

# SparkSession used by the DataFrame / SQL cells; getOrCreate() returns an
# existing session when there is one instead of building a second.
ss = SparkSession.builder.appName("spark sql example").getOrCreate()

In [8]:
# Toy data, written column by column: an animal name and a list of counts per row.
df = pd.DataFrame(
    {
        "name": ["cat", "cat", "dog"],
        "cnt": [[1, 1], [2], [1]],
    },
    columns=["name", "cnt"])

In [9]:
# Show the pandas frame before it is converted to a Spark DataFrame.
df

Unnamed: 0,name,cnt
0,cat,"[1, 1]"
1,cat,[2]
2,dog,[1]


In [10]:
# Convert the pandas frame into a Spark DataFrame; no schema is passed here.
sdf = ss.createDataFrame(df)

In [11]:
# Print the column names and types Spark assigned to the converted frame.
sdf.printSchema()

root
 |-- name: string (nullable = true)
 |-- cnt: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [12]:
# Make the DataFrame queryable from Spark SQL under the name "animals".
# createOrReplaceTempView is the replacement for registerTempTable, which has
# been deprecated since Spark 2.0 (this notebook already uses SparkSession).
sdf.createOrReplaceTempView("animals")

In [13]:
# Flatten each row's `cnt` array with explode(), sum the values per name, and
# return the result as a pandas frame. The query has no ORDER BY, so the row
# order of the result is not fixed.
ss.sql("""
select name, sum(cnt) as sum
from 
    (select name, explode(cnt) as cnt
     from animals)
group by name
""").toPandas()

Unnamed: 0,name,sum
0,dog,1
1,cat,4


# Kill workers (stop the SparkContext)

In [14]:
# Shut down the SparkContext `sc`; run this cell last.
sc.stop()