# Imports

In [1]:
import sys
import os
import random
from operator import add, mul
from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles

In [2]:
cwd=os.getcwd()
cwd

'C:\\Users\\byron\\Documents\\GitHub\\pyspark-training'

Create a SparkContext object and name it.

Use SparkContext.getOrCreate() to avoid error:
https://stackoverflow.com/questions/46351951/valueerror-cannot-run-multiple-sparkcontexts-at-once-in-spark-with-pyspark

In [3]:
app_name = 'RDD Hands-on'

In [4]:
conf = SparkConf().setAppName(app_name)
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
sc.applicationId

'local-1576430619577'

# Datasets

In [6]:
textFile_path = ('..\pyspark-training\data\pyspark_classes.txt')
appleStore_path = ('..\pyspark-training\data\AppleStore.csv')
DATA_STR = 'PySpark is the Python API for Spark.'

# Create RDDs
* Parallelized Collection

A RDD can be created using a SparkContext object.

In [7]:
data = DATA_STR.split(' ')
pcoll = sc.parallelize(data)
# collect() is only good for small dataset
pcoll.collect()

['PySpark', 'is', 'the', 'Python', 'API', 'for', 'Spark.']

* From another RDD

In [8]:
rdd_from_rdd = pcoll.map(lambda word: word.upper())
rdd_from_rdd.collect()

['PYSPARK', 'IS', 'THE', 'PYTHON', 'API', 'FOR', 'SPARK.']

* From external data - n the form of files

In [9]:
text_file = sc.textFile(textFile_path)
text_file.filter(lambda line: line != '').collect()

['Public classes:',
 'SparkContext:',
 'Main entry point for Spark functionality.',
 'RDD:',
 'A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.',
 'Broadcast:',
 'A broadcast variable that gets reused across tasks.',
 'Accumulator:',
 'An “add-only” shared variable that tasks can only add values to.',
 'SparkConf:',
 'For configuring Spark.',
 'SparkFiles:',
 'Access files shipped with jobs.',
 'StorageLevel:',
 'Finer-grained cache persistence levels.',
 'TaskContext:',
 'Information about the current running task, avaialble on the workers and experimental.']

In [10]:
# flatMap: split lines in text file; Useful for word count
text_file.flatMap(lambda line: line.split(' ')).filter(lambda word: word!='').take(5)

['Public', 'classes:', 'SparkContext:', 'Main', 'entry']

# RDD Transformations
Transformation is a function that produces new RDD from the existing RDDs but when we want to work with the actual dataset, at that point Action is performed. When the action is triggered after the result, new RDD is not formed like transformation. Spark Transformation is a function hat produces new RDD from the existing RDDs. It takes RDD as input and produces one or more RDD as output. Each time it creates new RDD when we apply any transformation. Thus, the so input RDDs, cannot be changed since RDD are immutable in nature.

Applying transformation built an RDD lineage, with the entire parent RDDs of the final RDD(s). RDD lineage, also known as RDD operator graph or RDD dependency graph. It is a logical execution plan i.e., it is Direct Acyclic Graph (DAG) or the entire parent RDDs of RDD.

Transformations are lazy in nature ie.e, they get executed when we call an action. They are not executed immediately. Two most basic type of transformation is a map(), filter(). After the transformation, the resultant RDD is always different from its parent RDD. It can be smaller (e.g. filter, count, distinct, sample), bigger (e.g. flatMap(), union(), Cartesian()) or the same size (e.g. map).

### map, filter, flatMap, distinct, sortBy, groupBy

In [11]:
rdd = sc.textFile(appleStore_path)
# map: replace comma in string and create a list for each record
lines = rdd.map(lambda line: line.replace(", ", " ")).map(lambda line: line.split(','))
lines.take(2)

[['""',
  '"id"',
  '"track_name"',
  '"size_bytes"',
  '"currency"',
  '"price"',
  '"rating_count_tot"',
  '"rating_count_ver"',
  '"user_rating"',
  '"user_rating_ver"',
  '"ver"',
  '"cont_rating"',
  '"prime_genre"',
  '"sup_devices.num"',
  '"ipadSc_urls.num"',
  '"lang.num"',
  '"vpp_lic"'],
 ['"1"',
  '"281656475"',
  '"PAC-MAN Premium"',
  '100788224',
  '"USD"',
  '3.99',
  '21292',
  '26',
  '4',
  '4.5',
  '"6.3.5"',
  '"4+"',
  '"Games"',
  '38',
  '5',
  '10',
  '1']]

In [12]:
# check number of columns in each record
lines_wrong_parsing = lines.filter(lambda line: len(line) > 17).map(lambda cols: cols[0])
print(lines_wrong_parsing.count())
lines_wrong_parsing.take(5)

34


['"175"', '"289"', '"348"', '"418"', '"554"']

In [13]:
rdd.filter(lambda line: line.startswith('"554"')).collect()

['"554","385285922","乐视视频-白鹿原,欢乐颂,奔跑吧全网热播",184689664,"USD",0,1590,6,4.5,5,"7.1","17+","Entertainment",38,0,2,1']

In [14]:
# filter: getting rid of header and rows can't be parsed correctly
lines = lines.filter(lambda cols: cols[0] != '""').filter(lambda line: len(line) == 17)
lines.count()

7163

In [15]:
# distinct: check whether there are duplicate records
ids = lines.map(lambda cols: cols[0])
print(ids.count())
print(ids.distinct().count())

7163
7163


In [16]:
# sortBy: get the top 5 apps with the most total rating
lines.sortBy(lambda line: int(line[6]), False).map(lambda cols: cols[2]+": "+cols[6]).take(5)

['"Facebook": 2974676',
 '"Instagram": 2161558',
 '"Clash of Clans": 2130805',
 '"Temple Run": 1724546',
 '"Pandora - Music & Radio": 1126879']

In [17]:
# groupBy: group the apps by genre - returns a literable object for each group
grp = lines.groupBy(lambda cols: cols[12])
grp_sorted = grp.sortBy(lambda group: len(list(group[1])), False)
grp_count = grp_sorted.map(lambda group: group[0] + ": " + str(len(list(group[1]))))
grp_count.collect()

['"Games": 3848',
 '"Entertainment": 531',
 '"Education": 453',
 '"Photo & Video": 346',
 '"Utilities": 247',
 '"Health & Fitness": 178',
 '"Productivity": 178',
 '"Social Networking": 165',
 '"Lifestyle": 141',
 '"Music": 138',
 '"Shopping": 121',
 '"Sports": 114',
 '"Book": 112',
 '"Finance": 103',
 '"Travel": 80',
 '"News": 75',
 '"Weather": 72',
 '"Reference": 64',
 '"Food & Drink": 62',
 '"Business": 56',
 '"Navigation": 46',
 '"Medical": 23',
 '"Catalogs": 10']

In [18]:
# key-value expression for grouped RDD
print([(k, len(list(v))) for k, v in grp.take(5)])

[('"Food & Drink"', 62), ('"Games"', 3848), ('"Shopping"', 121), ('"Business"', 56), ('"Health & Fitness"', 178)]


# Key-Value Pair RDD (a.k.a PairRDD)

### keyBy, foldByKey, reduceByKey, groupByKey, lookup, mapValues, collectAsMap, countByKey, sortByKey, sampleByKey

In [19]:
words = text_file.flatMap(lambda line: line.replace(':','').split(' ')).filter(lambda word: word != '')
keywords = words.keyBy(lambda word: word.lower()[0])
keywords.take(10)

[('p', 'Public'),
 ('c', 'classes'),
 ('s', 'SparkContext'),
 ('m', 'Main'),
 ('e', 'entry'),
 ('p', 'point'),
 ('f', 'for'),
 ('s', 'Spark'),
 ('f', 'functionality.'),
 ('r', 'RDD')]

In [20]:
# foldByKey - require a value may be added to the result an arbitrary number of times,
# and must not change the result (e.g. 0 for addition, or 1 for multiplication)
word_counts = words.map(lambda word: (word.replace('.','').lower(), 1))
word_counts.foldByKey(0, add).take(10)

[('configuring', 1),
 ('cache', 1),
 ('only', 1),
 ('sparkcontext', 1),
 ('(rdd),', 1),
 ('variable', 2),
 ('public', 1),
 ('tasks', 2),
 ('files', 1),
 ('storagelevel', 1)]

In [21]:
# reduceByKey
word_counts.reduceByKey(lambda a,b: a+b).take(10)

[('configuring', 1),
 ('cache', 1),
 ('only', 1),
 ('sparkcontext', 1),
 ('(rdd),', 1),
 ('variable', 2),
 ('public', 1),
 ('tasks', 2),
 ('files', 1),
 ('storagelevel', 1)]

In [22]:
# groupByKey
keyword_freq = keywords.groupByKey()
print(list((w[0], len(list(w[1]))) for w in keyword_freq.collect()))

[('g', 1), ('s', 9), ('“', 1), ('b', 3), ('l', 1), ('r', 4), ('c', 5), ('i', 2), ('p', 3), ('j', 1), ('d', 2), ('e', 2), ('v', 3), ('m', 1), ('o', 2), ('f', 5), ('a', 11), ('t', 10), ('w', 2), ('(', 1)]


##### Note: groupBy and groupByKey can cause out of memory exceptions and are expensive operations.

While both groupByKey and reduceByKey can produce the correct answer, the reduceByKey example works much better on a large dataset. That's because Spark knows it can combine output with a common key on each partition before shuffling the data.

In [23]:
# lookup
keywords.lookup('s')

['SparkContext',
 'Spark',
 'Spark.',
 'shared',
 'SparkConf',
 'Spark.',
 'SparkFiles',
 'shipped',
 'StorageLevel']

In [24]:
# mapValues
keywords.mapValues(lambda word: word.upper()).take(5)

[('p', 'PUBLIC'),
 ('c', 'CLASSES'),
 ('s', 'SPARKCONTEXT'),
 ('m', 'MAIN'),
 ('e', 'ENTRY')]

In [25]:
# sortByKey (using appstore data)
pair = lines.map(lambda line: (line[0], line[2]))
pair.sortByKey().take(5)

[('"1"', '"PAC-MAN Premium"'),
 ('"10"', '"Ms. PAC-MAN"'),
 ('"100"', '"Tempo - Metronome with Setlists"'),
 ('"1000"', '"PDF Converter - Convert Documents Photos to PDF"'),
 ('"10000"', '"Bus Simulator PRO 2017"')]

In [26]:
# countByKey
keywords.countByKey()

defaultdict(int,
            {'(': 1,
             'a': 11,
             'b': 3,
             'c': 5,
             'd': 2,
             'e': 2,
             'f': 5,
             'g': 1,
             'i': 2,
             'j': 1,
             'l': 1,
             'm': 1,
             'o': 2,
             'p': 3,
             'r': 4,
             's': 9,
             't': 10,
             'v': 3,
             'w': 2,
             '“': 1})

In [27]:
# sampleByKey (deterministic(T/F), fraction, seed)
distinctChars = words.flatMap(lambda word: list(word.lower())).distinct().collect()
print(distinctChars)
    # assign a random number to each character -> 
        # the percentage samples related to the key to be draw
sampleMap = dict(map(lambda c: (c, random.random()), distinctChars))
print(sampleMap)
words.map(lambda word: (word.lower()[0], word)).sampleByKey(True, sampleMap, 6).collect()

['s', 'b', 'h', 'i', '.', 'p', 'd', 'g', 'l', 'c', 'y', '“', 'j', 'r', ',', 't', 'e', 'k', 'x', '-', 'a', 'f', 'n', '(', 'm', '”', 'v', 'o', 'u', ')', 'w']
{')': 0.008953684383868121, 'u': 0.026902177687455175, 'k': 0.3448540677574369, 'x': 0.7552635674996473, '-': 0.97775959373821, 'i': 0.26437577711493276, 'a': 0.5377169379396061, 'f': 0.5431578466842806, 'p': 0.4242352354851444, 'd': 0.7923890721797011, '(': 0.9435348472204684, 'g': 0.18427646027941935, 'l': 0.17174585638652773, '”': 0.5790377391313657, 'c': 0.38377829630176985, ',': 0.2806753902658189, 'y': 0.12262615340007577, 'e': 0.7460624175554434, 'w': 0.5724674011077882, 'n': 0.5668159712275453, 'b': 0.6321963742423565, 'h': 0.046067056892209135, 'm': 0.7828488731085762, '.': 0.4598671439552072, 't': 0.781857179104545, 's': 0.015724672011185326, 'r': 0.08701146150158123, 'o': 0.7358846420832192, 'v': 0.6711551498055514, '“': 0.8340043969601465, 'j': 0.11867723815618325}


[('c', 'classes'),
 ('m', 'Main'),
 ('f', 'for'),
 ('f', 'functionality.'),
 ('d', 'Distributed'),
 ('t', 'the'),
 ('b', 'basic'),
 ('b', 'basic'),
 ('a', 'abstraction'),
 ('i', 'in'),
 ('i', 'in'),
 ('b', 'Broadcast'),
 ('b', 'Broadcast'),
 ('b', 'Broadcast'),
 ('b', 'broadcast'),
 ('v', 'variable'),
 ('t', 'that'),
 ('t', 'that'),
 ('t', 'tasks.'),
 ('a', 'An'),
 ('a', 'An'),
 ('“', '“add-only”'),
 ('v', 'variable'),
 ('v', 'variable'),
 ('o', 'only'),
 ('v', 'values'),
 ('t', 'to.'),
 ('f', 'files'),
 ('f', 'files'),
 ('w', 'with'),
 ('t', 'TaskContext'),
 ('t', 'task,'),
 ('o', 'on'),
 ('t', 'the')]

# RDD Functions - Math / Statistical
These are functions (precisely actions) that are supported over RDD of doubles (in Java parlance). In PySpark, these are regular functions that expect list of int/float values.

### min, max, mean, sum, variance, stats

In [28]:
user_ratings = lines.map(lambda cols: float(cols[8]))
user_ratings.take(5)

[4.0, 4.0, 3.5, 4.0, 4.5]

In [29]:
print("Min:", user_ratings.min())
print("Max:", user_ratings.max())
print("Sum:", user_ratings.sum())
print("Mean:", user_ratings.mean())
print("Std:", user_ratings.stdev())
print("Variance:", user_ratings.variance())

Min: 0.0
Max: 5.0
Sum: 25289.0
Mean: 3.530503978779842
Std: 1.5148105385
Variance: 2.294650967560603


In [30]:
user_ratings.stats()

(count: 7163, mean: 3.530503978779842, stdev: 1.5148105385, max: 5.0, min: 0.0)

# RDD Actions

#### first, count, collect, take, top, reduce, takeOrdered, countByValue, countByKey

In [31]:
total_rating_count = lines.map(lambda cols: int(cols[6]))
# expensive when the data is big
total_rating_count.top(3)

[2974676, 2161558, 2130805]

In [32]:
pcoll.collect()

['PySpark', 'is', 'the', 'Python', 'API', 'for', 'Spark.']

In [33]:
pcoll.top(3)

['the', 'is', 'for']

In [34]:
pcoll.first()

'PySpark'

In [35]:
rdd_num = sc.parallelize([1,2,3,4,5])
print(rdd_num.reduce(add))
print(rdd_num.reduce(mul))

15
120


In [36]:
# expensive when the data is big
total_rating_count.takeOrdered(3)

[0, 0, 0]

In [37]:
words.countByValue().items()

dict_items([('cache', 1), ('Main', 1), ('Dataset', 1), ('Information', 1), ('Accumulator', 1), ('that', 2), ('for', 1), ('shipped', 1), ('Spark.', 2), ('about', 1), ('the', 3), ('classes', 1), ('point', 1), ('workers', 1), ('experimental.', 1), ('avaialble', 1), ('persistence', 1), ('functionality.', 1), ('entry', 1), ('tasks.', 1), ('For', 1), ('gets', 1), ('current', 1), ('Finer-grained', 1), ('basic', 1), ('running', 1), ('SparkContext', 1), ('jobs.', 1), ('StorageLevel', 1), ('“add-only”', 1), ('A', 2), ('configuring', 1), ('only', 1), ('shared', 1), ('variable', 2), ('across', 1), ('RDD', 1), ('task,', 1), ('files', 1), ('in', 1), ('values', 1), ('Public', 1), ('Access', 1), ('add', 1), ('Broadcast', 1), ('reused', 1), ('on', 1), ('SparkFiles', 1), ('with', 1), ('and', 1), ('Spark', 1), ('tasks', 1), ('abstraction', 1), ('SparkConf', 1), ('An', 1), ('can', 1), ('Distributed', 1), ('Resilient', 1), ('(RDD),', 1), ('TaskContext', 1), ('to.', 1), ('broadcast', 1), ('levels.', 1)])

In [38]:
keywords.countByKey().items()

dict_items([('s', 9), ('b', 3), ('m', 1), ('o', 2), ('r', 4), ('a', 11), ('j', 1), ('t', 10), ('p', 3), ('d', 2), ('(', 1), ('g', 1), ('i', 2), ('f', 5), ('c', 5), ('v', 3), ('l', 1), ('e', 2), ('“', 1), ('w', 2)])

# RDD Partitions & Parallelism

In [39]:
rdd = sc.parallelize(range(25), 3)
rdd.getNumPartitions()

3

# Shared Variables
- Broadcast

In [40]:
b = sc.broadcast([3,6,9,12])
b.value

[3, 6, 9, 12]

In [41]:
sc.parallelize([0,0]).flatMap(lambda x: b.value).collect() # each 0 will be replaced by broadcasted value

[3, 6, 9, 12, 3, 6, 9, 12]

In [42]:
b.unpersist() # using unpersist only removes broadcast variable from executor
sc.parallelize([0,0]).flatMap(lambda x: b.value).collect()

[3, 6, 9, 12, 3, 6, 9, 12]

In [43]:
# to remove it from driver as well use
b.destroy()
#sc.parallelize([0,0]).flatMap(lambda x: b.value).collect() # results in error as boradcast variable is destroyed

- Accumulators

In [44]:
a = sc.accumulator(0)
a

Accumulator<id=0, value=0>

In [45]:
sc.parallelize([3,4,5]).foreach(lambda x: a.add(x))
a.value

12

In [46]:
# resetting an accumulator
a.value = 0
a.value

0

In [47]:
a.value += 3
a.value

3