In [1]:
# This chapter summarizes the following topics: aggregations and key-value RDDs, custom partitioning, and RDD joins.

In [2]:
# Split the sample sentence on whitespace to get the list of words used throughout the notebook.
mycollection = "spark the definitive guide : the big data processing made simple".split()

In [3]:
# Show the tokenized words (the ':' is kept as its own token).
mycollection

['spark',
 'the',
 'definitive',
 'guide',
 ':',
 'the',
 'big',
 'data',
 'processing',
 'made',
 'simple']

In [4]:
# Distribute the word list as an RDD with 2 partitions.
# NOTE(review): `sc` is not defined in the visible cells — presumably the SparkContext
# supplied by the PySpark session; confirm.
words = sc.parallelize(mycollection,2)

In [8]:
wordsmap = words.map(lambda word : (word.upper(),1))
# map: turns every word into a pair of its upper-cased form and the number 1, e.g. ('SPARK', 1)

In [9]:
# Peek at the first five (WORD, 1) pairs.
wordsmap.take(5)

[('SPARK', 1), ('THE', 1), ('DEFINITIVE', 1), ('GUIDE', 1), (':', 1)]

In [11]:
# keyBy: key each word by its lower-cased first character, giving (key, word) pairs.
keyword = words.keyBy(lambda word : word.lower()[0])
# Show the first five (key, word) pairs.
keyword.take(5)

[('s', 'spark'), ('t', 'the'), ('d', 'definitive'), ('g', 'guide'), (':', ':')]

In [13]:
# mapValues: upper-case only the value of each pair; the keys are left as they are.
keyword.mapValues(lambda word : word.upper()).collect()

[('s', 'SPARK'),
 ('t', 'THE'),
 ('d', 'DEFINITIVE'),
 ('g', 'GUIDE'),
 (':', ':'),
 ('t', 'THE'),
 ('b', 'BIG'),
 ('d', 'DATA'),
 ('p', 'PROCESSING'),
 ('m', 'MADE'),
 ('s', 'SIMPLE')]

In [15]:
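# flatMapValues applies the function to each value and flattens the result, so every
# character of the upper-cased word would become its own (key, character) pair:
#keyword.flatMapValues(lambda word : word.upper()).collect()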

In [17]:
# Pull the keys and the values out of the pair RDD and print each list on its own line.
print(
    keyword.keys().collect(),
    keyword.values().collect(),
    sep="\n",
)

['s', 't', 'd', 'g', ':', 't', 'b', 'd', 'p', 'm', 's']
['spark', 'the', 'definitive', 'guide', ':', 'the', 'big', 'data', 'processing', 'made', 'simple']


In [19]:
# lookup: return every value stored under the given key ('s' here).
keyword.lookup('s')

['spark', 'simple']

In [20]:
# sampleByKey: sample an RDD by a set of keys.
# The sampling can be approximate or exact, and done with or without replacement.

In [45]:
import random

# One seed drives both the per-key fractions and Spark's sampler. The fractions
# used to come from the unseeded global generator, so every re-run of this cell
# built a different sample map.
SAMPLE_SEED = 6
fraction_rng = random.Random(SAMPLE_SEED)

# Every distinct character that occurs in the words (a superset of the first-letter keys used below).
distinctchars = words.flatMap(lambda word : list(word.lower())).distinct().collect()
# Map each character to a sampling fraction in [0, 1). Iterating in sorted order keeps the
# character -> fraction assignment independent of the order in which collect() returns them.
sampleMap = {c: fraction_rng.random() for c in sorted(distinctchars)}
# Key each word by its lower-cased first letter, then sample with replacement (True)
# using the per-key fractions and the fixed seed.
words.map(lambda word: (word.lower()[0], word)).sampleByKey(True, sampleMap, SAMPLE_SEED).collect()

[('d', 'definitive'), (':', ':'), ('s', 'simple'), ('s', 'simple')]

# aggregations 

In [33]:
# Break every lower-cased word into its characters; flatMap flattens them into one RDD of single letters.
chars = words.flatMap(lambda token: list(token.lower()))
# Pair each letter with the number 1, e.g. ('s', 1).
kvcharacters = chars.map(lambda ch: (ch, 1))

In [34]:
def maxFun(left, right):
    """Return the larger of the two arguments (``left`` when neither is greater)."""
    return right if right > left else left

def addFun(left, right):
    """Combine the two arguments with ``+`` and return the result."""
    combined = left + right
    return combined
# RDD holding the integers 1 through 30, spread over 5 partitions.
# NOTE(review): `nums` is not used in the visible cells — presumably intended for the
# aggregate examples; confirm or remove.
nums = sc.parallelize(range(1,31) , 5)

In [37]:
kvcharacters.countByKey()
# countByKey: counts how many times each key (letter) occurs and returns the counts as a dictionary.

defaultdict(int,
            {'s': 4,
             'p': 3,
             'a': 4,
             'r': 2,
             'k': 1,
             't': 4,
             'h': 2,
             'e': 8,
             'd': 4,
             'f': 1,
             'i': 7,
             'n': 2,
             'v': 1,
             'g': 3,
             'u': 1,
             ':': 1,
             'b': 1,
             'o': 1,
             'c': 1,
             'm': 2,
             'l': 1})

In [38]:
# Difference between groupByKey and reduceByKey
# groupByKey: each executor must hold all the values for a given key in memory before the function is applied to them.
# With large datasets this can lead to OutOfMemoryErrors.
# A better approach for aggregations is reduceByKey.

In [39]:
# To do a simple count, first use flatMap to split the words into letters, then map each letter to (letter, 1),
# and finally call reduceByKey with a summation function.
# The reduce happens within each partition first, so not all values for a key have to be held in memory at once;
# only the per-partition partial results are combined in the final reduce.

In [41]:
# Sum the 1s per letter with the named addFun helper to get each letter's count.
kvcharacters.reduceByKey(addFun).collect()
# Equivalent inline form using a lambda:
#kvcharacters.reduceByKey(lambda x,y:x+y).collect()

[('s', 4),
 ('p', 3),
 ('r', 2),
 ('h', 2),
 ('d', 4),
 ('i', 7),
 ('g', 3),
 ('b', 1),
 ('c', 1),
 ('l', 1),
 ('a', 4),
 ('k', 1),
 ('t', 4),
 ('e', 8),
 ('f', 1),
 ('n', 2),
 ('v', 1),
 ('u', 1),
 (':', 1),
 ('o', 1),
 ('m', 2)]

In [42]:
# Other aggregation methods worth looking at: aggregate, aggregateByKey, combineByKey, foldByKey, cogroup

## joins

In [52]:
# Rebuild the set of distinct lower-cased characters, this time kept as an RDD.
distinctchars = (
    words
    .flatMap(lambda token: list(token.lower()))
    .distinct()
)
# Attach a random float to each character and show the resulting pairs.
distinctchars.map(lambda ch: (ch, random.random())).collect()


[('s', 0.8483401344130305),
 ('p', 0.6175334203806567),
 ('r', 0.44253735737429534),
 ('h', 0.004711285990537606),
 ('d', 0.17620064565192106),
 ('i', 0.42354662330046744),
 ('g', 0.5955476693020821),
 ('b', 0.31968779942779224),
 ('c', 0.27071645876217754),
 ('l', 0.5239463007417778),
 ('a', 0.5651157182361087),
 ('k', 0.4748421103620156),
 ('t', 0.3088462283026707),
 ('e', 0.7529273129256479),
 ('f', 0.27563749864730513),
 ('n', 0.23461206369306986),
 ('v', 0.26096463617434684),
 ('u', 0.3650090191566),
 (':', 0.7256531856697708),
 ('o', 0.8716548735486849),
 ('m', 0.4059385600881832)]

In [54]:
# Pair every distinct character with a random value; this is the right-hand side of the join.
kededchars = distinctchars.map(lambda ch: (ch, random.random()))
outputpartitions = 10
# Join on the letter key twice — once with default partitioning, once with an explicit
# number of output partitions — and print both row counts, one per line.
print(
    kvcharacters.join(kededchars).count(),
    kvcharacters.join(kededchars, outputpartitions).count(),
    sep="\n",
)

54
54


In [57]:
# zip: pair the elements of two RDDs position by position, giving (word, number) tuples.
# Both RDDs are built with 2 partitions, and range(11) supplies one number for each of the 11 words.
# NOTE(review): per the Spark docs, zip assumes matching partition counts and per-partition
# element counts — re-check if either input changes.
myrange = sc.parallelize(range(11) , 2)
words.zip(myrange).collect()

[('spark', 0),
 ('the', 1),
 ('definitive', 2),
 ('guide', 3),
 (':', 4),
 ('the', 5),
 ('big', 6),
 ('data', 7),
 ('processing', 8),
 ('made', 9),
 ('simple', 10)]

In [56]:
# Number of elements in the words RDD.
words.count()

11