In [2]:
# Song lyrics used as toy input for the RDD examples that follow.
lyrics = '''Please could you stop the noise?
I'm trying to get some rest
From all the unborn chicken
Voices in my head
What's that?
(I may be paranoid, but not an android)
What's that?
(I may be paranoid, but not an android)
When I am king
You will be first against the wall
With your opinion
Which is of no consequence at all'''

# Turn line breaks into spaces, then cut on every single space: one word per element.
myCollection = lyrics.replace('\n', ' ').split(' ')

# The second argument of parallelize specifies the number of partitions.
words = spark.sparkContext.parallelize(myCollection, 3)
words

ParallelCollectionRDD[1] at readRDDFromFile at PythonRDD.scala:262

## Partitions

### mapPartitions

In [2]:
def text_by_partition(partition_index, records):
    """Render one partition as a single descriptive string.

    Args:
        partition_index: index of the partition being processed.
        records: iterable of strings held by that partition.

    Returns:
        A one-element list containing 'partition: <index> => <records joined by spaces>'.
    """
    return [f'partition: {partition_index} => ' + ' '.join(records)]
words.mapPartitionsWithIndex(text_by_partition).collect()

                                                                                

["partition: 0 => Please could you stop the noise? I'm trying to get some rest From all the unborn chicken Voices in my head What's that? (I may be paranoid, but not an android)",
 "partition: 1 => What's that? (I may be paranoid, but not an android) When I am king You will be first against the wall With your opinion Which is of no consequence at all"]

### foreachPartition

In [3]:
def write_text_by_parition(records, output_dir='/notebooks/foreachpartition_out'):
    """Write all records of one partition to a new text file, one record per line.

    Meant to be handed to ``foreachPartition``, which supplies the records of a
    single partition as an iterable.

    Args:
        records: iterable of values to write; each value is formatted into a line.
        output_dir: directory that receives the file. It is created when it does
            not exist yet. Defaults to the location the notebook has always used.

    Returns:
        None. The only effect is the file written to ``output_dir``.
    """
    # Imported inside the function (as the original did with `random`) so the
    # function does not rely on notebook-level imports.
    import os
    import uuid

    # Without this, open() raises FileNotFoundError when the directory is missing.
    os.makedirs(output_dir, exist_ok=True)

    # A UUID gives a collision-resistant name; slicing str(random.random()) could
    # collide and produced odd names for floats printed in scientific notation.
    file_name = os.path.join(output_dir, f'{uuid.uuid4().hex}.txt')
    with open(file_name, 'w') as file:
        file.writelines(f'{word}\n' for word in records)
        

words.foreachPartition(write_text_by_parition)

                                                                                

## Key-Value RDDs

### Creating a KV RDD

In [4]:
words.map(lambda word: (word.lower(), 1)).take(5)

[('please', 1), ('could', 1), ('you', 1), ('stop', 1), ('the', 1)]

In [5]:
keyword = words.keyBy(lambda word: word.lower()[0])
keyword.take(5)

[('p', 'Please'), ('c', 'could'), ('y', 'you'), ('s', 'stop'), ('t', 'the')]

### Mapping over values

In [6]:
keyword.mapValues(lambda word: word.upper()).take(5)

[('p', 'PLEASE'), ('c', 'COULD'), ('y', 'YOU'), ('s', 'STOP'), ('t', 'THE')]

In [7]:
# flatMapValues iterates over whatever the function returns; iterating the upper-cased
# string yields its characters, so every (key, word) pair becomes one (key, LETTER) pair
# per character. takeSample(True, 10) then draws 10 of those pairs with replacement.
keyword.flatMapValues(lambda word: word.upper()).takeSample(True, 10)

[('a', 'T'),
 ('a', 'L'),
 ('a', 'N'),
 ('b', 'B'),
 ('p', 'O'),
 ('a', 'D'),
 ('a', 'I'),
 ('n', 'I'),
 ('i', 'I'),
 ('p', 'E')]

### Extracting keys and values

In [8]:
print(keyword.keys().take(5))
print(keyword.values().take(5))

['p', 'c', 'y', 's', 't']
['Please', 'could', 'you', 'stop', 'the']


### lookup
Note that nothing enforces that a key maps to exactly one value, so if we look up "s" we get every value associated with that key (here, two of them).

In [9]:
keyword.lookup("s")

['stop', 'some']

## Aggregations

### countByKey

In [10]:
# flatMap over a lower-cased string emits its individual characters, one element per character
chars = words.flatMap(lambda word: word.lower())
# pair every character with a count of 1
kv_chars = chars.map(lambda letter: (letter, 1))
# countByKey reports how many elements exist for each key (displayed below as a defaultdict)
kv_chars.countByKey()

defaultdict(int,
            {'p': 5,
             'l': 10,
             'e': 19,
             'a': 23,
             's': 12,
             'c': 7,
             'o': 22,
             'u': 8,
             'd': 8,
             'y': 7,
             't': 22,
             'h': 13,
             'n': 22,
             'i': 22,
             '?': 3,
             "'": 3,
             'm': 7,
             'r': 10,
             'g': 4,
             'f': 3,
             'b': 6,
             'k': 2,
             'v': 1,
             'w': 7,
             '(': 2,
             ',': 2,
             ')': 2,
             'q': 1})

In [11]:
### groupByKey

In [12]:
kv_chars.groupByKey().take(5)

                                                                                

[('h', <pyspark.resultiterable.ResultIterable at 0x7ff1b4bcc390>),
 ('s', <pyspark.resultiterable.ResultIterable at 0x7ff1b4bcc490>),
 ('?', <pyspark.resultiterable.ResultIterable at 0x7ff1b4bcc090>),
 ('i', <pyspark.resultiterable.ResultIterable at 0x7ff1b4bcc350>),
 ('y', <pyspark.resultiterable.ResultIterable at 0x7ff1b4bcc050>)]

Now sum up all the records in a group:

In [13]:
def sum_key_values(row):
    """Sum the grouped values of one (key, values) row.

    Args:
        row: a pair whose first item is the key and whose second item is an
            iterable of numeric values belonging to that key.

    Returns:
        A tuple (key, total of the values, type of the values container).
    """
    key, values = row[0], row[1]
    # Note: the values of a key are collected in memory, which may result in a
    # memory error when there are too many values for a single key.
    return key, sum(values), type(values)
kv_chars.groupByKey().map(sum_key_values).take(5)

[('p', 5, pyspark.resultiterable.ResultIterable),
 ('l', 10, pyspark.resultiterable.ResultIterable),
 ('s', 12, pyspark.resultiterable.ResultIterable),
 ('c', 7, pyspark.resultiterable.ResultIterable),
 ('d', 8, pyspark.resultiterable.ResultIterable)]

### reduceByKey
A safe alternative to summing using groupByKey

In [15]:
# groups of values are not loaded into memory
kv_chars.reduceByKey(lambda left_value, right_value: left_value + right_value ).take(10) 

[('p', 5),
 ('l', 10),
 ('s', 12),
 ('c', 7),
 ('d', 8),
 ('y', 7),
 ('h', 13),
 ('i', 22),
 ('?', 3),
 ('r', 10)]

### aggregate
This function requires a start ("zero") value.
It operates on two levels: the first function aggregates within each partition, the second combines the results across partitions.
The start value is used at both aggregation levels.

In [16]:
nums = sc.parallelize(range(1,31), 6)

# sum the greatest values per partition

def max_func(left, right):
    """Return the larger of the two arguments (the first one when neither is larger)."""
    return right if right > left else left


def add_func(left, right):
    """Return the result of applying ``+`` to the two arguments."""
    total = left + right
    return total

# list the largest value of each partition, for comparison with the aggregate result
values_per_part = nums.mapPartitions(lambda records: [max(records)]).collect()
print(f'greatest values per partition: {values_per_part}')

# start value 0; max_func folds the records within a partition, add_func combines the
# per-partition results. 0 is a neutral start here because nums only holds positive values.
aggr_result = nums.aggregate(0, max_func, add_func)

print(f'summation of greatest values per partition: {aggr_result}')

greatest values per partition: [5, 10, 15, 20, 25, 30]
summation of greatest values per partition: 105


## Joins

### cogroup
groups together two key–value RDDs. 
This joins the given values by key. This is effectively just a group-based join on an RDD.

In [17]:
import random
# all distinct characters that occur in the lower-cased words
distinct_chars = words.flatMap(lambda word: word.lower()).distinct()
# two key-value RDDs over the same keys, each pairing a character with its own random number
char_rdd_1 = distinct_chars.map(lambda c: (c, random.random()))
char_rdd_2 = distinct_chars.map(lambda c: (c, random.random()))
# cogroup yields (key, (values from char_rdd_1, values from char_rdd_2)) -- see the printed rows
group = char_rdd_1.cogroup(char_rdd_2)
print('cogroup result:')
print("\n".join([str(r) for r in group.take(5)]))

def pretty_output(row):
    """Format one cogroup row as '<key>: [<all values as strings>]'.

    Args:
        row: a pair of (key, groups) where groups is an iterable of iterables.

    Returns:
        A string with the key followed by the flattened, stringified values.
    """
    key, groups = row[0], row[1]
    # flatten the nested iterables, keeping group order and value order
    values = []
    for group in groups:
        values.extend(str(v) for v in group)
    return f'{key}: {values}'
print('\ncogroup result interpreted:')
group.map(pretty_output).take(5)

cogroup result:
('p', (<pyspark.resultiterable.ResultIterable object at 0x7ff1b5015610>, <pyspark.resultiterable.ResultIterable object at 0x7ff1b4c28210>))
('l', (<pyspark.resultiterable.ResultIterable object at 0x7ff1b4c28110>, <pyspark.resultiterable.ResultIterable object at 0x7ff1b50156d0>))
('s', (<pyspark.resultiterable.ResultIterable object at 0x7ff1b5015750>, <pyspark.resultiterable.ResultIterable object at 0x7ff1b5015a90>))
('c', (<pyspark.resultiterable.ResultIterable object at 0x7ff1b5015bd0>, <pyspark.resultiterable.ResultIterable object at 0x7ff1b5015c50>))
('y', (<pyspark.resultiterable.ResultIterable object at 0x7ff1b4ff0890>, <pyspark.resultiterable.ResultIterable object at 0x7ff1b4ff0450>))

cogroup result interpreted:


["p: ['0.42341080152974664', '0.024821088908647093']",
 "l: ['0.4372471723329814', '0.8387213090509738']",
 "s: ['0.6191467183510945', '0.3837784985338819']",
 "c: ['0.457770202543789', '0.34687334516793533']",
 "y: ['0.17051305281148943', '0.857335437314308']"]

### innerJoin
Joins optionally accept the number of output partitions.
All join types (inner, fullOuter, leftOuter and rightOuter) follow the same pattern

In [21]:
# Pair every distinct word with a random value. A dedicated name is used here so the
# character-level `kv_chars` RDD from the aggregation examples is not overwritten;
# re-running those cells after this one would otherwise operate on different data.
kv_words = words.distinct().map(lambda word: (word, random.random()))
output_partitions = 10
print(kv_words.count())
# Self-join: the keys are distinct, so every key matches exactly once and the count is unchanged.
print(kv_words.join(kv_words).count())
# The optional second argument of join sets the number of output partitions.
print(kv_words.join(kv_words, output_partitions).count())

48
48
48


### zip
Pairs up the elements of two RDDs by position. Both RDDs must have the same number of partitions and the same number of elements in each partition.

In [22]:
# two RDDs of 10 elements each, both split into 2 partitions
range1 = sc.parallelize(range(10), 2)
range2 = sc.parallelize(range(10, 20), 2)
# zip pairs the elements of both RDDs by position
range1.zip(range2).collect()

[(0, 10),
 (1, 11),
 (2, 12),
 (3, 13),
 (4, 14),
 (5, 15),
 (6, 16),
 (7, 17),
 (8, 18),
 (9, 19)]

## Controlling Partitions

### coalesce
Collapses partitions, trying to reduce the amount of data moved across nodes. However, partitions may end up skewed.

In [14]:
# count the records of each partition by consuming its iterator (one count per partition)
partsSizes = words.mapPartitions(lambda records: [sum(1 for _ in records)]).collect()
print('original number partitions with size: ', partsSizes)
# collapse from 3 to 2 partitions
coal_words = words.coalesce(2)
# same per-partition count, now on the coalesced RDD
coal_partsSizes = coal_words.mapPartitions(lambda records: [sum(1 for _ in records)]).collect()
print('partitions with size after coalesce: ', coal_partsSizes)

original number partitions with size:  [20, 20, 22]
partitions with size after coalesce:  [20, 42]


### repartition
Allows repartitioning the data up or down, but performs a shuffle across nodes in the process.

In [19]:
# count the records of each partition by consuming its iterator (one count per partition)
partsSizes = words.mapPartitions(lambda records: [sum(1 for _ in records)]).collect()
print('original number partitions with size: ', partsSizes)
# collapse from 3 to 2 partitions
collapsed_words = words.repartition(2)
# same per-partition count, now on the repartitioned RDD
collapsed_partsSizes = collapsed_words.mapPartitions(lambda records: [sum(1 for _ in records)]).collect()
print('partitions with size after collapsing with repartition: ', collapsed_partsSizes)

original number partitions with size:  [20, 20, 22]
partitions with size after collapsing with repartition:  [32, 30]


In [22]:
# repartition can also increase the partition count: expand from 3 to 5 partitions
exp_words = words.repartition(5)
# count the records that ended up in each of the new partitions
exp_partsSizes = exp_words.mapPartitions(lambda records: [sum(1 for _ in records)]).collect()
print('partitions with size after expanding with repartition: ', exp_partsSizes)

partitions with size after expanding with repartition:  [0, 10, 20, 20, 12]


### Custom Partitioning

Custom partitioning allows fine-grained control over how data is distributed. It may be useful for handling data skew or limiting data movement.

The following example loads a list of customers. There are two customers that need to be put in their own partition, as they may fill up the memory due to their number of records.

In [26]:
# Load data: read the CSV with a header row and let Spark infer the column types
df = spark.read.option("header", "true").option("inferSchema", "true")\
  .csv("/notebooks/online-retail-dataset.csv")
print('Schema: ')
df.printSchema()
# let's collapse to 10 partitions to simulate data skew, then switch to the RDD API
retail = df.coalesce(10).rdd
# count the records of each partition by consuming its iterator
partsSizes = retail.mapPartitions(lambda records: [sum(1 for _ in records)]).collect()
print('Retail RDD partitions with size:', partsSizes)

[Stage 31:>                                                       (0 + 11) / 11]                                                                                

Schema: 
root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)





Retail RDD partitions with size: [50537, 50905, 50816, 50752, 51077, 50843, 50652, 49917, 49654, 86756]


                                                                                

A partitioner will be used to put two specific customers in the same partition:

In [42]:
# partitioner
def partitioning_func(key, num_partitions=4):
    """Choose the target partition for a customer key.

    The two heavy customers (17850 and 12583) are isolated in partition 0; every
    other key is spread randomly over the remaining partitions.

    Note that the placement of non-isolated keys is random on every call, so
    records sharing such a key are not guaranteed to end up in the same partition.

    Args:
        key: the customer number used as the record key (may be None).
        num_partitions: total number of partitions that will be passed to
            ``partitionBy``. Defaults to 4, the value used in this notebook, so
            calling the function with only ``key`` behaves as before.

    Returns:
        0 for the isolated customers, otherwise an integer in
        ``[1, num_partitions - 1]`` (0 when there is only a single partition).
    """
    # Imported locally, as in the original, so the function is self-contained.
    import random

    if key in (17850, 12583):
        # records which belong to these customers go to partition 0
        return 0
    if num_partitions < 2:
        # with a single partition there is nowhere else to send the record
        return 0
    # any other customer's records go to one of the remaining partitions
    return random.randint(1, num_partitions - 1)

# convert the retail RDD to a key-value RDD using the customer number as key
# (row[6] is the CustomerID column in the schema printed above)
kv_retail = retail.keyBy(lambda row: row[6])

# redistribute into 4 partitions, letting partitioning_func pick the partition for each key
repart_kv_retail = kv_retail.partitionBy(4, partitioning_func)

# report '<partition index>: <record count>' for every partition
partsSizes = repart_kv_retail.mapPartitionsWithIndex(lambda idx, records: [f'{idx}: {sum(1 for _ in records)}']).collect()
print('Retail RDD partitions with size:', partsSizes)



Retail RDD partitions with size: ['0: 563', '1: 180613', '2: 180053', '3: 180680']


                                                                                