In [5]:
# Tokenise the lyrics: newlines become spaces, then the text is split on single
# spaces, so punctuation stays attached to the neighbouring word.
myCollection = (
    '''Please could you stop the noise?
I'm trying to get some rest
From all the unborn chicken
Voices in my head
What's that?
(I may be paranoid, but not an android)
What's that?
(I may be paranoid, but not an android)
When I am king
You will be first against the wall
With your opinion
Which is of no consequence at all'''
    .replace('\n', ' ')
    .split(' ')
)
# Distribute the token list as an RDD; numSlices is the number of partitions.
words = spark.sparkContext.parallelize(myCollection, numSlices=3)
words

ParallelCollectionRDD[8] at readRDDFromFile at PythonRDD.scala:262

## Broadcast variables

Create a broadcast variable

In [6]:
# Ordinary driver-side dict used to enrich the RDD data.
# Keys are lower-case and two of them carry trailing punctuation.
words_weights = {
    "paranoid,": 1000,
    "android)": 750,
    "chicken": 500,
    "king": 250,
}

# Wrap the dict in a broadcast variable
broadcast_weights = spark.sparkContext.broadcast(words_weights)

# The wrapped object is exposed through the .value attribute
print('Value: ', broadcast_weights.value)

Value:  {'paranoid,': 1000, 'android)': 750, 'chicken': 500, 'king': 250}


Reference broadcast variable from an executor function

In [7]:
def to_weighted_word(word):
    """Pair a word with its weight from the broadcast lookup table.

    The lookup uses the lower-cased word; words missing from the table get a
    weight of 1. The word itself is returned with its original casing.

    Returns:
        A ``(weight, word)`` tuple.
    """
    lookup = broadcast_weights.value
    return lookup.get(word.lower(), 1), word

# Turn every word into a (weight, word) pair
w_words = words.map(to_weighted_word)
# The weight is negated so that the heaviest pairs come first
w_words.sortBy(keyfunc=lambda weighted: -weighted[0]).take(10)

[(1000, 'paranoid,'),
 (1000, 'paranoid,'),
 (750, 'android)'),
 (750, 'android)'),
 (500, 'chicken'),
 (250, 'king'),
 (1, 'Please'),
 (1, 'could'),
 (1, 'you'),
 (1, 'stop')]

## Accumulators

This example will count flights to and from China using an accumulator.
First, load example flights data:

In [12]:
# Read the CSV with a header row and let Spark infer the column types
flights = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/work/data/2010-summary.csv")
)
print('Schema: ')
flights.printSchema()

Schema: 
root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



Create an unnamed accumulator

In [13]:
# Integer accumulator starting at zero
acc_china = spark.sparkContext.accumulator(value=0)

Use accumulator

In [14]:
def acc_china_func(flight_row):
    """Add a row's third field to ``acc_china`` if the row involves China.

    The row is read by position: field 0 is the destination, field 1 the
    origin and field 2 the amount to add. Rows where neither the origin nor
    the destination equals "China" are ignored.
    """
    dest, origin = flight_row[0], flight_row[1]
    if "China" not in (origin, dest):
        return
    acc_china.add(flight_row[2])
    
# Apply the function to every row. Each run of this cell adds to acc_china
# again, so re-create the accumulator before re-running to avoid double counting.
flights.foreach(acc_china_func)

Show accumulator final value

In [15]:
# Total added to the accumulator by the foreach pass over the flights rows
acc_china.value

953