# Create RDD
One of the easiest ways to get RDDs is from an existing DataFrame or Dataset

In [1]:
# start from a DataFrame and take its .rdd attribute to obtain an RDD
id_df = spark.range(500)
rdd1 = id_df.rdd
rdd1

MapPartitionsRDD[5] at javaToPython at NativeMethodAccessorImpl.java:0

However, by default, records are of type Row

In [2]:
# bring the first five elements to the driver, then show them and their type
records = rdd1.take(5)
print(f'records: {records}', f'record type: {type(records[0])}', sep='\n')

[Stage 0:>                                                          (0 + 1) / 1]

records: [Row(id=0), Row(id=1), Row(id=2), Row(id=3), Row(id=4)]
record type: <class 'pyspark.sql.types.Row'>


                                                                                

So it probably needs to be mapped to the correct data type:

In [3]:
# every record is a Row, so take element 0 to get the plain value
(
    spark.range(500)
    .rdd
    .map(lambda row: row[0])
)

PythonRDD[13] at RDD at PythonRDD.scala:53

# Create an RDD from a local collection

In [4]:
# str.split() without arguments splits on any run of whitespace (spaces and
# newlines alike), so the lyrics become a flat list of words and no empty
# strings can end up in the collection
myCollection = '''Please could you stop the noise?
I'm trying to get some rest
From all the unborn chicken
Voices in my head
What's that?
(I may be paranoid, but not an android)
What's that?
(I may be paranoid, but not an android)
When I am king
You will be first against the wall
With your opinion
Which is of no consequence at all'''.split()
words = spark.sparkContext.parallelize(myCollection, 2) # the second parameter specifies the number of partitions
words

ParallelCollectionRDD[14] at readRDDFromFile at PythonRDD.scala:262

# Read file using SparkContext

Reading individual lines from text files:

In [5]:
# textFile reads the file line by line: one record per line
tweets_path = "/work/data/covid-tweets.json"
tweets = spark.sparkContext.textFile(tweets_path)
tweets.count()

                                                                                

78011

Reading full files

In [6]:
# wholeTextFiles reads full files, so the count is the number of files
tweet_dir = "/work/data/covid-tweets"
tweet_files = spark.sparkContext.wholeTextFiles(tweet_dir)
tweet_files.count()

4

## Transformations

### distinct

In [7]:
# drop duplicate words, then show the remaining words and how many there are
d_words = words.distinct()
distinct_list = d_words.collect()
print(distinct_list)
print(f'count: {d_words.count()}')

[Stage 3:>                                                          (0 + 2) / 2]

['Please', 'stop', 'noise?', 'trying', 'rest', 'in', 'head', "What's", 'may', 'but', 'an', 'android)', 'When', 'am', 'king', 'against', 'opinion', 'Which', 'is', 'of', 'no', 'at', 'could', 'you', 'the', "I'm", 'to', 'get', 'some', 'From', 'all', 'unborn', 'chicken', 'Voices', 'my', 'that?', '(I', 'be', 'paranoid,', 'not', 'I', 'You', 'will', 'first', 'wall', 'With', 'your', 'consequence']
count: 48


                                                                                

### filter
select a subset

In [8]:
def myFilterFunc(word):
    """Return True for words that start with the letter 'a' (case-insensitive).

    Intended as a filter predicate: words for which this returns True are
    kept, all others are dropped.
    """
    # lower-case first so that both 'a' and 'A' match
    return word.lower().startswith('a')

# keep only the words accepted by myFilterFunc and collect them to the driver
f_words = words.filter(myFilterFunc)
f_words.collect()

['all', 'an', 'android)', 'an', 'android)', 'am', 'against', 'at', 'all']

### map
one-to-one transformation

In [9]:
def myMapFunc(word):
    """Map a word to a (first letter, word) tuple, both in lower case.

    An empty string maps to ('', '') instead of raising IndexError.
    """
    lowered = word.lower()
    # slicing (rather than indexing) keeps empty input from raising
    return lowered[:1], lowered
# apply myMapFunc to every word (one output record per input record)
m_words = words.map(myMapFunc)
m_words.take(5)

[('p', 'please'), ('c', 'could'), ('y', 'you'), ('s', 'stop'), ('t', 'the')]

### flatMap
one-to-many transformation

In [10]:
def toLetters(word):
    """Break a word into a list of its lower case letters."""
    return list(word.lower())

# flatMap emits every list element as its own record, so the record count grows
print(f'initial number of records: {words.count()}')
letters = words.flatMap(toLetters)
print(f'some letters:{letters.take(20)}')
print(f'final number of records: {letters.count()}')

initial number of records: 62
some letters:['p', 'l', 'e', 'a', 's', 'e', 'c', 'o', 'u', 'l', 'd', 'y', 'o', 'u', 's', 't', 'o', 'p', 't', 'h']
final number of records: 253


### sortBy

In [11]:
# longest words first: negating the length turns the ascending sort into a descending one
s_words = words.sortBy(lambda word: -len(word))
s_words.take(10)

['consequence',
 'paranoid,',
 'paranoid,',
 'android)',
 'android)',
 'chicken',
 'against',
 'opinion',
 'Please',
 'noise?']

### randomSplit
Split an RDD into several RDDs, one per weight, whose sizes are roughly proportional to the given weights

In [12]:
# two RDDs with different sizes based on the given weights; a fixed seed makes
# the split reproducible across runs
split_rdds = words.randomSplit([0.2, 0.8], seed=100)
for i, split_rdd in enumerate(split_rdds):
    print(f'{i}: {split_rdd.count()}')

0: 12
1: 50


## Actions
Actions either collect data to the driver or write to an external data source

### reduce
“reduce” an RDD of any kind of value to one value

In [13]:
# add up the integers 1..20 by repeatedly combining two values into one
numbers = spark.sparkContext.parallelize(range(1, 21))
numbers.reduce(lambda x, y: x + y)

210

Example: get the longest word

In [14]:
def wordLengthReducer(leftWord, rightWord):
    """Pick the longer of two words; when the lengths are equal, rightWord wins."""
    return leftWord if len(leftWord) > len(rightWord) else rightWord

# combine the words pairwise with wordLengthReducer until a single word is left
words.reduce(wordLengthReducer)

'consequence'

### count, collect and take
These actions have already been used in many of the examples above

### countByValue

In [15]:
# count how many times each distinct word occurs
words.countByValue()

defaultdict(int,
            {'Please': 1,
             'could': 1,
             'you': 1,
             'stop': 1,
             'the': 3,
             'noise?': 1,
             "I'm": 1,
             'trying': 1,
             'to': 1,
             'get': 1,
             'some': 1,
             'rest': 1,
             'From': 1,
             'all': 2,
             'unborn': 1,
             'chicken': 1,
             'Voices': 1,
             'in': 1,
             'my': 1,
             'head': 1,
             "What's": 2,
             'that?': 2,
             '(I': 2,
             'may': 2,
             'be': 3,
             'paranoid,': 2,
             'but': 2,
             'not': 2,
             'an': 2,
             'android)': 2,
             'When': 1,
             'I': 1,
             'am': 1,
             'king': 1,
             'You': 1,
             'will': 1,
             'first': 1,
             'against': 1,
             'wall': 1,
             'With': 1,
             'your'

### first

In [16]:
# fetch the first element of the RDD
words.first()

'Please'

### takeOrdered, top and takeSample

In [17]:
# parameters for takeSample, declared up front
withReplacement = True
numberToTake = 6
randomSeed = 100

# compare the different ways of fetching a handful of elements
print('take(5): ', words.take(5))
print('takeOrdered: ', words.takeOrdered(5))
print('top(5): ', words.top(5))
print('takeSample: ', words.takeSample(withReplacement, numberToTake, randomSeed))

take(5):  ['Please', 'could', 'you', 'stop', 'the']
takeOrdered:  ['(I', '(I', 'From', 'I', "I'm"]
top(5):  ['your', 'you', 'will', 'wall', 'unborn']
takeSample:  ['trying', "What's", 'first', 'but', 'rest', 'could']


## Saving Files

### saveAsTextFile

In [18]:
# write the words as text under a file: URI
# NOTE(review): this probably fails if the output directory already exists — confirm before re-running
words.saveAsTextFile("file:/data/out/paranoid_android3")

### saveAsSequenceFile
Saves the file as a Hadoop sequence file

In [19]:
# a sequence file consists of key-value pairs, so every word is first
# wrapped in a (None, word) tuple before being written
keyed_words = words.map(lambda word: (None, word))
keyed_words.saveAsSequenceFile("/data/out/paranoid_android_seqfile2")

## Caching and Checkpointing

In [20]:
# mark the lower-cased words for caching, then run a first action on them
lower_words = words.map(lambda word: word.lower()).cache()
lower_words.count()
# lower_words is cached, so this filter works on the cached results
a_words = lower_words.filter(lambda word: word.startswith('a'))
a_words.count()

9

In [21]:
# checkpointing in an HDFS path
spark.sparkContext.setCheckpointDir("hdfs://namenode:9000/checkpoint")
# checkpoint() only marks the RDD for checkpointing; nothing is written yet
words.checkpoint()
# run an action so that the checkpoint is actually materialized
words.count()

In [22]:
# count the elements of an RDD built from a range of 100,000,000 integers
spark.sparkContext.parallelize(range(100000000)).count()

                                                                                

100000000