### Creating an RDD from a Collection


In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext


In [3]:
# Obtain the Spark session for this notebook (getOrCreate reuses a running
# session if there is one), registered under the application name 'ex02'.
spark = (
    SparkSession.builder
    .appName('ex02')
    .getOrCreate()
)

In [4]:
# Sample (key, value) pairs. Keys repeat, so the per-key operations applied
# to this collection have several values to combine for each key.
data = [
    ("fox", 6),
    ("dog", 5),
    ("fox", 3),
    ("dog", 8),
    ("cat", 1),
    ("cat", 2),
    ("cat", 3),
    ("cat", 4),
]

In [6]:
# Distribute the in-memory `data` list as an RDD via the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)

In [7]:
# Return every element of the RDD as a Python list to inspect its contents.
rdd.collect()

[('fox', 6),
 ('dog', 5),
 ('fox', 3),
 ('dog', 8),
 ('cat', 1),
 ('cat', 2),
 ('cat', 3),
 ('cat', 4)]

In [8]:
# Number of elements in the RDD (one per tuple in `data`).
rdd.count()

8

### Aggregating and Merging Values of Keys

In [9]:
# Collapse each key's values into a single total by pairwise addition.
sum_per_key = rdd.reduceByKey(lambda acc, value: acc + value)

In [10]:
# Show the (key, total) pairs produced by the reduction.
sum_per_key.collect()

[('fox', 9), ('dog', 13), ('cat', 10)]

### Filtering an RDD’s Elements

In [11]:
# Keep only the (key, total) pairs whose total is strictly greater than 9;
# a key whose total is exactly 9 is dropped.
sum_filtered = sum_per_key.filter(lambda pair: pair[1] > 9)

In [12]:
# Show the pairs that passed the filter.
sum_filtered.collect()

[('dog', 13), ('cat', 10)]

### Grouping Similar Keys

In [13]:
# Gather all values that share a key into one iterable per key.
grouped = rdd.groupByKey()
# Each group's values display as an opaque ResultIterable, not as a list.
grouped.collect()

[('fox', <pyspark.resultiterable.ResultIterable at 0x26b67da67a0>),
 ('dog', <pyspark.resultiterable.ResultIterable at 0x26b67d022c0>),
 ('cat', <pyspark.resultiterable.ResultIterable at 0x26b67d02320>)]

In [18]:
# Peek at the values of the first group as a plain list. first() returns the
# same leading record as collect()[0] without bringing every group back to
# the driver just to index element 0.
# NOTE: which key comes first is decided by Spark's ordering of the groups,
# not necessarily by the order of keys in the input list.
list(grouped.first()[1])

[6, 3]

In [27]:
# mapValues(list) applies list() to each group's values only, turning every
# ResultIterable into a readable Python list while leaving the keys untouched.
grouped.mapValues(list).collect()

[('fox', [6, 3]), ('dog', [5, 8]), ('cat', [1, 2, 3, 4])]

### Grouping Similar Keys (Repeated)

In [28]:
# NOTE(review): this repeats the earlier groupByKey cell verbatim and rebinds
# `grouped` to a newly built RDD derived from the same `rdd`.
grouped = rdd.groupByKey()
# Each group's values display as a ResultIterable until converted to a list.
grouped.collect()

[('fox', <pyspark.resultiterable.ResultIterable at 0x26b69b5a530>),
 ('dog', <pyspark.resultiterable.ResultIterable at 0x26b69b5a4a0>),
 ('cat', <pyspark.resultiterable.ResultIterable at 0x26b69b5a5f0>)]

In [29]:
# Convert each group's ResultIterable into a plain list so the grouped values
# are readable; mapValues transforms only the value and keeps the key as-is.
grouped.mapValues(list).collect()

[('fox', [6, 3]), ('dog', [5, 8]), ('cat', [1, 2, 3, 4])]