# 键值对RDD 

In [1]:
from pyspark import SparkContext
# Create the Spark context in local mode; the second setting is the
# application name.
sc = SparkContext(master='local', appName='Pair RDD Programming')

## RDD 创建 
构建pair rdd的方式在不同语言中有所不同。<br>
Python中，为了让提取键之后的数据能够在函数中应用，需要返回一个由二元组组成的RDD。<br>

In [5]:
# Named `words` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the session.
words = ["Hadoop", "Spark", "Python", "Spark", "MachineLearning", "MachineLearning"]
rdd = sc.parallelize(words)
# Turn every word into a (key, value) 2-tuple, which makes this a pair RDD.
pairRDD = rdd.map(lambda word: (word, 1))
pairRDD.collect()

[('Hadoop', 1),
 ('Spark', 1),
 ('Python', 1),
 ('Spark', 1),
 ('MachineLearning', 1),
 ('MachineLearning', 1)]

## 聚合操作 

### reduceByKey(func)
reduceByKey(func)的功能是，使用func函数合并具有相同键的值。<br>
例如如下代码，对具有相同key的value求和。

In [25]:
# Named `words` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the session.
words = ["Hadoop", "Spark", "Python", "Spark", "MachineLearning", "MachineLearning"]
rdd = sc.parallelize(words)
# Pair every word with the value 1.
pairRDD = rdd.map(lambda word: (word, 1))
# Merge the values that share a key by adding them, i.e. count each word.
pairRDD.reduceByKey(lambda a, b: a + b).collect()

[('Hadoop', 1), ('Spark', 2), ('Python', 1), ('MachineLearning', 2)]

使用reduceByKey与mapValues求key对应的均值

In [7]:
rdd = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
    ('Python', 1),
    ('Spark', 2),
])
# Re-emit every record as an explicit (key, value) 2-tuple.
pairRDD = rdd.map(lambda kv: (kv[0], kv[1]))
# mapValues(func) applies func to the value only: pair each value with a
# count of 1, then add up values and counts per key -> (key, (sum, count)).
sum_num = (
    pairRDD
    .mapValues(lambda v: (v, 1))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
)
sum_num.collect()

[('Hadoop', (0, 1)), ('Spark', (5, 2)), ('Python', (1, 1))]

In [8]:
# Divide each per-key sum by its count; the keys themselves are dropped, so
# the result is a plain list of means.
sum_num.values().map(lambda sum_count: sum_count[0] / sum_count[1]).collect()

[0.0, 2.5, 1.0]

### groupByKey()
groupByKey()的功能是，对具有相同键的值进行分组。
例如对[('Hadoop', 1),
 ('Spark', 1),
 ('Python', 1),
 ('Spark', 1),
 ('MachineLearning', 1),
 ('MachineLearning', 1)]
 进行分组，逻辑上的结果是[('Hadoop', (1,)), ('Spark', (1, 1)), ('Python', (1,)), ('MachineLearning', (1, 1))]。<br>
注意：每个键对应的值实际上是一个可迭代对象（ResultIterable），因此collect()直接显示的是对象本身，而不是其中的元素。

In [26]:
# Named `words` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the session.
words = ["Hadoop", "Spark", "Python", "Spark", "MachineLearning", "MachineLearning"]
rdd = sc.parallelize(words)
# Pair every word with the value 1.
pairRDD = rdd.map(lambda word: (word, 1))
# Group the values that share a key; each group comes back as an iterable.
pairRDD.groupByKey().collect()

[('Hadoop', <pyspark.resultiterable.ResultIterable at 0x7feb00175668>),
 ('Spark', <pyspark.resultiterable.ResultIterable at 0x7feb00175dd8>),
 ('Python', <pyspark.resultiterable.ResultIterable at 0x7feb001750f0>),
 ('MachineLearning',
  <pyspark.resultiterable.ResultIterable at 0x7feb00175cf8>)]

### 连接
join是内连接

In [13]:
rdd_1 = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
    ('Python', 1),
    ('Spark', 2),
])
rdd_2 = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
])
# Inner join: only keys present in both RDDs appear in the result.
inner_joined = rdd_1.join(rdd_2)
inner_joined.collect()

[('Spark', (3, 3)), ('Spark', (2, 3)), ('Hadoop', (0, 0))]

leftOuterJoin() 是左外连接

In [15]:
rdd_1 = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
    ('Python', 1),
    ('Spark', 2),
])
rdd_2 = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
])
# Left outer join: every key of rdd_1 is kept; a key with no match in rdd_2
# gets None as its right-hand value.
left_joined = rdd_1.leftOuterJoin(rdd_2)
left_joined.collect()

[('Spark', (3, 3)),
 ('Spark', (2, 3)),
 ('Python', (1, None)),
 ('Hadoop', (0, 0))]

rightOuterJoin() 是右外连接

In [18]:
rdd_1 = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
    ('Python', 1),
    ('Spark', 2),
])
rdd_2 = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
    ('Java', 3),
])
# Right outer join: every key of rdd_2 is kept; a key with no match in rdd_1
# gets None as its left-hand value.
right_joined = rdd_1.rightOuterJoin(rdd_2)
right_joined.collect()

[('Spark', (3, 3)), ('Spark', (2, 3)), ('Java', (None, 3)), ('Hadoop', (0, 0))]

### 排序
我们使用sortByKey()对键值对RDD进行排序操作。

In [23]:
rdd = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
    ('Python', 1),
    ('Spark', 2),
])
# ascending=True sorts the keys in ascending order,
# ascending=False sorts them in descending order.
descending_by_key = rdd.sortByKey(ascending=False)
descending_by_key.collect()

[('Spark', 3), ('Spark', 2), ('Python', 1), ('Hadoop', 0)]

In [24]:
# A key function can be supplied as well; here the keys are ordered by their
# string form. `str` is passed directly because wrapping it in a lambda adds
# nothing.
rdd.sortByKey(ascending=False, keyfunc=str).collect()

[('Spark', 3), ('Spark', 2), ('Python', 1), ('Hadoop', 0)]

### keys()
keys()会把键值对RDD中的key返回形成一个新的RDD

In [27]:
# Named `words` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the session.
words = ["Hadoop", "Spark", "Python", "Spark", "MachineLearning", "MachineLearning"]
rdd = sc.parallelize(words)
# Pair every word with the value 1.
pairRDD = rdd.map(lambda word: (word, 1))
# keys() yields a new RDD holding only the key of each pair.
pairRDD.keys().collect()

['Hadoop', 'Spark', 'Python', 'Spark', 'MachineLearning', 'MachineLearning']

### values()
values()会把键值对RDD中的value返回形成一个新的RDD

In [2]:
# Named `words` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the session.
words = ["Hadoop", "Spark", "Python", "Spark", "MachineLearning", "MachineLearning"]
rdd = sc.parallelize(words)
# Pair every word with the value 1.
pairRDD = rdd.map(lambda word: (word, 1))
# values() yields a new RDD holding only the value of each pair.
pairRDD.values().collect()

[1, 1, 1, 1, 1, 1]

In [4]:
rdd = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
    ('Python', 1),
    ('Spark', 2),
])
# Re-emit every record as an explicit (key, value) 2-tuple.
pairRDD = rdd.map(lambda kv: (kv[0], kv[1]))
# Per key: pair each value with a count of 1, add up values and counts, then
# divide sum by count. The keys are dropped by the final map.
(
    pairRDD
    .mapValues(lambda v: (v, 1))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    .map(lambda kv: kv[1][0] / kv[1][1])
    .collect()
)

[0.0, 2.5, 1.0]

## 行动操作

### countByKey() 对每个键的值的个数进行计数

In [26]:
rdd = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
    ('Python', 1),
    ('Spark', 2),
])
# Re-emit every record as an explicit (key, value) 2-tuple.
pairRDD = rdd.map(lambda kv: (kv[0], kv[1]))
# Count how many records carry each key; the result is a dict-like mapping
# from key to count.
pairRDD.countByKey()

defaultdict(int, {'Hadoop': 1, 'Spark': 2, 'Python': 1})

### lookup() 返回给定键对应的值

In [28]:
rdd = sc.parallelize([
    ('Hadoop', 0),
    ('Spark', 3),
    ('Python', 1),
    ('Spark', 2),
])
# Re-emit every record as an explicit (key, value) 2-tuple.
pairRDD = rdd.map(lambda kv: (kv[0], kv[1]))
# Fetch the list of all values stored under the key 'Spark'.
pairRDD.lookup('Spark')

[3, 2]