In [1]:
import random

In [2]:
import pyspark

## 生成一个1000个元素组成的随机字符数组

In [3]:
# Draw 1000 uppercase letters at random from 'A'..'G' (code points 65-71).
ls = list(
    map(chr, (random.randint(ord("A"), ord("G")) for _ in range(1000)))
)

In [4]:
# Reuse the active SparkContext if one already exists: only one context may be
# active at a time, so constructing a new one on every run of this cell fails
# the second time. Keeping the handle in `sc` also allows a later sc.stop().
sc = pyspark.SparkContext.getOrCreate()
# Distribute the local list of letters as an RDD.
rdd = sc.parallelize(ls)

## 构建一个map，key是随机字符，value是随机整数
## 进行缓存

In [5]:
# Pair every letter with a random integer in [0, 10000] and mark the resulting
# (letter, value) RDD for caching so later actions can reuse it.
rdd2 = (
    rdd.map(lambda letter: (letter, random.randint(0, 10000)))
    .cache()
)

In [6]:
# Sanity check: show the first 30 (letter, value) pairs.
print(rdd2.take(30))

[('B', 6769), ('F', 8230), ('E', 3325), ('B', 148), ('F', 1956), ('D', 4230), ('A', 5862), ('D', 8148), ('A', 9723), ('D', 9276), ('E', 5407), ('G', 5177), ('F', 7832), ('B', 157), ('D', 5441), ('D', 2184), ('E', 3121), ('E', 4381), ('F', 7427), ('B', 4172), ('G', 4421), ('A', 7796), ('E', 1898), ('A', 2690), ('B', 5578), ('F', 2669), ('E', 2744), ('E', 5593), ('F', 2173), ('C', 5830)]


## 进行groupByKey操作

In [7]:
# Group the (letter, value) pairs of rdd2 by their key (the letter).
grdd = rdd2.groupByKey()

## groupByKey会生成由（key, value集合）组成的结构，其中value集合是一个可迭代的ResultIterable对象

In [8]:
# Peek at one grouped record: the values come back as an iterable object
# rather than a plain list.
grdd.take(1)

[('B', <pyspark.resultiterable.ResultIterable at 0x1acef9c5160>)]

## 再来一次Map，这次执行获取最大值和最小值的操作

In [9]:
def mymap(gmap):
    """Return ``(key, (min_value, max_value))`` for one grouped record.

    Args:
        gmap: A ``(key, values)`` pair such as the records produced by
            ``groupByKey``. ``values`` may be any iterable, including a
            one-shot iterator or generator.

    Returns:
        A tuple ``(key, (smallest, largest))``.

    Raises:
        ValueError: If ``values`` is empty.
    """
    key = gmap[0]
    # Materialize once: running min() and then max() over the same one-shot
    # iterator would leave max() with nothing left to consume.
    values = list(gmap[1])
    return (key, (min(values), max(values)))

In [10]:
# Reduce each group to (key, (min, max)) and bring the results to the driver.
grdd.map(mymap).collect()

[('B', (104, 9917)),
 ('C', (63, 9979)),
 ('F', (56, 9958)),
 ('A', (10, 9790)),
 ('E', (168, 9929)),
 ('D', (30, 9925)),
 ('G', (95, 9954))]

## 传统方式一、计算max和min，需要进行两次计算

In [11]:
# One reduceByKey per statistic: the data is reduced twice, once for the
# per-key maximum and once for the per-key minimum.
maxval = rdd2.reduceByKey(max).collect()
minval = rdd2.reduceByKey(min).collect()

In [12]:
# Print the per-key maxima and minima, one list per line.
print(maxval, minval, sep="\n")

[('B', 9917), ('C', 9979), ('F', 9958), ('A', 9790), ('E', 9929), ('D', 9925), ('G', 9954)]
[('B', 104), ('C', 63), ('F', 56), ('A', 10), ('E', 168), ('D', 30), ('G', 95)]


## 设计好map结构，也可以一次性算完

In [14]:
# Single pass: seed every value as its own (max, min) pair, then merge the
# pairs per key. Note the result order is (max, min).
(
    rdd2.map(lambda kv: (kv[0], (kv[1], kv[1])))
    .reduceByKey(lambda a, b: (max(a[0], b[0]), min(a[1], b[1])))
    .collect()
)

[('B', (9917, 104)),
 ('C', (9979, 63)),
 ('F', (9958, 56)),
 ('A', (9790, 10)),
 ('E', (9929, 168)),
 ('D', (9925, 30)),
 ('G', (9954, 95))]