In [1]:
from pyspark import SparkConf, SparkContext

# Configuration for a Spark application named "MyLearn" that runs against the
# "local" master.
conf = SparkConf().setMaster("local").setAppName("MyLearn")
# getOrCreate() returns the already-running context when this cell is executed
# again in the same kernel; constructing SparkContext(conf=conf) a second time
# would raise "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
# 1. map: apply a function to every element; each input yields exactly one output.
names = ["张无忌", "赵敏", "周芷若"]
listRDD = sc.parallelize(names)

# NOTE(review): "Helllo" (three l's) looks like a typo for "Hello"; it is left
# unchanged so the code still matches the recorded output of this cell.
temp = listRDD.map(lambda name: "Helllo" + name)

# foreach is an action whose function runs on the executors, not in the driver.
temp.foreach(lambda strs: print(strs))

# Each action re-evaluates the RDD, so bring the result to the driver once and
# reuse that list for both ways of displaying it.
collected = temp.collect()
for strs in collected:
    print(strs)
print(collected)

Helllo张无忌
Helllo赵敏
Helllo周芷若
['Helllo张无忌', 'Helllo赵敏', 'Helllo周芷若']


In [3]:
# 2. flatMap: the function returns an iterable for every element, and the items
#    of all those iterables are flattened into a single RDD.
names = ["张无忌", "赵敏", "周芷若"]
listRDD = sc.parallelize(names)

# A tuple is an iterable too, so flatMap unpacks it and every member becomes a
# separate element of the result, e.g.:
# temp = listRDD.flatMap(lambda name: (name, "Hello" + name))

# The function given to flatMap must return an iterable (here a one-item list).
temp = listRDD.flatMap(lambda name: ["Hello" + name])

# foreach is an action whose function runs on the executors, not in the driver.
temp.foreach(lambda strs: print(strs))

# Each action re-evaluates the RDD, so bring the result to the driver once and
# reuse that list for both ways of displaying it.
collected = temp.collect()
for strs in collected:
    print(strs)
print(collected)

Hello张无忌
Hello赵敏
Hello周芷若
['Hello张无忌', 'Hello赵敏', 'Hello周芷若']


In [7]:
# flatMap + map: some entries hold two space-separated names, so split every
# entry into individual names first and then greet each name.
names = ("张无忌 赵敏", "宋青书 周芷若", "刘德华", "张学友")
listRDD = sc.parallelize(names)

temp = (
    listRDD
    .flatMap(lambda name: name.split(" "))  # one element per individual name
    .map(lambda name: "Hello" + name)
)

# foreach is an action whose function runs on the executors, not in the driver.
temp.foreach(lambda strs: print(strs))

# Each action re-evaluates the RDD, so bring the result to the driver once and
# reuse that list for both ways of displaying it.
collected = temp.collect()
for strs in collected:
    print(strs)
print(collected)

Hello张无忌
Hello赵敏
Hello宋青书
Hello周芷若
Hello刘德华
Hello张学友
['Hello张无忌', 'Hello赵敏', 'Hello宋青书', 'Hello周芷若', 'Hello刘德华', 'Hello张学友']


In [10]:
# 3. filter: keep only the elements for which the predicate returns True.
names = ("张无忌 赵敏", "宋青书 周芷若", "刘德华", "张学友")
listRDD = sc.parallelize(names)

# The predicate tests the whole entry, so "张无忌 赵敏" is kept as one element.
temp = listRDD.filter(lambda name: name.startswith("张"))

# foreach is an action whose function runs on the executors, not in the driver.
temp.foreach(lambda strs: print(strs))

# Each action re-evaluates the RDD, so bring the result to the driver once and
# reuse that list for both ways of displaying it.
collected = temp.collect()
for strs in collected:
    print(strs)
print(collected)

张无忌 赵敏
张学友
['张无忌 赵敏', '张学友']


In [11]:
# 4. groupByKey: gather all values that share a key into one iterable per key.
data = ["hello spark", "hello world", "hello world"]
rdd = sc.parallelize(data)

# Split every line on single spaces and pair each word with the count 1.
mapRdd = (
    rdd
    .flatMap(lambda sentence: sentence.split(" "))
    .map(lambda word: (word, 1))
)
groupByRdd = mapRdd.groupByKey()

# The grouped values are ResultIterable objects, so printing the collected
# pairs directly only shows their object repr.
print(groupByRdd.collect())
# Turn each iterable into a list so the grouped values become visible.
print(groupByRdd.map(lambda pair: {pair[0]: list(pair[1])}).collect())

[('hello', <pyspark.resultiterable.ResultIterable object at 0x0000015BFEB201D0>), ('spark', <pyspark.resultiterable.ResultIterable object at 0x0000015BFEB20208>), ('world', <pyspark.resultiterable.ResultIterable object at 0x0000015BFEB208D0>)]
[{'hello': [1, 1, 1]}, {'spark': [1]}, {'world': [1, 1]}]


In [13]:
# 5. reduceByKey: merge the values of each key with a binary function.
data = ["hello spark", "hello world", "hello world"]
rdd = sc.parallelize(data)

# Split every line on single spaces and pair each word with the count 1.
mapRdd = (
    rdd
    .flatMap(lambda sentence: sentence.split(" "))
    .map(lambda word: (word, 1))
)

# The function is applied to the values that belong to the same key; adding
# the 1s gives the number of occurrences of each word.
reduceByKeyRdd = mapRdd.reduceByKey(lambda left, right: left + right)
print(reduceByKeyRdd.collect())

[('hello', 3), ('spark', 1), ('world', 2)]


In [16]:
# 6. sortByKey: order a pair RDD by its keys.
data = ["hello spark", "hello world", "hello world"]
rdd = sc.parallelize(data)
mapRDD = rdd.flatMap(lambda line: line.split(" ")).map(lambda x: (x, 1))
# Reduce the RDD built on the line above, so this cell does not depend on a
# `mapRdd` variable left over from an earlier cell.
reduceByKeyRdd = mapRDD.reduceByKey(lambda a, b: a + b)

# Descending order by key (the word).
print(reduceByKeyRdd.sortByKey(ascending=False).collect())

# Descending order by value (the count): swap each pair to (count, word), sort
# by the new key, then swap back to (word, count).
print(
    reduceByKeyRdd
    .map(lambda kv: (kv[1], kv[0]))
    .sortByKey(ascending=False)
    .map(lambda kv: (kv[1], kv[0]))
    .collect()
)

[('world', 2), ('spark', 1), ('hello', 3)]
[('hello', 3), ('world', 2), ('spark', 1)]


In [17]:
# union: concatenate the elements of two RDDs; duplicates are kept
# (3 appears in both inputs and therefore twice in the result).
a = sc.parallelize([1, 2, 3])
b = sc.parallelize([3, 4, 5])
combined = a.union(b)
combined.collect()

[1, 2, 3, 3, 4, 5]

In [18]:
# union + distinct: concatenate two RDDs, then drop the duplicated elements.
# The recorded output of this cell is not in sorted order.
a = sc.parallelize([1, 2, 3])
b = sc.parallelize([3, 4, 2])
combined = a.union(b)
combined.distinct().collect()

[2, 4, 1, 3]

In [21]:
# fullOuterJoin: join two pair RDDs on their keys and keep every key from both
# sides; a key that is missing on one side is paired with None on that side.
a = sc.parallelize([
    ("A", "a1"),
    ("C", "c1"),
    ("D", "d1"),
    ("F", "f1"),
    ("F", "f2"),
])
b = sc.parallelize([
    ("A", "a2"),
    ("C", "c2"),
    ("C", "c3"),
    ("E", "e1"),
])
joined = a.fullOuterJoin(b)
print(joined.collect())

[('C', ('c1', 'c2')), ('C', ('c1', 'c3')), ('A', ('a1', 'a2')), ('D', ('d1', None)), ('F', ('f1', None)), ('F', ('f2', None)), ('E', (None, 'e1'))]
