In [1]:
from pyspark import SparkContext, SparkConf, StorageLevel

- SparkContext - Spark 上下文管理对象，在 spark shell 环境中会自动创建这个对象
- local[N] - 本地运行模式（单机），用于在单机上完成开发与测试；local[N] 表示使用 N 个线程运行
- local[*] - 线程数量与主机 CPU 的逻辑核心数一致

In [2]:
# Configuration: local mode with 4 worker threads, application name "myApp".
conf = SparkConf().setMaster("local[4]").setAppName("myApp")
# getOrCreate() returns the already-running SparkContext when one exists, so
# re-running this cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# 创建 RDD
- RDD 为弹性分布式数据集；不考虑分布时，其性质与列表、集合差不多
- RDD 会将一个数据集分布在多个分区上
- RDD 数据集是非结构化数据

In [4]:
rdd_demo = sc.parallelize([1, 2, 3, 4, 5, 6])

- collect - 行动操作, collect返回元素列表，只能在元素数量很少的情况下使用

In [6]:
rdd_demo.collect()

[1, 2, 3, 4, 5, 6]

In [7]:
rdd_demo = sc.parallelize(['a', 'b', 'c', 'd'])

In [17]:
rdd_demo.collect()

['a', 'b', 'c', 'd']

In [8]:
# NOTE: parallelize() consumes its argument as an iterable, and iterating a
# dict yields only its keys, so this RDD holds 'a', 'b', 'c', 'd' (the values
# 1-4 are not distributed).
rdd_demo = sc.parallelize({'a': 1, 'b': 2, 'c': 3, 'd': 4})

In [18]:
rdd_demo.collect()

['a', 'b', 'c', 'd']

In [9]:
# A set has no defined order, so the order of the elements returned by a later
# collect() is not guaranteed and may differ between runs.
rdd_demo = sc.parallelize({'a', 'b', 3, 4})

In [19]:
rdd_demo.collect()

['a', 'b', 3, 4]

In [10]:
rdd_demo = sc.parallelize([('a', 1), ('b', 2), ('c', 3), ('d', 4)])

In [20]:
rdd_demo.collect()

[('a', 1), ('b', 2), ('c', 3), ('d', 4)]

# RDD 基本操作：转换操作

## union

In [12]:
rdd1 = sc.parallelize([1, 2, 3, 4, 5, 6])
rdd2 = sc.parallelize([10, 20, 30, 40, 55, 66])

In [21]:
rdd3 = rdd1.union(rdd2)

In [22]:
rdd3.collect()

[1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 55, 66]

## map

In [32]:
rdd4 = rdd1.map(lambda x: x**2)

In [33]:
rdd4.collect()

[1, 4, 9, 16, 25, 36]

In [28]:
def func(x):
    """Echo ``x`` to stdout, then return ``x * 10`` when ``x`` is even, else ``x + 2``."""
    print(x)
    is_even = x % 2 == 0
    return x * 10 if is_even else x + 2

In [30]:
rdd5 = rdd1.map(func)

In [31]:
rdd5.collect()

[3, 20, 5, 40, 7, 60]

In [34]:
rdd5.countByValue()

defaultdict(int, {3: 1, 20: 1, 5: 1, 40: 1, 7: 1, 60: 1})

## filter

In [36]:
rdd6 = rdd1.filter(lambda x: x % 2 == 0)

In [37]:
rdd6.collect()

[2, 4, 6]

# RDD 基本操作：行动操作

## first

In [40]:
rdd1 = sc.parallelize([1, 2, 3, 4, 5, 3, 4, 2, 7, 8])

In [41]:
rdd1.first()

1

## top 按排序规则（默认降序）取前 N 个元素

In [42]:
rdd1.top(5)

[8, 7, 5, 4, 4]

## take 按元素顺序取前 N 个元素

In [43]:
rdd1.take(5)

[1, 2, 3, 4, 5]

## count 

In [44]:
rdd1.count()

10

## countByValue

In [46]:
rdd1.countByValue()

defaultdict(int, {1: 1, 2: 2, 3: 2, 4: 2, 5: 1, 7: 1, 8: 1})

## reduce(func)

In [48]:
rdd1.reduce(lambda x, y: x + y)

39

In [49]:
rdd1.reduce(lambda x, y: x * y)

161280

In [51]:
# Subtraction is neither associative nor commutative, so the result depends on
# the order in which elements are combined; it is not reliable with reduce().
rdd1.reduce(lambda x, y: x - y)

11

## foreach(func)

In [52]:
def func(x):
    """Print a single element to stdout; returns nothing."""
    print(x)

In [53]:
rdd1.foreach(func)  # printed output shows up in the command-line console, not under this cell

# RDD持久化

转换操作是一种惰性操作，只有在执行行动操作时才会真正进行计算。
非持久化状态下，每次执行行动操作都会重新执行之前的转换操作；可以通过持久化，避免每次行动操作都重复进行转换。

persist - rdd.persist(storageLevel)，常用的存储级别：
- StorageLevel.MEMORY_ONLY
- StorageLevel.DISK_ONLY
- StorageLevel.MEMORY_AND_DISK

In [54]:
def func(x):
    """Print ``x`` so that each evaluation is visible, then return it doubled."""
    print(x)
    doubled = x * 2
    return doubled

In [55]:
rdd_demo = sc.parallelize([1, 2, 3, 4, 4, 3, 5, 1, 2, 3, 4, 4, 3, 5])

In [57]:
rdd2 = rdd_demo.map(func)

In [58]:
rdd2.collect()

[2, 4, 6, 8, 8, 6, 10, 2, 4, 6, 8, 8, 6, 10]

In [59]:
# Mark rdd2 for caching with the default storage level. No job runs here; the
# cache is populated by the next action executed on rdd2.
rdd2.persist()

PythonRDD[35] at collect at <ipython-input-58-83517eaf6d43>:1

In [64]:
rdd2.collect()

[2, 4, 6, 8, 8, 6, 10, 2, 4, 6, 8, 8, 6, 10]

In [65]:
rdd2.unpersist()

PythonRDD[35] at collect at <ipython-input-58-83517eaf6d43>:1

In [66]:
rdd2.collect()

[2, 4, 6, 8, 8, 6, 10, 2, 4, 6, 8, 8, 6, 10]

# 数值型RDD
- 数值型RDD支持求和，求平均，求方差等等数学计算

In [68]:
rdd2.count()

14

In [69]:
rdd2.countByValue()

defaultdict(int, {2: 2, 4: 2, 6: 4, 8: 4, 10: 2})

In [70]:
rdd2.variance()

6.20408163265306

In [71]:
rdd2.stats()

(count: 14, mean: 6.285714285714286, stdev: 2.4907993963089563, max: 10.0, min: 2.0)

In [73]:
# 将rdd中误差超过标准方差的元素剔除
# Drop the elements whose distance from the mean exceeds one standard deviation.
# stats() gathers count/mean/stdev in a single pass, so rdd2 is evaluated once
# here instead of once for mean() and again for stdev().
summary = rdd2.stats()
mean = summary.mean()
std = summary.stdev()
rdd3 = rdd2.filter(lambda x: abs(x - mean) <= std)

In [74]:
rdd3.collect()

[4, 6, 8, 8, 6, 4, 6, 8, 8, 6]

In [75]:
# 10**7 consecutive integers (0 .. 10**7 - 1), requested as 4 partitions.
rdd4 = sc.parallelize(range(10**7), 4)

In [76]:
print("进行转换操作")
# filter() is a lazy transformation, so nothing is computed in this cell.
# NOTE(review): the large rdd4 created in the previous cell is never used; this
# filters rdd_demo instead — presumably rdd4 was intended, confirm.
new_rdd = rdd_demo.filter(lambda x: x % 2 == 0)

进行转换操作


In [77]:
print("执行操作")
# variance() is an action: it triggers evaluation of the filter behind new_rdd.
res = new_rdd.variance()
print(res)

执行操作
0.8888888888888888
