In [1]:
# `sc` is normally injected by the PySpark shell/kernel. getOrCreate() returns
# that existing context when there is one and creates a context otherwise, so
# the notebook no longer depends on hidden kernel state after Restart & Run All.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
# Display the master URL the context is connected to.
sc.master

u'local[*]'

# 1-1. Basic RDD "transform" -> 不會立刻執行

## parallelize 

In [2]:
# Build an RDD from a local Python list (the duplicated 5 is used by later cells).
intRDD = sc.parallelize([1, 2, 3, 5, 5])

In [3]:
# collect() is an action: it returns all elements of the RDD as a Python list.
intRDD.collect()

[1, 2, 3, 5, 5]

In [4]:
# RDD elements are not limited to numbers; here they are strings.
stringRDD = sc.parallelize(["A", "B", "A"])
stringRDD.collect()

['A', 'B', 'A']

## map 

In [5]:
# map applies the function to every element and yields a new RDD of the results.
intRDD.map(lambda x: x + 1).collect()

[2, 3, 4, 6, 6]

In [6]:
def addOne(x):
    """Return ``x`` incremented by one; used as a named mapper for ``RDD.map``."""
    incremented = x + 1
    return incremented

# A named function can be passed to map exactly like a lambda.
intRDD.map(addOne).collect()

[2, 3, 4, 6, 6]

## filter 

In [7]:
# filter keeps only the elements for which the predicate is true.
intRDD.filter(lambda x: x > 3).collect()

[5, 5]

## distinct 

In [8]:
# distinct removes duplicates: the repeated 5 appears only once in the result.
intRDD.distinct().collect()

[1, 2, 3, 5]

## randomSplit 

In [9]:
dataRDD = sc.parallelize(range(100))
# Split into two RDDs with weights 0.6 / 0.4. The split is random and no seed is
# passed, so the sizes of the two parts differ between runs.
# NOTE(review): pass seed=<int> to randomSplit if reproducible output is needed.
sampleRDDs = dataRDD.randomSplit([0.6, 0.4])
# Displays the list of the two resulting RDD objects, not their contents.
sampleRDDs

[PythonRDD[11] at RDD at PythonRDD.scala:48,
 PythonRDD[12] at RDD at PythonRDD.scala:48]

In [10]:
# Size of the first split: close to 60 of the 100 elements, but not exactly,
# because the split is random.
len(sampleRDDs[0].collect())

63

# 1-2. Multiple RDD "transform"

In [11]:
# Three small RDDs used to demonstrate the set-like transformations below.
intRDD1 = sc.parallelize([1, 2, 3, 5, 5])
intRDD2 = sc.parallelize([5, 6])
intRDD3 = sc.parallelize([2, 7])

## union 聯集

In [12]:
# union concatenates the RDDs and keeps duplicates (5 appears three times, 2 twice).
intRDD1.union(intRDD2).union(intRDD3).collect()

[1, 2, 3, 5, 5, 5, 6, 2, 7]

## intersection 交集

In [13]:
# intersection returns the elements present in both RDDs, de-duplicated (a single 5).
intRDD1.intersection(intRDD2).collect()

[5]

## subtract 差集

In [14]:
# subtract removes every element that also occurs in intRDD2 (both 5s are dropped).
intRDD1.subtract(intRDD2).collect()

[1, 2, 3]

## cartesian 很恐怖! 少用 

In [15]:
# cartesian yields every (a, b) pair: 5 x 2 = 10 tuples here. The result size is
# the product of both input sizes, so use it sparingly.
intRDD1.cartesian(intRDD2).collect()

[(1, 5),
 (1, 6),
 (2, 5),
 (2, 6),
 (3, 5),
 (3, 6),
 (5, 5),
 (5, 6),
 (5, 5),
 (5, 6)]

# 1-3. Basic RDD "action" -> 會開始執行

## first, take

In [16]:
# first() returns the first element of the RDD.
intRDD.first()

1

In [17]:
# take(n) returns the first n elements as a list.
intRDD.take(2)

[1, 2]

## takeOrdered

In [18]:
# Unsorted input for the takeOrdered examples.
testRDD = sc.parallelize([4, 3, 1, 2])

In [19]:
# takeOrdered(n) returns the n smallest elements in ascending order.
testRDD.takeOrdered(3)

[1, 2, 3]

In [20]:
# Negating the sort key reverses the ordering, so this returns the 3 largest elements.
testRDD.takeOrdered(3, key=lambda x: -x)

[4, 3, 2]

# 2-1. Basic Key-Value RDD "transform"

In [21]:
# Key-value RDD: each element is a (key, value) tuple; key 3 occurs twice.
kvRDD1 = sc.parallelize([(3, 4), (3, 6), (5, 6), (1, 2)])

## keys

In [22]:
# keys() yields only the key of each pair.
kvRDD1.keys().collect()

[3, 3, 5, 1]

## values 

In [23]:
# values() yields only the value of each pair.
kvRDD1.values().collect()

[4, 6, 6, 2]

## map, filter 

In [24]:
# Plain map also works on pair RDDs: x is the whole (key, value) tuple, so x[0] is the key.
kvRDD1.map(lambda x: x[0]).collect()

[3, 3, 5, 1]

In [25]:
# Keep only the pairs whose key is less than 5.
kvRDD1.filter(lambda x: x[0] < 5).collect()

[(3, 4), (3, 6), (1, 2)]

## mapValues 

In [26]:
# mapValues transforms only the value (squared here) and leaves the key untouched.
kvRDD1.mapValues(lambda v: v * v).collect()

[(3, 16), (3, 36), (5, 36), (1, 4)]

## sortByKey 

In [27]:
# Sort the pairs by key; with no arguments the order is ascending.
kvRDD1.sortByKey().collect()

[(1, 2), (3, 4), (3, 6), (5, 6)]

In [28]:
# ascending=False sorts the pairs by key in descending order.
kvRDD1.sortByKey(ascending=False).collect()

[(5, 6), (3, 4), (3, 6), (1, 2)]

## reduceByKey 

<img src="../images/reduceByKey.png" width="500" style="float: left;">

In [29]:
# reduceByKey merges the values of each key with the given function: key 3 -> 4 + 6 = 10.
kvRDD1.reduceByKey(lambda x, y: x + y).collect()

[(1, 2), (3, 10), (5, 6)]

# 2-2. Multiple Key-Value RDD "transform" 

In [30]:
# Show kvRDD1 again for reference before the join examples.
kvRDD1.collect()

[(3, 4), (3, 6), (5, 6), (1, 2)]

In [31]:
# Second pair RDD for the joins; it contains only key 3.
kvRDD2 = sc.parallelize([(3, 8)])

## join (by key) 

In [32]:
# Inner join on key: only key 3 exists in both RDDs. Each result is
# (key, (value_from_kvRDD1, value_from_kvRDD2)).
kvRDD1.join(kvRDD2).collect()

[(3, (4, 8)), (3, (6, 8))]

## leftOuterJoin

In [33]:
# leftOuterJoin keeps every key of kvRDD1; keys without a match in kvRDD2 get None
# as the right-hand value.
kvRDD1.leftOuterJoin(kvRDD2).collect()

[(1, (2, None)), (3, (4, 8)), (3, (6, 8)), (5, (6, None))]

## rightOuterJoin 

In [34]:
# rightOuterJoin keeps every key of kvRDD2; since it only has key 3 (present in
# kvRDD1 as well), the result here equals the inner join.
kvRDD1.rightOuterJoin(kvRDD2).collect()

[(3, (4, 8)), (3, (6, 8))]

## subtractByKey

In [35]:
# subtractByKey drops the pairs whose key appears in kvRDD2 (key 3).
kvRDD1.subtractByKey(kvRDD2).collect()

[(1, 2), (5, 6)]

# 2-3. Key-Value RDD "action"

In [36]:
# first() on a pair RDD returns the first (key, value) tuple.
kvRDD1.first()

(3, 4)

In [37]:
# take(n) on a pair RDD returns the first n (key, value) tuples.
kvRDD1.take(2)

[(3, 4), (3, 6)]

## countByKey 

In [38]:
# countByKey returns a dict-like mapping of key -> number of pairs with that key.
kvRDD1.countByKey()

defaultdict(int, {1: 1, 3: 2, 5: 1})

## collectAsMap

In [39]:
# Show kvRDD1 again: key 3 has two values (4 and 6).
kvRDD1.collect()

[(3, 4), (3, 6), (5, 6), (1, 2)]

In [40]:
kvRDD1.collectAsMap() # becomes a dict; keys are unique, so (3, 4) is lost and only (3, 6) remains

{1: 2, 3: 6, 5: 6}

## lookup 

In [41]:
# lookup(key) returns the list of all values stored under that key.
kvRDD1.lookup(3)

[4, 6]

# 3. broadcast

In [42]:
# Lookup table indexed by person id (despite the name, this is a list, not a dict).
peopleMap = ['Mike', 'Mary', 'Tiffany', 'Jenny']

In [43]:
# RDD of the ids 0..3, used as indexes into peopleMap.
peopleIds = sc.parallelize(range(4))

## Before broadcast 

In [44]:
# The lambda references peopleMap directly, so the list is captured by the
# closure and shipped along with it.
peopleIds.map(lambda x: peopleMap[x]).collect()

['Mike', 'Mary', 'Tiffany', 'Jenny']

## Problem? => 每個執行 map 的 task 都要複製一份 peopleMap => 很耗記憶體！

<img src="../images/before-broadcast.png" width="700" style="float: left;">

## After broadcast 

In [45]:
# Wrap the list in a broadcast variable.
bpeopleMap = sc.broadcast(peopleMap)

In [46]:
# Inside the closure the broadcast data is read through .value.
peopleIds.map(lambda x: bpeopleMap.value[x]).collect()

['Mike', 'Mary', 'Tiffany', 'Jenny']

## 1個 worker node只存一份peopleMap在記憶體

<img src="../images/after-broadcast.png" width="700" style="float: left;">

# 4. accumulator 累加器

## !!! 在平行化過程不能任意改變資料

In [47]:
# NOTE(review): this rebinds intRDD (previously [1, 2, 3, 5, 5]); re-running
# earlier cells after this one produces different results. Consider a new name.
intRDD = sc.parallelize(range(10))

In [48]:
# Accumulator for the running sum; it starts from a float (0.0).
total = sc.accumulator(0.0)

In [49]:
# Accumulator for the element count; it starts from an integer (0).
num = sc.accumulator(0)

In [50]:
def _accumulate(x):
    """Add one element to the running sum and bump the element count.

    Only writes to the ``total`` and ``num`` accumulators; returns nothing.
    """
    total.add(x)
    num.add(1)


# foreach is used purely for its side effects. A named function replaces the
# lambda that built a throwaway list just to run two statements.
intRDD.foreach(_accumulate)

In [51]:
# Read the accumulators through .value: sum, count and mean. total was created
# from 0.0, so the division is a float division.
total.value, num.value, (total.value / num.value)

(45.0, 10, 4.5)

# 5. persist 持久化

In [52]:
# Pipeline: pair each n in 0..999 with key n % 50 and value n * 3, drop the pairs
# whose value is not greater than 56, then sum the remaining values per key.
myRDD = (
    sc.parallelize(range(1000))
    .map(lambda n: (n % 50, n * 3))
    .filter(lambda pair: pair[1] > 56)
    .reduceByKey(lambda left, right: left + right)
)

## 要連續對myRDD做不同事前，先cache 

In [53]:
# Mark myRDD for caching so the actions below can reuse the computed result
# instead of recomputing the whole pipeline each time.
myRDD.persist()

PythonRDD[116] at RDD at PythonRDD.scala:48

In [54]:
# Number of pairs after reduceByKey: one per distinct key (n % 50 gives 50 keys).
myRDD.count()

50

In [55]:
# Peek at five (key, summed value) pairs.
myRDD.take(5)

[(0, 28500), (32, 30420), (48, 31380), (8, 28956), (16, 29412)]

In [56]:
# Grand total: add up the per-key sums.
myRDD.values().reduce(lambda x, y: x + y)

1497987

## 不再使用時記得 unpersist

In [57]:
# Release the cached data once myRDD is no longer needed.
myRDD.unpersist()

PythonRDD[116] at RDD at PythonRDD.scala:48

# 6.  Text IO

##  textFile

In [58]:
# Load the text file as an RDD with one element per line.
lines = sc.textFile("../dataset/news.txt")

In [59]:
# Peek at the first line of the file.
lines.take(1)

[u'Google, which not along ago was using artificial intelligence to identify cat pictures, has moved onto something bigger -- breast cancer.']

In [60]:
# Number of lines in the file.
lines.count()

12

In [61]:
# Keep only the lines that contain the substring 'Google' (the match is case-sensitive).
new_text = lines.filter(lambda line: 'Google' in line)

In [62]:
# Number of lines that mention 'Google'.
new_text.count()

9

## saveAsTextFile

In [64]:
import shutil

# saveAsTextFile refuses to write into an existing directory, so re-running this
# cell would fail once the output exists. Remove any previous output first;
# ignore_errors=True makes this a no-op when the directory is not there yet.
# NOTE(review): assumes the path resolves to the local filesystem (the master
# shown at the top of the notebook is local[*]) - confirm for other setups.
shutil.rmtree("../outputs/news", ignore_errors=True)
new_text.saveAsTextFile("../outputs/news")

In [66]:
# Read the saved output back and check that the line count matches the filtered RDD.
sc.textFile("../outputs/news").count()

9