## 1.基礎轉換操作

In [1]:
val data = sc.textFile("file:///usr/local/spark/README.md")

### map / flatMap / distinct

In [2]:
//map函數會對每一條輸入進行指定的操作，然後為每一條輸入返回一個物件
data.map(x => x.split("\\s+")).collect

Array(Array(#, Apache, Spark), Array(""), Array(Spark, is, a, fast, and, general, cluster, computing, system, for, Big, Data., It, provides), Array(high-level, APIs, in, Scala,, Java,, Python,, and, R,, and, an, optimized, engine, that), Array(supports, general, computation, graphs, for, data, analysis., It, also, supports, a), Array(rich, set, of, higher-level, tools, including, Spark, SQL, for, SQL, and, DataFrames,), Array(MLlib, for, machine, learning,, GraphX, for, graph, processing,), Array(and, Spark, Streaming, for, stream, processing.), Array(""), Array(<http://spark.apache.org/>), Array(""), Array(""), Array(##, Online, Documentation), Array(""), Array(You, can, find, the, latest, Spark, documentation,, including, a, programming), Array(guide,, on,...

In [3]:
//flatMap函數則是兩個操作的集合,正是“先映射後扁平化”：
//操作1：同map函數一樣：對每一條輸入進行指定的操作，然後為每一條輸入返回一個物件
//操作2：最後將所有物件合併為一個物件
data.flatMap(x => x.split("\\s+")).collect

Array(#, Apache, Spark, "", Spark, is, a, fast, and, general, cluster, computing, system, for, Big, Data., It, provides, high-level, APIs, in, Scala,, Java,, Python,, and, R,, and, an, optimized, engine, that, supports, general, computation, graphs, for, data, analysis., It, also, supports, a, rich, set, of, higher-level, tools, including, Spark, SQL, for, SQL, and, DataFrames,, MLlib, for, machine, learning,, GraphX, for, graph, processing,, and, Spark, Streaming, for, stream, processing., "", <http://spark.apache.org/>, "", "", ##, Online, Documentation, "", You, can, find, the, latest, Spark, documentation,, including, a, programming, guide,, on, the, [project, web, page](http://spark.apache.org/documentation.html)., This, README, file, only, contains, basic, se...

In [4]:
//distinct去除重複
data.flatMap(x => x.split("\\s+")).distinct.collect

Array(package, For, Programs, processing., Because, The, page](http://spark.apache.org/documentation.html)., cluster., its, [run, than, APIs, have, Try, computation, through, several, This, graph, Hive, storage, ["Specifying, To, "yarn", Once, prefer, SparkPi, engine, version, file, documentation,, processing,, the, are, systems., params, not, different, refer, Interactive, R,, given., if, build, when, be, Tests, Apache, thread, programs,, including, ./bin/run-example, Spark., package., 1000).count(), Versions, HDFS, Data., >>>, page)., Maven, programming, Testing, module,, Streaming, environment, run:, Developer, clean, 1000:, rich, GraphX, Please, is, guide](http://spark.apache.org/contributing.html), run, URL,, threads., same, MASTER=spark://host:7077, on, buil...

In [5]:
//partitions.size 查看分區數量
data.partitions.size

1

### coalesce / repartition

In [6]:
//讀取資料時可指定分區數量
val data2 = sc.textFile("file:///usr/local/spark/README.md",4)

In [7]:
data2.partitions.size

4

In [8]:
//coalesce對RDD重新分區
//須指定小於原分區數量，否則分區數量不變，若要大於則指定shuffle參數為true
val rdd1 = data2.coalesce(2)

In [9]:
rdd1.partitions.size

2

### randomSplit / glom

In [10]:
var rdd = sc.makeRDD(1 to 10, 10)

In [11]:
//randomSplit根據權重將原RDD拆分多個RDD
var splitRDD = rdd.randomSplit(Array(1.0,2.0,3.0,4.0))

In [12]:
//randomSplit的結果是個數組
splitRDD.size

4

In [13]:
splitRDD(0).collect

Array(3, 9)

In [14]:
splitRDD(1).collect

Array(1, 7, 10)

In [15]:
splitRDD(3).collect

Array(2, 8)

In [16]:
splitRDD.foreach(println)

MapPartitionsRDD[12] at randomSplit at <console>:22
MapPartitionsRDD[13] at randomSplit at <console>:22
MapPartitionsRDD[14] at randomSplit at <console>:22
MapPartitionsRDD[15] at randomSplit at <console>:22


In [17]:
splitRDD.map(x => x.collect)

Array(Array(3, 9), Array(1, 7, 10), Array(4, 5, 6), Array(2, 8))

In [18]:
var rdd = sc.makeRDD(1 to 10,3)

In [19]:
//glom將RDD中每一個分區的所有T類型元素合併成一個數組Array[T]，結果為RDD[Array[T]]
rdd.glom().collect

Array(Array(1, 2, 3), Array(4, 5, 6), Array(7, 8, 9, 10))

In [20]:
rdd.glom()

MapPartitionsRDD[18] at glom at <console>:22

### union / intersection / subtract

In [21]:
var rdd1 = sc.makeRDD(1 to 3)

In [22]:
var rdd2 = sc.makeRDD(3 to 5)

In [23]:
//union將兩個RDD合併為一個RDD
rdd1.union(rdd2).collect

Array(1, 2, 3, 3, 4, 5)

In [24]:
//intersection取兩個RDD的交集並去除重複
rdd1.intersection(rdd2).collect

Array(3)

In [25]:
//subtract取兩個RDD的差集(在rdd1中但不在rdd2中的元素)，結果不去除重複
rdd1.subtract(rdd2).collect

Array(1, 2)

### mapPartitions / mapPartitionsWithIndex

In [26]:
var rdd1 = sc.makeRDD(1 to 5,2)

In [27]:
// mapPartitionsWithIndex calls the function once per partition, passing the
// partition's index together with an iterator over that partition's elements.
// It does not repartition rdd1: here each partition is collapsed into a single
// "<partitionIndex>|<sum of its elements>" string, so the resulting RDD holds
// exactly one element per partition.
var rdd2 = rdd1.mapPartitionsWithIndex { (partitionIndex, elements) =>
    // Summing drains the partition's iterator; the one-element iterator
    // returned here is that partition's entire output.
    Iterator(s"${partitionIndex}|${elements.sum}")
}

In [28]:
rdd2.collect

Array(0|3, 1|12)

In [29]:
// mapPartitions calls the function once per partition, passing an iterator
// over that partition's elements (no partition index is supplied).
// It does not repartition rdd1: here each partition is reduced to the sum of
// its elements, so the resulting RDD holds exactly one Int per partition.
var rdd3 = rdd1.mapPartitions { elements =>
    // Summing drains the partition's iterator; the one-element iterator
    // returned here is that partition's entire output.
    Iterator(elements.sum)
}

In [30]:
rdd3.collect

Array(3, 12)

### zip / zipPartitions / zipWithIndex / zipWithUniqueId

In [31]:
var rdd1 = sc.makeRDD(1 to 5,2)

In [32]:
var rdd2 = sc.makeRDD(Seq("A","B","C","D","E"),2)

In [33]:
//zip將兩個RDD組合成key/value型式的RDD，這裡默認partition和元素數量都相同否則拋出異常
rdd1.zip(rdd2).collect

Array((1,A), (2,B), (3,C), (4,D), (5,E))

In [34]:
var rdd3 = sc.makeRDD(Seq("A","B","C","D","E"),3)

In [35]:
//分區數量不同,出現錯誤
rdd1.zip(rdd3).collect

Name: java.lang.IllegalArgumentException
Message: Can't zip RDDs with unequal numbers of partitions: List(2, 3)
StackTrace:   at org.apache.spark.rdd.ZippedPartitionsBaseRDD.getPartitions(ZippedPartitionsRDD.scala:57)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
  at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:935)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
  at org.apache.spark.rdd.RDD.collect(RDD.scala:934)

In [36]:
var rdd4 = sc.makeRDD(Seq("A","B","C","D","E","F"),2)

In [37]:
//元素數量不同,出現錯誤
rdd1.zip(rdd4).collect

Name: org.apache.spark.SparkException
Message: Job aborted due to stage failure: Task 0 in stage 22.0 failed 1 times, most recent failure: Lost task 0.0 in stage 22.0 (TID 91, localhost, executor driver): org.apache.spark.SparkException: Can only zip RDDs with same number of elements in each partition
	at org.apache.spark.rdd.RDD$$anonfun$zip$1$$anonfun$apply$27$$anon$2.hasNext(RDD.scala:859)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.rdd.RDD$$anonfun$zip$1$$anonfun$apply$27$$anon$2.foreach(RDD.scala:855)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.rdd.RDD$$anonfun$zip$1$$anonfun$apply$27$$anon$2.to(RDD.scala:855)
	at scala.collection.TraversableOnce$class.toBuffer(Tr

In [38]:
var rdd1 = sc.makeRDD(1 to 8,2)

In [39]:
var rdd2 = sc.makeRDD(Seq("A","B","C","D","E"),2)

In [40]:
var rdd3 = sc.makeRDD(Seq("a","b","c","d","e"),2)

In [41]:
// zipPartitions combines several RDDs partition by partition into a new RDD.
// The RDDs being combined must have the same number of partitions, but there
// is no requirement on how many elements each partition contains.
// Here the i-th elements of the three aligned partitions are joined as
// "<rdd1 element>_<rdd2 element>_<rdd3 element>".
var rdd4 = rdd1.zipPartitions(rdd2, rdd3) { (numbers, uppers, lowers) =>
    // Zipping stops as soon as any of the three iterators is exhausted, so
    // surplus elements in the longer partitions are dropped.
    val joined = numbers.zip(uppers).zip(lowers).map {
        case ((number, upper), lower) => s"${number}_${upper}_${lower}"
    }
    // Emit each partition's results last-to-first (the same order produced by
    // prepending every new string onto a list).
    joined.toList.reverse.iterator
}

In [42]:
rdd4.collect

Array(2_B_b, 1_A_a, 7_E_e, 6_D_d, 5_C_c)

In [43]:
var rdd1 = sc.makeRDD(Seq("A","B","C","D","E","F"),2)

In [44]:
var rdd2 = sc.makeRDD(Seq("A","B","R","D","F"),2)

In [45]:
//zipWithIndex 將RDD中的元素和這個元素在RDD中的ID(位置)組合成key/value的型態
rdd2.zipWithIndex().collect

Array((A,0), (B,1), (R,2), (D,3), (F,4))

In [46]:
//zipWithUniqueId 將RDD中的元素和一個唯一ID組合成key/value的型態；第k個分區的元素依序得到ID k, n+k, 2n+k, ...(n為分區數量)
rdd1.zipWithUniqueId().collect

Array((A,0), (B,2), (C,4), (D,1), (E,3), (F,5))