# RDD Transform with Scala

## 2.1.2 SparkContext 생성 

In [4]:
import org.apache.spark.{SparkContext, SparkConf}

val conf = new SparkConf().setMaster("local[*]").setAppName("RDDCreateSample")
val sc = new SparkContext(conf)

Name: org.apache.spark.SparkException
Message: Only one SparkContext may be running in this JVM (see SPARK-2243). To ignore this error, set spark.driver.allowMultipleContexts = true. The currently running SparkContext was created at:
org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:901)
org.apache.toree.kernel.api.Kernel.createSparkContext(Kernel.scala:349)
org.apache.toree.kernel.api.Kernel.createSparkContext(Kernel.scala:368)
org.apache.toree.boot.layer.StandardComponentInitialization$class.initializeSparkContext(ComponentInitialization.scala:103)
org.apache.toree.Main$$anon$1.initializeSparkContext(Main.scala:35)
org.apache.toree.boot.layer.StandardComponentInitialization$class.initializeComponents(ComponentInitialization.scala:88)
org.apache.toree.Main$$anon$1.initializeComponents(Main.scala:35)
org.apache.toree.boot.KernelBootstrap.initialize(KernelBootstrap.scala:101)
org.apache.toree.Main$.delayedEndpoint$org$apache$toree$Main$1(Main.scala:40)
org.apache.

In [6]:
val rdd = sc.parallelize(List("a","b","c","d"))

In [8]:
rdd.collect

Array(a, b, c, d)

In [9]:
val rdd = sc.parallelize(1 to 10)
val result = rdd.collect
println(result.mkString(", "))

1, 2, 3, 4, 5, 6, 7, 8, 9, 10


#### 2.1.4.2 Count

In [18]:
val rdd = sc.parallelize(1 to 10)
val result = rdd.count
println(result)

10


### 2.1.5 Transformation

#### 2.1.5.1 Map

In [20]:
val rdd = sc.parallelize(1 to 10)
val result = rdd.map(_ + 1)
println(result.collect.mkString(", "))

2, 3, 4, 5, 6, 7, 8, 9, 10, 11


#### 2.1.5.2 flatMap

In [26]:
val fruits = List("apple, orange","grape, apple, mango", "blueberry,tomato, orange")
val rdd1 = sc.parallelize(fruits)
val rdd2 = rdd1.map(_.split(","))

In [28]:
println(rdd2.collect().map(_.mkString("{",",","}")).mkString("{",",","}"))

{{apple, orange},{grape, apple, mango},{blueberry,tomato, orange}}


In [29]:
val fruits = List("apple, orange","grape, apple, mango", "blueberry,tomato, orange")
val rdd1 = sc.parallelize(fruits)
val rdd2 = rdd1.flatMap(_.split(","))
print(rdd2.collect.mkString(", "))

apple,  orange, grape,  apple,  mango, blueberry, tomato,  orange

#### Apple 단어를 포함한 것들만 하고 싶다면 

In [38]:
val fruits = List("apple, orange","grape, apple, mango", "blueberry,tomato, orange")
val rdd1 = sc.parallelize(fruits)
val rdd2 = rdd1.flatMap(log => {
    if (log.contains("apple")){
 
    }else{
        None
    }

})

Name: Unknown Error
Message: <console>:25: error: type mismatch;
 found   : Unit
 required: TraversableOnce[?]
           if (log.contains("apple")){
                                     ^
StackTrace: 

#### 2.1.5.3 mapPartitions
 - map(), flatMap()의 경우 각 요소를 하나씩 처리 
 - mapPartitions는 Parition별로 처리 

In [39]:
val rdd1 = sc.parallelize(1 to 10, 3)
val rdd2 = rdd1.mapPartitions(numbers => {
  print("DB 연결")
  numbers.map{number => number + 1}
})

println(rdd2.collect.mkString(","))

2,3,4,5,6,7,8,9,10,11
