## RDD Basics

In [38]:
// in Scala: spark.range(500) is a Dataset of Longs; `.rdd` exposes it as an
// RDD[Long] (see the result type printed below).
spark.range(500).rdd

res33: org.apache.spark.rdd.RDD[Long] = MapPartitionsRDD[62] at rdd at <console>:28


In [39]:
// in Scala: going through toDF() first means `.rdd` hands back Row objects,
// so column 0 of each Row is read as a Long to end up with an RDD[Long].
spark.range(10).toDF().rdd.map(_.getLong(0))


res34: org.apache.spark.rdd.RDD[Long] = MapPartitionsRDD[69] at map at <console>:28


In [40]:
// in Scala: the reverse direction — an RDD[Long] turned back into a DataFrame
// with a single `value` column (see the schema printed below).
// NOTE(review): toDF() on an RDD presumably relies on `spark.implicits._` being
// in scope, as it is by default in spark-shell — confirm for other environments.
spark.range(10).rdd.toDF()


res35: org.apache.spark.sql.DataFrame = [value: bigint]


In [42]:
// in Scala: split the sentence on single spaces (six tokens), then distribute
// the resulting array over two partitions as an RDD[String].
val myCollection = "Spark: Big Data Processing Made Simple".split(" ")
val words = spark.sparkContext.parallelize(myCollection, numSlices = 2)


myCollection: Array[String] = Array(Spark:, Big, Data, Processing, Made, Simple)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[77] at parallelize at <console>:30


In [44]:
// in Scala: attach a human-readable name to the RDD, then read it back.
// The name appears in later REPL output for this RDD (e.g. the cache() result).
words.setName("myWords")
words.name // myWords

res37: String = myWords


In [48]:
//spark.sparkContext.textFile("/some/path/withTextFiles")

In [49]:
//spark.sparkContext.wholeTextFiles("/some/path/withTextFiles")

In [47]:
// Number of unique words; all six words of the sentence are distinct.
words.distinct().count()

res40: Long = 6


In [50]:
// in Scala: predicate that holds for strings beginning with a capital "S".
def startsWithS(individual: String): Boolean = individual.startsWith("S")

startsWithS: (individual: String)Boolean


In [52]:
// in Scala: keep only the words accepted by startsWithS and bring them back
// to the driver as an array.
words.filter(startsWithS).collect()

res44: Array[String] = Array(Spark:, Simple)


In [53]:
// in Scala: for every word build a (word, first character, starts-with-"S") tuple.
val words2 = words.map(w => (w, w.charAt(0), w.startsWith("S")))

words2: org.apache.spark.rdd.RDD[(String, Char, Boolean)] = MapPartitionsRDD[87] at map at <console>:28


In [54]:
// in Scala: keep the tuples whose third field (the starts-with-"S" flag) is
// true and take up to five of them.
words2.filter(_._3).take(5)

res45: Array[(String, Char, Boolean)] = Array((Spark:,S,true), (Simple,S,true))


In [55]:
// in Scala: expand every word into its individual characters and take the
// first five of the flattened result.
words.flatMap(_.toSeq).take(5)

res46: Array[Char] = Array(S, p, a, r, k)


In [56]:
// in Scala: sort on the negated length so the longest words come first, then
// take the top two.
words.sortBy(w => -w.length).take(2)

res47: Array[String] = Array(Processing, Spark:)


In [57]:
// in Scala: split the RDD into two RDDs using equal weights.
// NOTE(review): no seed is passed, so which words land in which half presumably
// varies between runs — confirm if reproducibility matters.
val fiftyFiftySplit = words.randomSplit(Array(0.5, 0.5))

fiftyFiftySplit: Array[org.apache.spark.rdd.RDD[String]] = Array(MapPartitionsRDD[95] at randomSplit at <console>:28, MapPartitionsRDD[96] at randomSplit at <console>:28)


In [58]:
// in Scala: add up the integers 1 through 20 with a pairwise reduction.
spark.sparkContext.parallelize(1 to 20).reduce((a, b) => a + b) // 210

res48: Int = 210


In [59]:
// in Scala
// Picks the longer of two words; intended as the combining function for reduce.
// leftWord is chosen only when it is strictly longer, so ties go to rightWord.
//   leftWord  - first candidate
//   rightWord - second candidate
//   returns   - the strictly longer word, or rightWord when the lengths are equal
def wordLengthReducer(leftWord:String, rightWord:String): String = {
  // The if/else expression is the method's value; no explicit `return` is needed.
  if (leftWord.length > rightWord.length) leftWord else rightWord
}

// Reduce the whole RDD to its longest word ("Processing", per the result below).
words.reduce(wordLengthReducer)

wordLengthReducer: (leftWord: String, rightWord: String)String
res49: String = Processing


In [60]:
// Exact number of elements in the RDD (6 for this sentence, per the result below).
words.count()


res50: Long = 6


In [61]:
// Approximate count: the call is given a timeout in milliseconds and a
// confidence level, and returns a PartialResult holding a [low, high] bound
// rather than a single number (see the printed type below).
val confidence = 0.95
val timeoutMilliseconds = 400
words.countApprox(timeoutMilliseconds, confidence)


confidence: Double = 0.95
timeoutMilliseconds: Int = 400
res51: org.apache.spark.partial.PartialResult[org.apache.spark.partial.BoundedDouble] = (final: [6.000, 6.000])


In [62]:
// Approximate number of distinct elements; 0.05 is the accuracy argument
// (`relativeSD` in the Spark API — smaller values need more space).
words.countApproxDistinct(0.05)


res52: Long = 6


In [63]:
// Same estimate through the two-integer overload.
// NOTE(review): per the Spark API these look like the HyperLogLog precision `p`
// and the sparse-set precision `sp` — confirm against RDD.countApproxDistinct.
words.countApproxDistinct(4, 10)


res53: Long = 6


In [64]:
// Counts how often each distinct value occurs and returns the counts as a
// scala.collection.Map rather than an RDD (see the result type below).
words.countByValue()


res54: scala.collection.Map[String,Long] = Map(Simple -> 1, Processing -> 1, Spark: -> 1, Made -> 1, Big -> 1, Data -> 1)


In [65]:
// Approximate variant of countByValue; each count comes back as a bounded
// interval inside a PartialResult (see the result below).
// NOTE(review): 1000 and 0.95 look like the timeout (ms) and confidence,
// mirroring the countApprox call earlier in this notebook — confirm against the API.
words.countByValueApprox(1000, 0.95)


res55: org.apache.spark.partial.PartialResult[scala.collection.Map[String,org.apache.spark.partial.BoundedDouble]] = (final: Map(Simple -> [1.000, 1.000], Processing -> [1.000, 1.000], Spark: -> [1.000, 1.000], Made -> [1.000, 1.000], Big -> [1.000, 1.000], Data -> [1.000, 1.000]))


In [66]:
// First element of the RDD ("Spark:" here, per the result below).
words.first()


res56: String = Spark:


In [67]:
// Largest and smallest element of the range 1..20. Only the value of the last
// expression in the cell (min) is echoed below; the max() result is discarded.
spark.sparkContext.parallelize(1 to 20).max()
spark.sparkContext.parallelize(1 to 20).min()


res57: Int = 1


In [68]:
// Several ways to pull a few elements back. Only the final expression
// (takeSample) is echoed below; the results of the first three are discarded.
words.take(5)
words.takeOrdered(5)
words.top(5)
// Sample six elements with replacement (hence the repeated words in the
// output), using a fixed seed.
val withReplacement = true
val numberToTake = 6
val randomSeed = 100L
words.takeSample(withReplacement, numberToTake, randomSeed)


withReplacement: Boolean = true
numberToTake: Int = 6
randomSeed: Long = 100
res58: Array[String] = Array(Big, Simple, Made, Big, Made, Big)


In [73]:
// Write the RDD out as text under the given local-filesystem (`file:`) path.
// NOTE(review): re-running this cell presumably fails if the output path
// already exists — confirm, and clean up /tmp/SparkmyFile between runs if so.
words.saveAsTextFile("file:/tmp/SparkmyFile")

In [74]:
// in Scala: same text output, but compressed with the Hadoop BZip2 codec that
// is passed as the second argument.
import org.apache.hadoop.io.compress.BZip2Codec
words.saveAsTextFile("file:/tmp/SparkCompressed", classOf[BZip2Codec])

import org.apache.hadoop.io.compress.BZip2Codec


In [76]:
// Save the RDD with saveAsObjectFile.
// NOTE(review): unlike the saveAsTextFile calls in this notebook, this path has
// no `file:` scheme, so it presumably resolves against the default filesystem —
// confirm that is intended.
words.saveAsObjectFile("/tmp/sequenceFilePath")

In [77]:
// Mark the RDD for caching; the call returns the same RDD (`words.type` below).
words.cache()

res67: words.type = myWords ParallelCollectionRDD[77] at parallelize at <console>:30


In [78]:
// in Scala: inspect the storage level now attached to the RDD
// (memory, deserialized, 1 replica in the output below).
words.getStorageLevel

res68: org.apache.spark.storage.StorageLevel = StorageLevel(memory, deserialized, 1 replicas)


In [79]:
// Set the directory checkpoint data is written to, then mark `words` for
// checkpointing.
// NOTE(review): Spark's RDD.checkpoint documentation says it must be called
// before any job has been executed on the RDD; earlier cells have already run
// jobs on `words` — confirm this checkpoint actually takes effect.
spark.sparkContext.setCheckpointDir("/tmp/checkpointing")
words.checkpoint()

In [81]:
// Pipe the RDD through the external `wc -l` command. The output has one line
// count per partition: two partitions of three words each give Array(3, 3).
words.pipe("wc -l").collect()

res71: Array[String] = Array(3, 3)


In [82]:
// in Scala: emit a single 1 per partition and add them up, which yields the
// number of partitions (as a Double).
words.mapPartitions(_ => Iterator(1)).sum() // 2

res72: Double = 2.0


In [83]:
// in Scala
// Tags every element of a partition with the index of that partition.
//   partitionIndex     - index of the partition being processed
//   withinPartIterator - the elements of that partition
//   returns            - an iterator of "Partition: <index> => <value>" strings
def indexedFunc(partitionIndex:Int, withinPartIterator: Iterator[String]): Iterator[String] = {
  // Map the iterator lazily; copying the whole partition into a List first
  // would hold every element in memory for no benefit.
  withinPartIterator.map(value => s"Partition: $partitionIndex => $value")
}
// Run indexedFunc over every partition (it receives the partition's index and
// its elements) and collect the tagged strings.
words.mapPartitionsWithIndex(indexedFunc).collect()

indexedFunc: (partitionIndex: Int, withinPartIterator: Iterator[String])Iterator[String]
res73: Array[String] = Array(Partition: 0 => Spark:, Partition: 0 => Big, Partition: 0 => Data, Partition: 1 => Processing, Partition: 1 => Made, Partition: 1 => Simple)


In [84]:
// Writes the words of each partition to its own randomly named file under /tmp.
words.foreachPartition { iter =>
  import java.io._
  import scala.util.Random
  val randomFileName = new Random().nextInt()
  val pw = new PrintWriter(new File(s"/tmp/random-file-${randomFileName}.txt"))
  // try/finally guarantees the writer is closed (and its buffer flushed) even
  // if reading the partition or writing to the file fails part-way through.
  try {
    // Words are written back-to-back with no separator between them.
    iter.foreach(word => pw.write(word))
  } finally {
    pw.close()
  }
}


In [85]:
// in Scala: glom() turns each partition into an array, so collecting the
// result shows which elements ended up in which of the two partitions.
spark.sparkContext.parallelize(Seq("Hello", "World"), 2).glom().collect()
// Array(Array(Hello), Array(World))

res75: Array[Array[String]] = Array(Array(Hello), Array(World))
