# Initialization

In [1]:
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

In [2]:
// Application configuration; the app name is what shows up in the Spark UI.
val conf = new SparkConf().setAppName("scalaSparkApp")
// getOrCreate returns the SparkContext already running in this JVM when there is one
// (for example when this cell is executed a second time) and only creates a new
// context from `conf` otherwise. Calling the constructor directly would fail in
// that situation because a second context cannot be started.
// NOTE: when an existing context is reused, the settings in `conf` are not applied to it.
val sc = SparkContext.getOrCreate(conf)

conf = org.apache.spark.SparkConf@4cd79233
sc = org.apache.spark.SparkContext@5fe4a554


org.apache.spark.SparkContext@5fe4a554

# Use Case I : Word Count
We count how often each word occurs in a text file stored on the Hadoop file system (HDFS).

In [29]:
// Read the book from HDFS and break every line on single spaces.
// Blank lines and consecutive spaces yield empty strings, which are filtered out.
val wordsAll = sc.textFile("hdfs://node-master:9000/user/root/alice_in_wonderland.txt")
  .flatMap(_.split(" "))
val words = wordsAll.filter(_.nonEmpty)

// Pair every token with 1, then sum the ones per distinct token.
val wordsCount = words.map(token => (token, 1))
  .reduceByKey(_ + _)

wordsAll = MapPartitionsRDD[41] at flatMap at <console>:32
words = MapPartitionsRDD[42] at filter at <console>:33
wordsCount = ShuffledRDD[44] at reduceByKey at <console>:36


ShuffledRDD[44] at reduceByKey at <console>:36

In [30]:
// Bring every (word, count) pair back to the driver and print one pair per line.
for (entry <- wordsCount.collect()) println(entry)

(a--',1)
(MINE.',1)
(someone,1)
(housemaid,',1)
(somebody,,1)
(bone,1)
(roses.,1)
(line:,1)
(mouse!'),1)
(order,2)
(tone.,9)
(instead!',1)
(passage,,2)
(said;,2)
(behind,12)
(it--once,1)
(pigeon,1)
(wasn't,11)
(been,36)
(Dodo,9)
(instantly,,2)
(`Alice!',1)
(Lory,3)
(`crumbs,1)
(squeeze,1)
(appealed,1)
(she's,3)
(crying,2)
(begun.',1)
(SWIM--",1)
(knows,2)
(Caucus-Race,1)
(Queen,,17)
(PLEASE,1)
(words.',1)
(English);,1)
(est,1)
(dive,1)
(`Talking,1)
(HEARTHRUG,,1)
(are,35)
(one,,6)
(slates,1)
(interesting.,1)
(shower,2)
(yourself.',2)
(IS,8)
(shut,4)
(newspapers,,1)
(pretend,1)
(we're,1)
(Why,,7)
(grant,1)
(Kings,1)
(tea,,1)
(live.,1)
(people!,1)
(ask.',1)
(yet,,3)
(Oh!,1)
(throne,1)
(dead,4)
(Exactly,1)
(directly,,1)
(what's,2)
(last:,1)
(that,',9)
(despair,1)
(Normans--",1)
(mad,,1)
(execution.,1)
(them,49)
(moment's,2)
(you,',6)
(chimneys,1)
(Rome--no,,1)
(drew,5)
(sour--and,1)
(Owl,2)
(lived,3)
(`It's--it's,1)
(air.,5)
(blew,2)
(sharp,6)
(belongs,2)
(like,',3)
(on.',2)
(nursing,3)
(

In [32]:
// Flip each (word, count) pair to (count, word) so the count becomes the sort key,
// then order the pairs from the highest count to the lowest.
val mostCommonWords = wordsCount.map(_.swap)
  .sortByKey(ascending = false)
// The ten most frequent words.
mostCommonWords.take(10)

mostCommonWords = ShuffledRDD[52] at sortByKey at <console>:29


Array((1507,the), (714,and), (703,to), (606,a), (490,of), (484,she), (416,said), (346,it), (345,in), (328,was))

# Use Case II : Classification of the Iris Dataset

In [14]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.SQLContext

In [7]:
// SQL entry point used by the following cells to read the CSV file.
// The SQLContext constructor is deprecated since Spark 2.0, so the context is taken
// from the SparkSession instead. getOrCreate reuses the session (and the underlying
// SparkContext) that already exists in this JVM rather than starting a new one.
val sqlContext = org.apache.spark.sql.SparkSession.builder().getOrCreate().sqlContext

sqlContext = org.apache.spark.sql.SQLContext@3df29710




org.apache.spark.sql.SQLContext@3df29710

In [19]:
// Load the Iris CSV from HDFS, using the first line as column names.
// No schema is supplied or inferred, so every column is read as a string.
val dfBase = sqlContext.read
  .option("header", "true")
  .csv("hdfs://node-master:9000/user/root/iris.csv")

dfBase = [sepal_length: string, sepal_width: string ... 3 more fields]


[sepal_length: string, sepal_width: string ... 3 more fields]

In [20]:
// Preview the first five rows of the raw data.
dfBase.show(numRows = 5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|         .2| Setosa|
|         4.9|          3|         1.4|         .2| Setosa|
|         4.7|        3.2|         1.3|         .2| Setosa|
|         4.6|        3.1|         1.5|         .2| Setosa|
|           5|        3.6|         1.4|         .2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [21]:
// Print the column names and types of the raw data.
dfBase.printSchema()

root
 |-- sepal_length: string (nullable = true)
 |-- sepal_width: string (nullable = true)
 |-- petal_length: string (nullable = true)
 |-- petal_width: string (nullable = true)
 |-- variety: string (nullable = true)



In [23]:
// Convert the four measurement columns from string to float.
// One select replaces the previous add-temp-column / drop / rename sequence that
// was repeated for each column (twelve chained projections in total).
// Columns that are not converted come first, followed by the converted ones in the
// order listed below, which is the same column order the chained version produced.
val df = {
  val floatColumns = Seq("sepal_length", "petal_length", "sepal_width", "petal_width")
  val untouched = dfBase.columns
    .filterNot(name => floatColumns.contains(name))
    .map(name => col(name))
    .toSeq
  // Cast each measurement and keep its original column name.
  val converted = floatColumns.map(name => col(name).cast(FloatType).as(name))
  dfBase.select((untouched ++ converted): _*)
}

df = [variety: string, sepal_length: float ... 3 more fields]


[variety: string, sepal_length: float ... 3 more fields]

In [27]:
// Fit a string indexer on the full data set: it maps each "variety" string to a
// numeric index stored in "variety_label".
val indexer = new StringIndexer()
  .setInputCol("variety")
  .setOutputCol("variety_label")
  .fit(df)

indexer = strIdx_668af7ee5b3c


strIdx_668af7ee5b3c

In [33]:
// Combine the four float measurement columns into a single vector column "features".
val assembler = {
  val featureColumns = Array("sepal_length", "petal_length", "sepal_width", "petal_width")
  new VectorAssembler()
    .setInputCols(featureColumns)
    .setOutputCol("features")
}

assembler = vecAssembler_25e0b94469b9


vecAssembler_25e0b94469b9

In [34]:
// Decision tree classifier that reads the "features" vector and learns "variety_label".
val dt = new DecisionTreeClassifier()
  .setFeaturesCol("features")
  .setLabelCol("variety_label")

dt = dtc_43682c489ae5


dtc_43682c489ae5

In [40]:
// Split the rows into roughly 70% training and 30% test data.
// A fixed seed is passed so that re-running the notebook uses the same random
// sampling; without it each run draws a different split and the accuracy reported
// further down cannot be reproduced.
val Array(trainingData, testData) = df.randomSplit(Array(0.7, 0.3), seed = 42L)

trainingData = [variety: string, sepal_length: float ... 3 more fields]
testData = [variety: string, sepal_length: float ... 3 more fields]


[variety: string, sepal_length: float ... 3 more fields]

In [42]:
// Chain the stages in execution order: label indexing, feature assembly, classification.
val pipeline = new Pipeline()
  .setStages(Array(indexer, assembler, dt))

pipeline = pipeline_2f39957e7f40


pipeline_2f39957e7f40

In [43]:
// Fit the whole pipeline on the training split only.
val model = pipeline.fit(trainingData)

model = pipeline_2f39957e7f40


pipeline_2f39957e7f40

In [44]:
// Run the fitted pipeline on the held-out test split.
val predictions = model.transform(testData)

predictions = [variety: string, sepal_length: float ... 8 more fields]


[variety: string, sepal_length: float ... 8 more fields]

In [45]:
// Compare predicted and actual label indices for the first five test rows.
predictions
  .select(col("prediction"), col("variety_label"))
  .show(numRows = 5)

+----------+-------------+
|prediction|variety_label|
+----------+-------------+
|       2.0|          2.0|
|       2.0|          2.0|
|       2.0|          2.0|
|       2.0|          2.0|
|       2.0|          2.0|
+----------+-------------+
only showing top 5 rows



In [46]:
// Evaluator that scores the "prediction" column against "variety_label" using the
// accuracy metric.
val evaluator = new MulticlassClassificationEvaluator()
  .setMetricName("accuracy")
  .setPredictionCol("prediction")
  .setLabelCol("variety_label")

evaluator = mcEval_dd8b23268398


mcEval_dd8b23268398

In [50]:
// Accuracy of the model on the test-split predictions.
val accuracy = evaluator.evaluate(predictions)

accuracy = 0.9361702127659575


0.9361702127659575

In [62]:
// Report accuracy and its complement (the error rate) as percentages with two decimals.
{
  val accuracyPercent = accuracy * 100
  val errorPercent = (1 - accuracy) * 100
  println(f"Test Accuracy = $accuracyPercent%.2f%%")
  println(f"Test Error    = $errorPercent%.2f%%")
}

Test Accuracy = 93.62%
Test Error    = 6.38%
