In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{VectorAssembler,StringIndexer,IndexToString}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

## Load the dataset

In [2]:
// Read the Iris CSV into a DataFrame: the first line supplies the column
// names and column types are inferred from the data.
val df = spark.read.
  options(Map(
    "header" -> "true",
    "inferschema" -> "true",
    "delimiter" -> ",")).
  csv("../Datasets/Iris.csv")

df = [sepal_length: double, sepal_width: double ... 3 more fields]


[sepal_length: double, sepal_width: double ... 3 more fields]

## Explore the dataset

In [3]:
// Preview the first 20 rows of the raw data.
df.show(20)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
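|         4.3|        3.0|         1.1|        0.1| setosa|
|         5.8|        4.0|         1.2|        0.2| setosa|
|         5.7|        4.4|         1.5|        0.4| setosa|
|         5.4|        3.9|         1.3|        0.4| setosa|
|         5.1|        3.5|         1.4|        0.3| setosa|
|         5.7|        3.8|         1.7|        0.3| setosa|
|         5.1|        3.8|         1.5|        0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows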

In [4]:
// Print column names, types and nullability.
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [5]:
// Print summary statistics (count, mean, stddev, min, max) per column.
df.describe().show(20)

+-------+------------------+-------------------+------------------+------------------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|
+-------+------------------+-------------------+------------------+------------------+
|  count|               150|                150|               150|               150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|
|    min|               4.3|                2.0|               1.0|               0.1|
|    max|               7.9|                4.4|               6.9|               2.5|
+-------+------------------+-------------------+------------------+------------------+



In [6]:
// List the distinct class labels found in the species column.
df.select(col("species")).distinct().show()

+----------+
|   species|
+----------+
| virginica|
|versicolor|
|    setosa|
+----------+



In [7]:
// Count the rows per species to check how balanced the classes are.
df.groupBy(col("species")).count().show()

+----------+-----+
|   species|count|
+----------+-----+
| virginica|   50|
|versicolor|   50|
|    setosa|   50|
+----------+-----+



## Apply Vector Assembler and String Indexer

In [8]:
// Every column except the last one (the species label) is used as a feature.
val features = df.columns.dropRight(1)

features = Array(sepal_length, sepal_width, petal_length, petal_width)


[sepal_length, sepal_width, petal_length, petal_width]

In [9]:
// Pack the individual feature columns into one vector column, "features".
val assembler = new VectorAssembler().
  setOutputCol("features").
  setInputCols(features)

assembler = vecAssembler_8780061c194d


vecAssembler_8780061c194d

In [10]:
// Encode the string species column as a numeric "label" column.
val indexer = new StringIndexer().
  setOutputCol("label").
  setInputCol("species")

indexer = strIdx_ee3dfa9e82b6


strIdx_ee3dfa9e82b6

In [11]:
// Chain the two preprocessing stages: vector assembly, then label indexing.
val pipeline = new Pipeline().setStages(Array(assembler, indexer))

pipeline = pipeline_daca55f77a99


pipeline_daca55f77a99

In [12]:
// Fit the preprocessing pipeline on the full dataset, apply it, and keep only
// the two columns the classifier consumes.
val df_v = pipeline.
  fit(df).
  transform(df).
  select(col("features"), col("label"))
df_v.show(20)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  2.0|
|[4.9,3.0,1.4,0.2]|  2.0|
|[4.7,3.2,1.3,0.2]|  2.0|
|[4.6,3.1,1.5,0.2]|  2.0|
|[5.0,3.6,1.4,0.2]|  2.0|
|[5.4,3.9,1.7,0.4]|  2.0|
|[4.6,3.4,1.4,0.3]|  2.0|
|[5.0,3.4,1.5,0.2]|  2.0|
|[4.4,2.9,1.4,0.2]|  2.0|
|[4.9,3.1,1.5,0.1]|  2.0|
|[5.4,3.7,1.5,0.2]|  2.0|
|[4.8,3.4,1.6,0.2]|  2.0|
|[4.8,3.0,1.4,0.1]|  2.0|
|[4.3,3.0,1.1,0.1]|  2.0|
|[5.8,4.0,1.2,0.2]|  2.0|
|[5.7,4.4,1.5,0.4]|  2.0|
|[5.4,3.9,1.3,0.4]|  2.0|
|[5.1,3.5,1.4,0.3]|  2.0|
|[5.7,3.8,1.7,0.3]|  2.0|
|[5.1,3.8,1.5,0.3]|  2.0|
+-----------------+-----+
only showing top 20 rows



df_v = [features: vector, label: double]


[features: vector, label: double]

## Split into train and test sets

In [13]:
// Split roughly 70% / 30% into training and test sets. The weights are sampling
// probabilities, so the actual row counts are approximate. A fixed seed makes
// the split repeatable across runs on the same input data; without it the
// model and its evaluation metrics change on every execution.
val Array(trainingData, testData) = df_v.randomSplit(Array(0.7, 0.3), seed = 42L)

trainingData = [features: vector, label: double]
testData = [features: vector, label: double]


[features: vector, label: double]

## Train the model

In [14]:
// Configure a random forest with 10 trees and train it on the training split.
// No column names are set, so the estimator's default feature/label columns
// are used.
val rf = new RandomForestClassifier().
  setNumTrees(10)
val model = rf.fit(trainingData)

rf = rfc_499c39faf925
model = RandomForestClassificationModel (uid=rfc_237155a014dd) with 10 trees


RandomForestClassificationModel (uid=rfc_237155a014dd) with 10 trees

## Evaluate the model

In [15]:
// Score the held-out test rows; the model appends its output columns to the
// input DataFrame. Show a sample of 10 rows.
val predictions = model.
  transform(testData)
predictions.show(10)

+-----------------+-----+--------------+-------------+----------+
|         features|label| rawPrediction|  probability|prediction|
+-----------------+-----+--------------+-------------+----------+
|[4.6,3.2,1.4,0.2]|  2.0|[0.0,0.0,10.0]|[0.0,0.0,1.0]|       2.0|
|[4.6,3.4,1.4,0.3]|  2.0|[0.0,0.0,10.0]|[0.0,0.0,1.0]|       2.0|
|[4.7,3.2,1.3,0.2]|  2.0|[0.0,0.0,10.0]|[0.0,0.0,1.0]|       2.0|
|[4.8,3.1,1.6,0.2]|  2.0|[0.0,0.0,10.0]|[0.0,0.0,1.0]|       2.0|
|[4.9,3.0,1.4,0.2]|  2.0|[0.0,0.0,10.0]|[0.0,0.0,1.0]|       2.0|
|[5.0,2.0,3.5,1.0]|  0.0| [8.0,2.0,0.0]|[0.8,0.2,0.0]|       0.0|
|[5.0,3.5,1.3,0.3]|  2.0|[0.0,0.0,10.0]|[0.0,0.0,1.0]|       2.0|
|[5.0,3.5,1.6,0.6]|  2.0| [6.0,0.0,4.0]|[0.6,0.0,0.4]|       0.0|
|[5.1,3.4,1.5,0.2]|  2.0|[0.0,0.0,10.0]|[0.0,0.0,1.0]|       2.0|
|[5.1,3.7,1.5,0.4]|  2.0|[0.0,0.0,10.0]|[0.0,0.0,1.0]|       2.0|
+-----------------+-----+--------------+-------------+----------+
only showing top 10 rows



predictions = [features: vector, label: double ... 3 more fields]


[features: vector, label: double ... 3 more fields]

In [16]:
// Evaluator configured to report plain classification accuracy.
val evaluator = new MulticlassClassificationEvaluator().
  setMetricName("accuracy")

evaluator = mcEval_df9538a69549


mcEval_df9538a69549

In [17]:
// Accuracy of the model on the test-set predictions; the bare expression on
// the last line echoes the value as the cell result.
val accuracy: Double = evaluator.evaluate(predictions)
accuracy

accuracy = 0.9302325581395349


0.9302325581395349

## Feature importance

In [18]:
// Pair each input column name with its importance score from the trained
// forest. Zipping with `features` (the array given to the VectorAssembler)
// keeps names aligned with the vector slots, avoids hard-coding the number of
// features, and reads the importance vector once rather than per element.
val fi = features.
  zip(model.featureImportances.toArray).
  toList.
  toDF("feature", "importance")
// Most important feature first.
fi.sort(col("importance").desc).show()

+------------+--------------------+
|     feature|          importance|
+------------+--------------------+
| petal_width|  0.6084215661151628|
|petal_length| 0.36578068558163007|
| sepal_width|0.014329986823818152|
|sepal_length| 0.01146776147938885|
+------------+--------------------+



fi = [feature: string, importance: double]


[feature: string, importance: double]

## Select features

In [19]:
// Rebuild the assembler using only the two petal measurements as inputs.
val assembler = new VectorAssembler().
  setOutputCol("features").
  setInputCols(Array("petal_width", "petal_length"))

assembler = vecAssembler_34e748d5a929


vecAssembler_34e748d5a929

In [20]:
// Assemble the reduced feature vector and keep it alongside the raw species
// name (the label is not indexed at this step).
val df_vs = assembler.
  transform(df).
  select("features", "species")
df_vs.show(10)

+---------+-------+
| features|species|
+---------+-------+
|[0.2,1.4]| setosa|
|[0.2,1.4]| setosa|
|[0.2,1.3]| setosa|
|[0.2,1.5]| setosa|
|[0.2,1.4]| setosa|
|[0.4,1.7]| setosa|
|[0.3,1.4]| setosa|
|[0.2,1.5]| setosa|
|[0.2,1.4]| setosa|
|[0.1,1.5]| setosa|
+---------+-------+
only showing top 10 rows



df_vs = [features: vector, species: string]


[features: vector, species: string]