# Label Encoding

Label encoding maps each class label to an integer index. Because class labels are nominal rather than ordinal, the particular integer assigned to a label carries no meaning; it only has to be used consistently.

In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.StringIndexer

## Load the dataset

In [2]:
// Read the Iris CSV into a DataFrame. The first row supplies the column names
// and column types are inferred from the data rather than all read as strings.
val df = spark.read.
  format("csv").
  options(Map(
    "header" -> "true",
    "inferschema" -> "true",
    "delimiter" -> ",")).
  load("../Datasets/Iris.csv")

df = [sepal_length: double, sepal_width: double ... 3 more fields]


[sepal_length: double, sepal_width: double ... 3 more fields]

## Explore the dataset

In [3]:
// Preview the first rows of the dataset (20 is the row count show() uses by default).
df.show(numRows = 20)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [4]:
// Print column names and inferred types. printSchema is side-effecting and is
// declared with an empty parameter list, so it is called with explicit parentheses.
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



## Apply String Indexer

In [5]:
// Feature columns are every column except the last one ("species", the class label).
val features = df.columns.dropRight(1)

features = Array(sepal_length, sepal_width, petal_length, petal_width)


[sepal_length, sepal_width, petal_length, petal_width]

In [6]:
// Fit a StringIndexer on the "species" column. The resulting model maps each
// distinct species string to a numeric index written to a new "label" column.
val indexer = {
  val speciesIndexer = new StringIndexer()
  speciesIndexer.setInputCol("species")
  speciesIndexer.setOutputCol("label")
  speciesIndexer.fit(df)
}

indexer = strIdx_bcf8317544d3


strIdx_bcf8317544d3

In [7]:
// Apply the fitted indexer: df_v is df plus the numeric "label" column.
val df_v = indexer.transform(df)
// Preview the encoded rows (20 is the row count show() uses by default).
df_v.show(numRows = 20)

+------------+-----------+------------+-----------+-------+-----+
|sepal_length|sepal_width|petal_length|petal_width|species|label|
+------------+-----------+------------+-----------+-------+-----+
|         5.1|        3.5|         1.4|        0.2| setosa|  2.0|
|         4.9|        3.0|         1.4|        0.2| setosa|  2.0|
|         4.7|        3.2|         1.3|        0.2| setosa|  2.0|
|         4.6|        3.1|         1.5|        0.2| setosa|  2.0|
|         5.0|        3.6|         1.4|        0.2| setosa|  2.0|
|         5.4|        3.9|         1.7|        0.4| setosa|  2.0|
|         4.6|        3.4|         1.4|        0.3| setosa|  2.0|
|         5.0|        3.4|         1.5|        0.2| setosa|  2.0|
|         4.4|        2.9|         1.4|        0.2| setosa|  2.0|
|         4.9|        3.1|         1.5|        0.1| setosa|  2.0|
|         5.4|        3.7|         1.5|        0.2| setosa|  2.0|
|         4.8|        3.4|         1.6|        0.2| setosa|  2.0|
|         

df_v = [sepal_length: double, sepal_width: double ... 4 more fields]


[sepal_length: double, sepal_width: double ... 4 more fields]

## Labels correspondence

In [8]:
// Show which numeric label was assigned to each species: one row per distinct
// (species, label) pair. distinct is declared with an empty parameter list,
// so it is called with explicit parentheses like show().
df_v.select("species", "label").distinct().show()

+----------+-----+
|   species|label|
+----------+-----+
|versicolor|  0.0|
|    setosa|  2.0|
| virginica|  1.0|
+----------+-----+

