# Feature scaling

The majority of machine learning and optimization algorithms behave much better if features are on the same scale. Using standardization, we center the feature columns at mean 0 with standard deviation 1. Normalization refers to the rescaling of the features to a range of [0, 1].

In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.StandardScaler
//import org.apache.spark.ml.feature.MinMaxScaler

## Load the dataset

In [2]:
// Load the red-wine quality CSV. The file is semicolon-delimited, its first row
// supplies the column names, and column types are inferred from the data.
val df = spark.read.
    format("csv").
    options(Map(
        "header" -> "true",
        "inferschema" -> "true",
        "delimiter" -> ";")).
    load("../Datasets/Winequality_red.csv")

df = [fixed acidity: double, volatile acidity: double ... 10 more fields]


[fixed acidity: double, volatile acidity: double ... 10 more fields]

## Explore the dataset

In [3]:
// Print the first 10 rows of the dataset as a table.
df.show(10)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|
|         11.2|            0.28|       0.56|           1.9|    0.075|               17.0|           

In [4]:
// Print each column's name, inferred type and nullability.
df.printSchema

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [5]:
// Print summary statistics (such as count and mean) for every column.
df.describe().show()

+-------+------------------+-------------------+-------------------+------------------+--------------------+-------------------+--------------------+--------------------+-------------------+------------------+------------------+------------------+
|summary|     fixed acidity|   volatile acidity|        citric acid|    residual sugar|           chlorides|free sulfur dioxide|total sulfur dioxide|             density|                 pH|         sulphates|           alcohol|           quality|
+-------+------------------+-------------------+-------------------+------------------+--------------------+-------------------+--------------------+--------------------+-------------------+------------------+------------------+------------------+
|  count|              1599|               1599|               1599|              1599|                1599|               1599|                1599|                1599|               1599|              1599|              1599|              1599|
|   mean

In [6]:
// Single-table alternative: df.select(df.columns.map(name => mean(name)): _*).show()
// Print the mean of every column, each as its own one-cell table
// (one select/show per column).
df.columns.foreach { columnName =>
    df.select(mean(columnName)).show()
}

+------------------+
|avg(fixed acidity)|
+------------------+
| 8.319637273295838|
+------------------+

+---------------------+
|avg(volatile acidity)|
+---------------------+
|   0.5278205128205131|
+---------------------+

+------------------+
|  avg(citric acid)|
+------------------+
|0.2709756097560964|
+------------------+

+-------------------+
|avg(residual sugar)|
+-------------------+
| 2.5388055034396517|
+-------------------+

+-------------------+
|     avg(chlorides)|
+-------------------+
|0.08746654158849257|
+-------------------+

+------------------------+
|avg(free sulfur dioxide)|
+------------------------+
|      15.874921826141339|
+------------------------+

+-------------------------+
|avg(total sulfur dioxide)|
+-------------------------+
|        46.46779237023139|
+-------------------------+

+------------------+
|      avg(density)|
+------------------+
|0.9967466791744831|
+------------------+

+-----------------+
|          avg(pH)|
+-----------------+
|3

## Vector Assembler

In [7]:
// List all column names in order; the last one, `quality`, is later used as the label.
df.columns

[fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]

In [8]:
// Total number of columns, including the `quality` column.
df.columns.length

12

In [9]:
// Columns at indices 1 to 10 (the end index 11 is exclusive): this leaves out
// `fixed acidity` (index 0) as well as `quality` (index 11).
df.columns.slice(1, 11)

[volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol]

In [10]:
// Assemble the selected numeric columns into a single vector column named "features".
val assembler = {
    // Indices 1..10 of df.columns: excludes `quality` (index 11) and also
    // `fixed acidity` (index 0).
    // NOTE(review): dropping `fixed acidity` may be unintentional -- confirm whether
    // the slice should start at 0.
    val featureCols = df.columns.slice(1, 11)
    new VectorAssembler().
        setInputCols(featureCols).
        setOutputCol("features")
}

assembler = vecAssembler_cba479c8d01b


vecAssembler_cba479c8d01b

In [11]:
// Build the two-column frame used for scaling: `quality` cast to double and
// renamed "label", plus the assembled "features" vector.
val df_v = assembler.
    transform(df).
    select(
        col("quality").cast("Double").alias("label"),
        col("features"))
// Preview the first 20 rows.
df_v.show(20)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  5.0|[0.7,0.0,1.9,0.07...|
|  5.0|[0.88,0.0,2.6,0.0...|
|  5.0|[0.76,0.04,2.3,0....|
|  6.0|[0.28,0.56,1.9,0....|
|  5.0|[0.7,0.0,1.9,0.07...|
|  5.0|[0.66,0.0,1.8,0.0...|
|  5.0|[0.6,0.06,1.6,0.0...|
|  7.0|[0.65,0.0,1.2,0.0...|
|  7.0|[0.58,0.02,2.0,0....|
|  5.0|[0.5,0.36,6.1,0.0...|
|  5.0|[0.58,0.08,1.8,0....|
|  5.0|[0.5,0.36,6.1,0.0...|
|  5.0|[0.615,0.0,1.6,0....|
|  5.0|[0.61,0.29,1.6,0....|
|  5.0|[0.62,0.18,3.8,0....|
|  5.0|[0.62,0.19,3.9,0....|
|  7.0|[0.28,0.56,1.8,0....|
|  5.0|[0.56,0.28,1.7,0....|
|  4.0|[0.59,0.08,4.4,0....|
|  6.0|[0.32,0.51,1.8,0....|
+-----+--------------------+
only showing top 20 rows



df_v = [label: double, features: vector]


[label: double, features: vector]

## StandardScaler

In [12]:
// Configure a StandardScaler that reads "features" and writes "scaledFeatures".
// setWithMean(true) is intentionally left unset here, so the scaler runs with its
// default centering behaviour (per the Spark docs: no mean subtraction, unit
// standard deviation only). Uncomment the call below to also centre at mean 0.
val s_scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures")
    //setWithMean(true)

s_scaler = stdScal_440f23282df2


stdScal_440f23282df2

In [13]:
// Fit the scaler on df_v, then apply the fitted model to the same frame,
// which appends the "scaledFeatures" column.
val df_s = {
    val scalerModel = s_scaler.fit(df_v)
    scalerModel.transform(df_v)
}
// Preview the first 20 rows.
df_s.show(20)

+-----+--------------------+--------------------+
|label|            features|      scaledFeatures|
+-----+--------------------+--------------------+
|  5.0|[0.7,0.0,1.9,0.07...|[3.90931060290249...|
|  5.0|[0.88,0.0,2.6,0.0...|[4.91456190079171...|
|  5.0|[0.76,0.04,2.3,0....|[4.24439436886557...|
|  6.0|[0.28,0.56,1.9,0....|[1.56372424116099...|
|  5.0|[0.7,0.0,1.9,0.07...|[3.90931060290249...|
|  5.0|[0.66,0.0,1.8,0.0...|[3.68592142559378...|
|  5.0|[0.6,0.06,1.6,0.0...|[3.35083765963071...|
|  7.0|[0.65,0.0,1.2,0.0...|[3.63007413126660...|
|  7.0|[0.58,0.02,2.0,0....|[3.23914307097635...|
|  5.0|[0.5,0.36,6.1,0.0...|[2.79236471635892...|
|  5.0|[0.58,0.08,1.8,0....|[3.23914307097635...|
|  5.0|[0.5,0.36,6.1,0.0...|[2.79236471635892...|
|  5.0|[0.615,0.0,1.6,0....|[3.43460860112148...|
|  5.0|[0.61,0.29,1.6,0....|[3.40668495395789...|
|  5.0|[0.62,0.18,3.8,0....|[3.46253224828507...|
|  5.0|[0.62,0.19,3.9,0....|[3.46253224828507...|
|  7.0|[0.28,0.56,1.8,0....|[1.56372424116099...|


df_s = [label: double, features: vector ... 1 more field]


[label: double, features: vector ... 1 more field]