# Feature scaling
The majority of machine learning and optimization algorithms behave much better if features are on the same scale.
Standardization centers each feature column at mean 0 and rescales it to standard deviation 1.
Normalization (min-max scaling) rescales each feature to the range [0, 1].

In [1]:
import pandas as pd
from sklearn.preprocessing import scale, StandardScaler, MinMaxScaler 

## Load the dataset

In [2]:
# Load the wine-quality CSV into a DataFrame; the file is parsed with ';'
# as the field separator instead of the default comma.
df = pd.read_csv('../Datasets/Winequality_red.csv', sep=';')

## Explore the dataset

In [3]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [5]:
# Per-column means of the raw data, kept as a baseline for the scaled results.
df.mean()

fixed acidity            8.319637
volatile acidity         0.527821
citric acid              0.270976
residual sugar           2.538806
chlorides                0.087467
free sulfur dioxide     15.874922
total sulfur dioxide    46.467792
density                  0.996747
pH                       3.311113
sulphates                0.658149
alcohol                 10.422983
quality                  5.636023
dtype: float64

In [6]:
# Per-column standard deviations of the raw data.
# pandas uses the sample estimator (ddof=1) by default.
df.std()

fixed acidity            1.741096
volatile acidity         0.179060
citric acid              0.194801
residual sugar           1.409928
chlorides                0.047065
free sulfur dioxide     10.460157
total sulfur dioxide    32.895324
density                  0.001887
pH                       0.154386
sulphates                0.169507
alcohol                  1.065668
quality                  0.807569
dtype: float64

In [7]:
# Per-column minima of the raw data.
df.min()

fixed acidity           4.60000
volatile acidity        0.12000
citric acid             0.00000
residual sugar          0.90000
chlorides               0.01200
free sulfur dioxide     1.00000
total sulfur dioxide    6.00000
density                 0.99007
pH                      2.74000
sulphates               0.33000
alcohol                 8.40000
quality                 3.00000
dtype: float64

In [8]:
# Per-column maxima of the raw data.
df.max()

fixed acidity            15.90000
volatile acidity          1.58000
citric acid               1.00000
residual sugar           15.50000
chlorides                 0.61100
free sulfur dioxide      72.00000
total sulfur dioxide    289.00000
density                   1.00369
pH                        4.01000
sulphates                 2.00000
alcohol                  14.90000
quality                   8.00000
dtype: float64

# scale
The `scale` function centers each column at zero mean and rescales it to unit variance.

In [9]:
# Standardize every column of df in a single call; the result is a NumPy array.
# NOTE(review): all 12 columns are scaled, including `quality`, which is
# presumably the prediction target -- confirm whether it should be excluded.
x = scale(df)

In [10]:
# Sanity check: per-column means should be approximately 0 after scaling.
# NOTE(review): astype(int) truncates toward zero, so any mean strictly
# between -1 and 1 prints as 0; this is only a coarse check.
x.mean(axis=0).astype(int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [11]:
# Sanity check: per-column standard deviations should be 1 after scaling.
x.std(axis=0)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

# StandardScaler
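Standardize features by removing the mean and scaling to unit variance. Unlike the `scale` function, the fitted `StandardScaler` object keeps the per-column statistics, so the same transformation can be reapplied or inverted later.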

In [12]:
# Create the scaler, then learn each column's mean and standard deviation
# from df and apply the standardization in one step.
scaler = StandardScaler()
x1 = scaler.fit_transform(df)

In [13]:
# Sanity check: per-column means should be approximately 0 after scaling.
# NOTE(review): astype(int) truncates toward zero, so any mean strictly
# between -1 and 1 prints as 0; this is only a coarse check.
x1.mean(axis=0).astype(int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [14]:
# Sanity check: per-column standard deviations should be 1 after scaling.
x1.std(axis=0)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [15]:
# Undo the standardization using the statistics stored in the fitted scaler,
# recovering the data in its original units.
x1_inverse = scaler.inverse_transform(x1)

In [16]:
# Per-column means of the recovered data; these should match df.mean().
x1_inverse.mean(axis=0)

array([  8.31963727,   0.52782051,   0.27097561,   2.5388055 ,
         0.08746654,  15.87492183,  46.46779237,   0.99674668,
         3.3111132 ,   0.65814884,  10.42298311,   5.63602251])

In [17]:
# Per-column standard deviations of the recovered data.
# NumPy's std defaults to ddof=0 (population estimator) while pandas'
# df.std() defaults to ddof=1 (sample estimator), so these values are
# slightly smaller than the df.std() output earlier in the notebook.
x1_inverse.std(axis=0)

array([  1.74055180e+00,   1.79003704e-01,   1.94740214e-01,
         1.40948711e+00,   4.70505826e-02,   1.04568856e+01,
         3.28850367e+01,   1.88674370e-03,   1.54338181e-01,
         1.69453967e-01,   1.06533430e+00,   8.07316877e-01])

# MinMaxScaler
This estimator scales and translates each feature individually
so that it lies in the range between zero and one.

In [18]:
# Create a min-max scaler with default settings, then learn each column's
# minimum and maximum from df and rescale the data in one step.
mmscaler = MinMaxScaler()
x2 = mmscaler.fit_transform(df)

In [19]:
# Per-column means after min-max scaling; unlike standardization, the
# columns are not centered at 0.
x2.mean(axis=0)

array([ 0.32917144,  0.27932912,  0.27097561,  0.11224695,  0.12598755,
        0.20950594,  0.14299573,  0.49021139,  0.44969543,  0.19649631,
        0.31122817,  0.5272045 ])

In [20]:
# Per-column standard deviations after min-max scaling; these are not
# forced to 1.
x2.std(axis=0)

array([ 0.15403113,  0.12260528,  0.19474021,  0.09654021,  0.07854855,
        0.14728008,  0.11620154,  0.13852744,  0.12152613,  0.10146944,
        0.16389758,  0.16146338])

In [21]:
# Sanity check: every column's minimum should be 0 after min-max scaling.
x2.min(axis=0)

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [22]:
# Sanity check: every column's maximum should be 1 after min-max scaling.
x2.max(axis=0)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [23]:
# Undo the min-max scaling using the ranges stored in the fitted scaler,
# recovering the data in its original units.
x2_inverse = mmscaler.inverse_transform(x2)

In [24]:
# Per-column minima of the recovered data; these should match df.min().
x2_inverse.min(axis=0)

array([ 4.6    ,  0.12   ,  0.     ,  0.9    ,  0.012  ,  1.     ,
        6.     ,  0.99007,  2.74   ,  0.33   ,  8.4    ,  3.     ])

In [25]:
# Per-column maxima of the recovered data; these should match df.max().
x2_inverse.max(axis=0)

array([  15.9    ,    1.58   ,    1.     ,   15.5    ,    0.611  ,
         72.     ,  289.     ,    1.00369,    4.01   ,    2.     ,
         14.9    ,    8.     ])