In [35]:
import numpy as np
import pandas as pd
import scipy.stats as ss
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler,MaxAbsScaler,Normalizer

In [3]:
data_set = load_boston()
data = pd.DataFrame(data_set['data'],columns=data_set['feature_names'])
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


## Standardizing the features
Standardization is the process of centering the variable at zero and standardizing the variance to 1.

In [4]:
scaler = StandardScaler()
scaler.fit(data)

StandardScaler()

In [5]:
scaler.scale_

array([8.59304135e+00, 2.32993957e+01, 6.85357058e+00, 2.53742935e-01,
       1.15763115e-01, 7.01922514e-01, 2.81210326e+01, 2.10362836e+00,
       8.69865112e+00, 1.68370495e+02, 2.16280519e+00, 9.12046075e+01,
       7.13400164e+00])

In [6]:
scaler.transform(data)

array([[-0.41978194,  0.28482986, -1.2879095 , ..., -1.45900038,
         0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, ..., -0.30309415,
         0.44105193, -0.49243937],
       [-0.41734159, -0.48772236, -0.59338101, ..., -0.30309415,
         0.39642699, -1.2087274 ],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.98304761],
       [-0.40776407, -0.48772236,  0.11573841, ...,  1.17646583,
         0.4032249 , -0.86530163],
       [-0.41500016, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.66905833]])

## Performing mean normalization
In mean normalization, we center the variable at zero and rescale the distribution to the value range. This procedure
involves subtracting the mean from each observation and then dividing the result by the difference between the minimum
and maximum values.

In [7]:
def mean_normalizer(dataframe):
    norm_data = pd.DataFrame()
    for col in dataframe:
        col_mean = np.mean(dataframe[col])
        col_min = np.min(dataframe[col])
        col_max = np.max(dataframe[col])
        norm_data[col] = (dataframe[col] - col_mean)/(col_max - col_min)

    return norm_data

In [8]:
mean_normalizer(data)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.040544,0.066364,-0.323562,-0.06917,-0.034352,0.055636,-0.034757,0.026822,-0.371713,-0.214193,-0.335695,0.101432,-0.211729
1,-0.040308,-0.113636,-0.149075,-0.06917,-0.176327,0.026129,0.106335,0.106581,-0.328235,-0.317246,-0.069738,0.101432,-0.096939
2,-0.040308,-0.113636,-0.149075,-0.06917,-0.176327,0.172517,-0.076981,0.106581,-0.328235,-0.317246,-0.069738,0.091169,-0.237943
3,-0.040251,-0.113636,-0.328328,-0.06917,-0.198961,0.136686,-0.234551,0.206163,-0.284757,-0.355414,0.026007,0.095708,-0.268021
4,-0.039839,-0.113636,-0.328328,-0.06917,-0.198961,0.165236,-0.148042,0.206163,-0.284757,-0.355414,0.026007,0.101432,-0.202071
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,-0.039911,-0.113636,0.029077,-0.06917,0.037664,0.059085,0.005408,-0.119710,-0.371713,-0.258086,0.270688,0.089051,-0.082314
502,-0.040106,-0.113636,0.029077,-0.06917,0.037664,-0.031545,0.083678,-0.137088,-0.371713,-0.258086,0.270688,0.101432,-0.098594
503,-0.039932,-0.113636,0.029077,-0.06917,0.037664,0.132471,0.230948,-0.148000,-0.371713,-0.258086,0.270688,0.101432,-0.193517
504,-0.039383,-0.113636,0.029077,-0.06917,0.037664,0.097598,0.213441,-0.127867,-0.371713,-0.258086,0.270688,0.092733,-0.170338


In [9]:
mean_scaler = StandardScaler(with_mean=True,with_std=False)
mean_scaler.fit(data)
mean_scaler.mean_

array([3.61352356e+00, 1.13636364e+01, 1.11367787e+01, 6.91699605e-02,
       5.54695059e-01, 6.28463439e+00, 6.85749012e+01, 3.79504269e+00,
       9.54940711e+00, 4.08237154e+02, 1.84555336e+01, 3.56674032e+02,
       1.26530632e+01])

In [10]:
mean_scaler.transform(data)

array([[ -3.60720356,   6.63636364,  -8.82677866, ...,  -3.1555336 ,
         40.22596838,  -7.67306324],
       [ -3.58621356, -11.36363636,  -4.06677866, ...,  -0.6555336 ,
         40.22596838,  -3.51306324],
       [ -3.58623356, -11.36363636,  -4.06677866, ...,  -0.6555336 ,
         36.15596838,  -8.62306324],
       ...,
       [ -3.55276356, -11.36363636,   0.79322134, ...,   2.5444664 ,
         40.22596838,  -7.01306324],
       [ -3.50393356, -11.36363636,   0.79322134, ...,   2.5444664 ,
         36.77596838,  -6.17306324],
       [ -3.56611356, -11.36363636,   0.79322134, ...,   2.5444664 ,
         40.22596838,  -4.77306324]])

In [11]:
range_scaler = RobustScaler(with_centering=False,with_scaling=True,quantile_range=(0,100))
range_scaler.fit(data)
range_scaler.scale_

array([8.896988e+01, 1.000000e+02, 2.728000e+01, 1.000000e+00,
       4.860000e-01, 5.219000e+00, 9.710000e+01, 1.099690e+01,
       2.300000e+01, 5.240000e+02, 9.400000e+00, 3.965800e+02,
       3.624000e+01])

In [12]:
range_scaler.transform(data)

array([[7.10352762e-05, 1.80000000e-01, 8.46774194e-02, ...,
        1.62765957e+00, 1.00080690e+00, 1.37417219e-01],
       [3.06957815e-04, 0.00000000e+00, 2.59164223e-01, ...,
        1.89361702e+00, 1.00080690e+00, 2.52207506e-01],
       [3.06733020e-04, 0.00000000e+00, 2.59164223e-01, ...,
        1.89361702e+00, 9.90544153e-01, 1.11203091e-01],
       ...,
       [6.82927750e-04, 0.00000000e+00, 4.37316716e-01, ...,
        2.23404255e+00, 1.00080690e+00, 1.55629139e-01],
       [1.23176518e-03, 0.00000000e+00, 4.37316716e-01, ...,
        2.23404255e+00, 9.92107519e-01, 1.78807947e-01],
       [5.32876969e-04, 0.00000000e+00, 4.37316716e-01, ...,
        2.23404255e+00, 1.00080690e+00, 2.17439294e-01]])

In [13]:
range_scaler.transform(mean_scaler.transform(data))

array([[-0.0405441 ,  0.06636364, -0.32356227, ..., -0.33569506,
         0.10143217, -0.21172912],
       [-0.04030818, -0.11363636, -0.14907546, ..., -0.06973762,
         0.10143217, -0.09693883],
       [-0.0403084 , -0.11363636, -0.14907546, ..., -0.06973762,
         0.09116942, -0.23794325],
       ...,
       [-0.03993221, -0.11363636,  0.02907703, ...,  0.27068792,
         0.10143217, -0.1935172 ],
       [-0.03938337, -0.11363636,  0.02907703, ...,  0.27068792,
         0.09273279, -0.17033839],
       [-0.04008226, -0.11363636,  0.02907703, ...,  0.27068792,
         0.10143217, -0.13170704]])

## Scaling to the maximum and minimum values
Scaling to the minimum and maximum values squeezes the values of the variables between 0 and 1. To implement this
scaling technique, we need to subtract the minimum value from all the observations and divide the result by the
value range.

In [15]:
def minmax_normalizer(dataframe):
    norm_data = pd.DataFrame()
    for col in dataframe:
        col_min = np.min(dataframe[col])
        col_max = np.max(dataframe[col])
        norm_data[col] = (dataframe[col] - col_min)/(col_max - col_min)

    return norm_data

In [16]:
minmax_normalizer(data)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.000000,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.000000,0.208015,0.287234,1.000000,0.089680
1,0.000236,0.00,0.242302,0.0,0.172840,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,1.000000,0.204470
2,0.000236,0.00,0.242302,0.0,0.172840,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.989737,0.063466
3,0.000293,0.00,0.063050,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.994276,0.033389
4,0.000705,0.00,0.063050,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,1.000000,0.099338
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.000633,0.00,0.420455,0.0,0.386831,0.580954,0.681771,0.122671,0.000000,0.164122,0.893617,0.987619,0.219095
502,0.000438,0.00,0.420455,0.0,0.386831,0.490324,0.760041,0.105293,0.000000,0.164122,0.893617,1.000000,0.202815
503,0.000612,0.00,0.420455,0.0,0.386831,0.654340,0.907312,0.094381,0.000000,0.164122,0.893617,1.000000,0.107892
504,0.001161,0.00,0.420455,0.0,0.386831,0.619467,0.889804,0.114514,0.000000,0.164122,0.893617,0.991301,0.131071


In [18]:
scaler = MinMaxScaler()
scaler.fit(data)
scaler.scale_

array([1.12397589e-02, 1.00000000e-02, 3.66568915e-02, 1.00000000e+00,
       2.05761317e+00, 1.91607588e-01, 1.02986612e-02, 9.09347180e-02,
       4.34782609e-02, 1.90839695e-03, 1.06382979e-01, 2.52155933e-03,
       2.75938190e-02])

In [19]:
scaler.transform(data)

array([[0.00000000e+00, 1.80000000e-01, 6.78152493e-02, ...,
        2.87234043e-01, 1.00000000e+00, 8.96799117e-02],
       [2.35922539e-04, 0.00000000e+00, 2.42302053e-01, ...,
        5.53191489e-01, 1.00000000e+00, 2.04470199e-01],
       [2.35697744e-04, 0.00000000e+00, 2.42302053e-01, ...,
        5.53191489e-01, 9.89737254e-01, 6.34657837e-02],
       ...,
       [6.11892474e-04, 0.00000000e+00, 4.20454545e-01, ...,
        8.93617021e-01, 1.00000000e+00, 1.07891832e-01],
       [1.16072990e-03, 0.00000000e+00, 4.20454545e-01, ...,
        8.93617021e-01, 9.91300620e-01, 1.31070640e-01],
       [4.61841693e-04, 0.00000000e+00, 4.20454545e-01, ...,
        8.93617021e-01, 1.00000000e+00, 1.69701987e-01]])

## Implementing maximum absolute scaling
Maximum absolute scaling scales the data to its maximum value; that is, it divides every observation by the maximum
value of the variable

In [22]:
def abs_transformer(dataframe):
    norm_data = pd.DataFrame()
    for col in dataframe:
        col_max = np.max(dataframe[col])
        norm_data[col] = dataframe[col]/col_max

    return norm_data

In [23]:
abs_transformer(data)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.000071,0.18,0.083273,0.0,0.617681,0.748861,0.652,0.337278,0.041667,0.416315,0.695455,1.000000,0.131156
1,0.000307,0.00,0.254867,0.0,0.538462,0.731321,0.789,0.409607,0.083333,0.340366,0.809091,1.000000,0.240716
2,0.000307,0.00,0.254867,0.0,0.538462,0.818337,0.611,0.409607,0.083333,0.340366,0.809091,0.989746,0.106136
3,0.000364,0.00,0.078587,0.0,0.525832,0.797039,0.458,0.499913,0.125000,0.312236,0.850000,0.994281,0.077430
4,0.000776,0.00,0.078587,0.0,0.525832,0.814009,0.542,0.499913,0.125000,0.312236,0.850000,1.000000,0.140374
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.000704,0.00,0.430065,0.0,0.657865,0.750911,0.691,0.204395,0.041667,0.383966,0.954545,0.987629,0.254675
502,0.000509,0.00,0.430065,0.0,0.657865,0.697039,0.767,0.188636,0.041667,0.383966,0.954545,1.000000,0.239136
503,0.000683,0.00,0.430065,0.0,0.657865,0.794533,0.910,0.178741,0.041667,0.383966,0.954545,1.000000,0.148538
504,0.001232,0.00,0.430065,0.0,0.657865,0.773804,0.893,0.196998,0.041667,0.383966,0.954545,0.991308,0.170661


In [25]:
mean_scaler = StandardScaler(with_mean=True,with_std=False)
abs_scaler = MaxAbsScaler()

mean_scaler.fit(data)
abs_scaler.fit(data)
abs_scaler.transform(mean_scaler.transform(data))

array([[-0.04054122,  0.06636364, -0.31819678, ..., -0.14343335,
         0.10135039, -0.20208226],
       [-0.04030531, -0.11363636, -0.14660341, ..., -0.02979698,
         0.10135039, -0.09252208],
       [-0.04030554, -0.11363636, -0.14660341, ..., -0.02979698,
         0.09109591, -0.22710201],
       ...,
       [-0.03992937, -0.11363636,  0.02859486, ...,  0.11565756,
         0.10135039, -0.18470011],
       [-0.03938057, -0.11363636,  0.02859486, ...,  0.11565756,
         0.09265802, -0.16257738],
       [-0.04007941, -0.11363636,  0.02859486, ...,  0.11565756,
         0.10135039, -0.12570617]])

## Scaling with the median and quantiles
When scaling variables to the median and quantiles, the median value is removed from the observations and the result is
divided by the inter-quartile range (IQR). The IQR is the range between the 1st quartile and the 3rd quartile, or, in
other words, the range between the 25th quantile and the 75th quantile

In [27]:
def med_q_transformer(dataframe):
    norm_data = pd.DataFrame()
    for col in dataframe:
        col_median = np.median(dataframe[col])
        col_iqr = ss.iqr(dataframe[col])
        norm_data[col] = (dataframe[col] - col_median)/col_iqr

    return norm_data

In [28]:
med_q_transformer(data)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.069593,1.44,-0.571650,,0.000000,0.496612,-0.250765,0.285777,-0.20,-0.087855,-1.339286,0.261902,-0.637681
1,-0.063755,0.00,-0.202943,,-0.394286,0.287940,0.028542,0.569789,-0.15,-0.227390,-0.446429,0.261902,-0.221889
2,-0.063760,0.00,-0.202943,,-0.394286,1.323171,-0.334353,0.569789,-0.15,-0.227390,-0.446429,0.066675,-0.732634
3,-0.062347,0.00,-0.581720,,-0.457143,1.069783,-0.646279,0.924391,-0.10,-0.279070,-0.125000,0.153016,-0.841579
4,-0.052144,0.00,-0.581720,,-0.457143,1.271680,-0.475025,0.924391,-0.10,-0.279070,-0.125000,0.261902,-0.602699
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,-0.053930,0.00,0.173509,,0.200000,0.521003,-0.171254,-0.236007,-0.20,-0.147287,0.696429,0.026382,-0.168916
502,-0.058759,0.00,0.173509,,0.200000,-0.119919,-0.016310,-0.297887,-0.20,-0.147287,0.696429,0.261902,-0.227886
503,-0.054450,0.00,0.173509,,0.200000,1.039973,0.275229,-0.336744,-0.20,-0.147287,0.696429,0.261902,-0.571714
504,-0.040867,0.00,0.173509,,0.200000,0.793360,0.240571,-0.265053,-0.20,-0.147287,0.696429,0.096414,-0.487756


In [29]:
scaler = RobustScaler()
scaler.fit(data)
scaler.scale_

array([3.5950375e+00, 1.2500000e+01, 1.2910000e+01, 1.0000000e+00,
       1.7500000e-01, 7.3800000e-01, 4.9050000e+01, 3.0882500e+00,
       2.0000000e+01, 3.8700000e+02, 2.8000000e+00, 2.0847500e+01,
       1.0005000e+01])

In [30]:
scaler.transform(data)

array([[-0.06959315,  1.44      , -0.57164988, ..., -1.33928571,
         0.26190191, -0.63768116],
       [-0.06375455,  0.        , -0.20294345, ..., -0.44642857,
         0.26190191, -0.22188906],
       [-0.06376011,  0.        , -0.20294345, ..., -0.44642857,
         0.06667466, -0.73263368],
       ...,
       [-0.05445006,  0.        ,  0.17350891, ...,  0.69642857,
         0.26190191, -0.57171414],
       [-0.04086745,  0.        ,  0.17350891, ...,  0.69642857,
         0.09641444, -0.48775612],
       [-0.05816351,  0.        ,  0.17350891, ...,  0.69642857,
         0.26190191, -0.34782609]])

## Scaling to vector unit length
When scaling to vector unit length, we transform the components of a feature vector so that the transformed vector has a
length of 1, or in other words, a norm of 1. Note that this scaling technique scales the feature vector, as opposed to
each individual variable. Scaling to the unit norm is achieved by dividing each observation vector by either the
Manhattan distance (l1 norm) or the Euclidean distance (l2 norm) of the vector. The Manhattan distance is given by the
sum of the absolute components of the vector.

In [32]:
def unit_len_transformer(dataframe,method='l1'):
    norm_data = pd.DataFrame()
    for col in dataframe:
        if method is 'l1':
            divider = np.sum(np.abs(dataframe[col]))
        elif method is 'l2':
            divider = np.sum(np.square(dataframe[col]))
        norm_data[col] = dataframe[col]/divider

    return norm_data

In [33]:
unit_len_transformer(data)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.000003,0.00313,0.000410,0.0,0.001917,0.002068,0.001879,0.002130,0.000207,0.001433,0.001638,0.002199,0.000778
1,0.000015,0.00000,0.001255,0.0,0.001671,0.002019,0.002274,0.002587,0.000414,0.001172,0.001906,0.002199,0.001428
2,0.000015,0.00000,0.001255,0.0,0.001671,0.002259,0.001761,0.002587,0.000414,0.001172,0.001906,0.002177,0.000629
3,0.000018,0.00000,0.000387,0.0,0.001632,0.002201,0.001320,0.003157,0.000621,0.001075,0.002002,0.002187,0.000459
4,0.000038,0.00000,0.000387,0.0,0.001632,0.002247,0.001562,0.003157,0.000621,0.001075,0.002002,0.002199,0.000832
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.000034,0.00000,0.002117,0.0,0.002042,0.002073,0.001991,0.001291,0.000207,0.001322,0.002249,0.002172,0.001510
502,0.000025,0.00000,0.002117,0.0,0.002042,0.001925,0.002210,0.001191,0.000207,0.001322,0.002249,0.002199,0.001418
503,0.000033,0.00000,0.002117,0.0,0.002042,0.002194,0.002623,0.001129,0.000207,0.001322,0.002249,0.002199,0.000881
504,0.000060,0.00000,0.002117,0.0,0.002042,0.002136,0.002574,0.001244,0.000207,0.001322,0.002249,0.002180,0.001012


In [34]:
unit_len_transformer(data,'l2')

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,1.437332e-07,0.000053,0.000027,0.0,0.003311,0.000325,0.000023,0.000429,0.000012,0.000003,0.000088,0.000006,0.000047
1,6.211004e-07,0.000000,0.000082,0.0,0.002887,0.000317,0.000028,0.000521,0.000024,0.000002,0.000102,0.000006,0.000086
2,6.206456e-07,0.000000,0.000082,0.0,0.002887,0.000355,0.000022,0.000521,0.000024,0.000002,0.000102,0.000006,0.000038
3,7.361780e-07,0.000000,0.000025,0.0,0.002819,0.000346,0.000016,0.000636,0.000036,0.000002,0.000107,0.000006,0.000028
4,1.570377e-06,0.000000,0.000025,0.0,0.002819,0.000353,0.000019,0.000636,0.000036,0.000002,0.000107,0.000006,0.000050
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,1.424369e-06,0.000000,0.000138,0.0,0.003527,0.000326,0.000025,0.000260,0.000012,0.000003,0.000120,0.000006,0.000091
502,1.029558e-06,0.000000,0.000138,0.0,0.003527,0.000302,0.000028,0.000240,0.000012,0.000003,0.000120,0.000006,0.000085
503,1.381840e-06,0.000000,0.000138,0.0,0.003527,0.000345,0.000033,0.000228,0.000012,0.000003,0.000120,0.000006,0.000053
504,2.492362e-06,0.000000,0.000138,0.0,0.003527,0.000336,0.000032,0.000251,0.000012,0.000003,0.000120,0.000006,0.000061


In [37]:
scaler = Normalizer(norm='l1')
scaler.fit(data)
scaler.transform(data)

array([[7.79381588e-06, 2.21975769e-02, 2.84868903e-03, ...,
        1.88679404e-02, 4.89456570e-01, 6.14132960e-03],
       [3.56669706e-05, 0.00000000e+00, 9.23344863e-03, ...,
        2.32468721e-02, 5.18353007e-01, 1.19368770e-02],
       [3.69043915e-05, 0.00000000e+00, 9.56079325e-03, ...,
        2.40710212e-02, 5.31225801e-01, 5.44978738e-03],
       ...,
       [7.49894545e-05, 0.00000000e+00, 1.47239005e-02, ...,
        2.59180142e-02, 4.89850469e-01, 6.96083810e-03],
       [1.35963442e-04, 0.00000000e+00, 1.48010207e-02, ...,
        2.60537666e-02, 4.88135927e-01, 8.03944798e-03],
       [5.91393858e-05, 0.00000000e+00, 1.48815202e-02, ...,
        2.61954673e-02, 4.95094331e-01, 9.82953724e-03]])