In [11]:
import numpy as np
import pandas as pd
import scipy.stats as ss
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler,RobustScaler

In [3]:
# Load the Boston housing dataset and wrap its feature matrix in a DataFrame
# whose columns are labelled with the dataset's feature names.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the pinned scikit-learn version before re-running this cell.
data_set = load_boston()
data = pd.DataFrame(data_set['data'],columns=data_set['feature_names'])
# Display the frame (features only; the target is not part of `data`).
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


## Standardizing the features
Standardization centers each variable at zero and rescales it to unit variance: $z = (x - \mu) / \sigma$, where $\mu$ and $\sigma$ are the variable's mean and standard deviation.

In [4]:
# Fit a StandardScaler on the whole frame. fit() only learns the per-column
# statistics; the data itself is not transformed yet.
scaler = StandardScaler()
scaler.fit(data)

StandardScaler()

In [5]:
# Per-feature scaling factor learned by fit(), one entry per column
# (for StandardScaler this is the column's standard deviation).
scaler.scale_

array([8.59304135e+00, 2.32993957e+01, 6.85357058e+00, 2.53742935e-01,
       1.15763115e-01, 7.01922514e-01, 2.81210326e+01, 2.10362836e+00,
       8.69865112e+00, 1.68370495e+02, 2.16280519e+00, 9.12046075e+01,
       7.13400164e+00])

In [8]:
# Apply the learned centering and scaling. The result is a NumPy array, so the
# DataFrame's column labels are not carried over.
scaler.transform(data)

array([[-0.41978194,  0.28482986, -1.2879095 , ..., -1.45900038,
         0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, ..., -0.30309415,
         0.44105193, -0.49243937],
       [-0.41734159, -0.48772236, -0.59338101, ..., -0.30309415,
         0.39642699, -1.2087274 ],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.98304761],
       [-0.40776407, -0.48772236,  0.11573841, ...,  1.17646583,
         0.4032249 , -0.86530163],
       [-0.41500016, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.66905833]])

## Performing mean normalization
In mean normalization, we center the variable at zero and rescale it by its value range, so the result lies between -1 and 1. This procedure
involves subtracting the mean from each observation and then dividing the result by the difference between the maximum
and minimum values: $x' = (x - \bar{x}) / (x_{\max} - x_{\min})$.

In [9]:
def mean_normalizer(dataframe):
    """Mean-normalize every column of a numeric DataFrame.

    Each column is centered on its mean and then divided by its value
    range (max - min).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with numeric columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``dataframe``.
        A constant column (range of zero) is returned as all zeros
        rather than NaN.
    """
    col_range = dataframe.max() - dataframe.min()
    # A constant column has a zero range and would yield 0/0 = NaN. Dividing
    # by 1 instead maps it to 0, which is how scikit-learn's scalers treat
    # zero-scale features.
    col_range = col_range.mask(col_range == 0, 1)

    # Series-on-DataFrame arithmetic aligns on column labels, so every column
    # is centered and scaled by its own statistics in one vectorised step.
    return (dataframe - dataframe.mean()) / col_range

In [10]:
# Mean-normalize every feature with the hand-written helper.
mean_normalizer(data)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.040544,0.066364,-0.323562,-0.06917,-0.034352,0.055636,-0.034757,0.026822,-0.371713,-0.214193,-0.335695,0.101432,-0.211729
1,-0.040308,-0.113636,-0.149075,-0.06917,-0.176327,0.026129,0.106335,0.106581,-0.328235,-0.317246,-0.069738,0.101432,-0.096939
2,-0.040308,-0.113636,-0.149075,-0.06917,-0.176327,0.172517,-0.076981,0.106581,-0.328235,-0.317246,-0.069738,0.091169,-0.237943
3,-0.040251,-0.113636,-0.328328,-0.06917,-0.198961,0.136686,-0.234551,0.206163,-0.284757,-0.355414,0.026007,0.095708,-0.268021
4,-0.039839,-0.113636,-0.328328,-0.06917,-0.198961,0.165236,-0.148042,0.206163,-0.284757,-0.355414,0.026007,0.101432,-0.202071
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,-0.039911,-0.113636,0.029077,-0.06917,0.037664,0.059085,0.005408,-0.119710,-0.371713,-0.258086,0.270688,0.089051,-0.082314
502,-0.040106,-0.113636,0.029077,-0.06917,0.037664,-0.031545,0.083678,-0.137088,-0.371713,-0.258086,0.270688,0.101432,-0.098594
503,-0.039932,-0.113636,0.029077,-0.06917,0.037664,0.132471,0.230948,-0.148000,-0.371713,-0.258086,0.270688,0.101432,-0.193517
504,-0.039383,-0.113636,0.029077,-0.06917,0.037664,0.097598,0.213441,-0.127867,-0.371713,-0.258086,0.270688,0.092733,-0.170338


In [17]:
# A StandardScaler that only centers: with_std=False switches off the division
# by the standard deviation, so transform() just subtracts the column means.
mean_scaler = StandardScaler(with_mean=True,with_std=False)
mean_scaler.fit(data)
# Column means learned by fit().
mean_scaler.mean_

array([3.61352356e+00, 1.13636364e+01, 1.11367787e+01, 6.91699605e-02,
       5.54695059e-01, 6.28463439e+00, 6.85749012e+01, 3.79504269e+00,
       9.54940711e+00, 4.08237154e+02, 1.84555336e+01, 3.56674032e+02,
       1.26530632e+01])

In [18]:
# Centered features: each column has its mean subtracted, its scale is unchanged.
mean_scaler.transform(data)

array([[ -3.60720356,   6.63636364,  -8.82677866, ...,  -3.1555336 ,
         40.22596838,  -7.67306324],
       [ -3.58621356, -11.36363636,  -4.06677866, ...,  -0.6555336 ,
         40.22596838,  -3.51306324],
       [ -3.58623356, -11.36363636,  -4.06677866, ...,  -0.6555336 ,
         36.15596838,  -8.62306324],
       ...,
       [ -3.55276356, -11.36363636,   0.79322134, ...,   2.5444664 ,
         40.22596838,  -7.01306324],
       [ -3.50393356, -11.36363636,   0.79322134, ...,   2.5444664 ,
         36.77596838,  -6.17306324],
       [ -3.56611356, -11.36363636,   0.79322134, ...,   2.5444664 ,
         40.22596838,  -4.77306324]])

In [19]:
# A RobustScaler that only divides (no centering). Setting quantile_range to
# (0,100) replaces the usual interquartile range with the span between the 0th
# and 100th percentiles, i.e. max - min, so scale_ holds each column's range.
range_scaler = RobustScaler(with_centering=False,with_scaling=True,quantile_range=(0,100))
range_scaler.fit(data)
# Per-column value range learned by fit().
range_scaler.scale_

array([8.896988e+01, 1.000000e+02, 2.728000e+01, 1.000000e+00,
       4.860000e-01, 5.219000e+00, 9.710000e+01, 1.099690e+01,
       2.300000e+01, 5.240000e+02, 9.400000e+00, 3.965800e+02,
       3.624000e+01])

In [20]:
# Dividing the raw, uncentered data by the range alone does not bound it to
# [0, 1]: a column whose minimum is far from zero can exceed 1.
range_scaler.transform(data)

array([[7.10352762e-05, 1.80000000e-01, 8.46774194e-02, ...,
        1.62765957e+00, 1.00080690e+00, 1.37417219e-01],
       [3.06957815e-04, 0.00000000e+00, 2.59164223e-01, ...,
        1.89361702e+00, 1.00080690e+00, 2.52207506e-01],
       [3.06733020e-04, 0.00000000e+00, 2.59164223e-01, ...,
        1.89361702e+00, 9.90544153e-01, 1.11203091e-01],
       ...,
       [6.82927750e-04, 0.00000000e+00, 4.37316716e-01, ...,
        2.23404255e+00, 1.00080690e+00, 1.55629139e-01],
       [1.23176518e-03, 0.00000000e+00, 4.37316716e-01, ...,
        2.23404255e+00, 9.92107519e-01, 1.78807947e-01],
       [5.32876969e-04, 0.00000000e+00, 4.37316716e-01, ...,
        2.23404255e+00, 1.00080690e+00, 2.17439294e-01]])

In [21]:
# Center first, then divide by the range: chained together, the two scalers
# give the same values as mean_normalizer(data).
range_scaler.transform(mean_scaler.transform(data))

array([[-0.0405441 ,  0.06636364, -0.32356227, ..., -0.33569506,
         0.10143217, -0.21172912],
       [-0.04030818, -0.11363636, -0.14907546, ..., -0.06973762,
         0.10143217, -0.09693883],
       [-0.0403084 , -0.11363636, -0.14907546, ..., -0.06973762,
         0.09116942, -0.23794325],
       ...,
       [-0.03993221, -0.11363636,  0.02907703, ...,  0.27068792,
         0.10143217, -0.1935172 ],
       [-0.03938337, -0.11363636,  0.02907703, ...,  0.27068792,
         0.09273279, -0.17033839],
       [-0.04008226, -0.11363636,  0.02907703, ...,  0.27068792,
         0.10143217, -0.13170704]])