>StandardScaler

In [1]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset and expose its feature matrix as a DataFrame whose
# columns are labelled with the dataset's feature names.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Report the per-feature mean and variance of the unscaled data so they can
# be compared with the scaled versions computed further down.
for heading, stats in (
    ('Mean Values of each Features:\n', iris_df.mean()),
    ('Variance of each Features:\n', iris_df.var()),
):
    print(heading, stats, sep='')

Mean Values of each Features:
sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64
Variance of each Features:
sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64


In [6]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature: fit() learns the per-column statistics from
# iris_df and transform() applies them. The result is a NumPy array, not a
# DataFrame, which the type() print below makes visible.
stand_scaler = StandardScaler()
scaled_iris_data = stand_scaler.fit(iris_df).transform(iris_df)
print('type of scaled_iris_data:', type(scaled_iris_data))

# Wrap the array in a DataFrame again so the feature names are preserved.
scaled_iris_df = pd.DataFrame(scaled_iris_data, columns=iris.feature_names)
print('#Standard Scaling Completed')

# The means come out as ~0 (floating-point residue). The variances print
# slightly above 1 because pandas' .var() defaults to the sample estimator
# (ddof=1) while StandardScaler scales with the population standard deviation
# (ddof=0), leaving a factor of n/(n-1).
for heading, stats in (
    ('Mean Values of each Features:\n', scaled_iris_df.mean()),
    ('Variances of each Features:\n', scaled_iris_df.var()),
):
    print(heading, stats, sep='')

# Last expression of the cell: rich display of the first scaled rows.
scaled_iris_df.head()

type of scaled_iris_data: <class 'numpy.ndarray'>
#Standard Scaling Completed
Mean Values of each Features:
sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64
Variances of each Features:
sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


>MinMaxScaler

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature with MinMaxScaler, fitted on the same frame that it
# transforms.
mm_scaler = MinMaxScaler()
scaled_iris_data2 = mm_scaler.fit(iris_df).transform(iris_df)

# Back to a DataFrame so the summary statistics are labelled per feature.
scaled_iris_df2 = pd.DataFrame(scaled_iris_data2, columns=iris.feature_names)
print('#MinMax Scaling Completed')

# Mean/variance describe the rescaled distribution; min/max confirm that each
# column now spans exactly 0.0 to 1.0.
for heading, stats in (
    ('Mean Values of each Features:\n', scaled_iris_df2.mean()),
    ('Variances of each Features:\n', scaled_iris_df2.var()),
    ('Min Values of each Features:\n', scaled_iris_df2.min()),
    ('Max Values of each Features:\n', scaled_iris_df2.max()),
):
    print(heading, stats, sep='')


#MinMax Scaling Completed
Mean Values of each Features:
sepal length (cm)    0.428704
sepal width (cm)     0.440556
petal length (cm)    0.467458
petal width (cm)     0.458056
dtype: float64
Variances of each Features:
sepal length (cm)    0.052908
sepal width (cm)     0.032983
petal length (cm)    0.089522
petal width (cm)     0.100869
dtype: float64
Min Values of each Features:
sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64
Max Values of each Features:
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64


>Points to Note when using sklearn scaling

In [14]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Toy data as single-column 2D arrays: train covers 0..10, test covers 0..5.
train_array = np.arange(11).reshape(-1, 1)
test_array = np.arange(6).reshape(-1, 1)

# Fit the scaler on the training data, then scale that same data.
scaler = MinMaxScaler()
train_scaled = scaler.fit(train_array).transform(train_array)

# Flatten to 1D purely for compact printing.
for label, values in (
    ('Original train_array:\n', train_array),
    ('Scaled train_array:\n', train_scaled),
):
    print(label, np.round(values.reshape(-1), 4))



Original train_array:
 [ 0  1  2  3  4  5  6  7  8  9 10]
Scaled train_array:
 [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]


In [15]:
# Wrong example (shown on purpose): calling fit() on test_array makes the
# scaler re-learn its min/max from the test data, so the test values are
# mapped onto a different scale than the training values were (5 -> 1.0 here).
test_scaled = scaler.fit(test_array).transform(test_array)

for label, values in (
    ('Original test_array:\n', test_array),
    ('Scaled test_array:\n', test_scaled),
):
    print(label, np.round(values.reshape(-1), 4))

Original test_array:
 [0 1 2 3 4 5]
Scaled test_array:
 [0.  0.2 0.4 0.6 0.8 1. ]


In [16]:
# Correct example: fit the scaler once on the training data and reuse that
# fitted scaler, without re-fitting, to transform the test data.
train_array = np.arange(11).reshape(-1, 1)
test_array = np.arange(6).reshape(-1, 1)

scaler = MinMaxScaler()
train_scaled = scaler.fit(train_array).transform(train_array)

for label, values in (
    ('Original train_array:\n', train_array),
    ('Scaled train_array:\n', train_scaled),
):
    print(label, np.round(values.reshape(-1), 4))

# transform() only: the test values are scaled with the training min/max,
# so 5 maps to 0.5 rather than 1.0.
test_scaled = scaler.transform(test_array)

for label, values in (
    ('Original test_array:\n', test_array),
    ('Scaled test_array:\n', test_scaled),
):
    print(label, np.round(values.reshape(-1), 4))

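#Two ways to avoid scaling mistakes
#1. If possible, apply scaling to the whole dataset first, then split it into train and test datasets
#   NOTE(review): fitting a scaler on the whole dataset lets test-set statistics influence training (data leakage);
#   scikit-learn's guidance is to split first and fit on the training data only - confirm option 1 is acceptable for your use case
#2. If option 1 is difficult to apply, DO NOT FIT THE SCALER ON TEST DATA; only transform it with the scaler fitted on the training data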

Original train_array:
 [ 0  1  2  3  4  5  6  7  8  9 10]
Scaled train_array:
 [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
Original test_array:
 [0 1 2 3 4 5]
Scaled test_array:
 [0.  0.1 0.2 0.3 0.4 0.5]
