### Standardization and normalization

In [20]:
import numpy as np
import pandas as pd

In [21]:
# Small 1-D integer sample (0 through 5) reused by the scaling examples below.
x = np.arange(6)
print(x)

[0 1 2 3 4 5]


In [22]:
def standardize(a):
    """Return the z-scores of ``a``: ``(a - mean(a)) / std(a)``.

    The mean and standard deviation are taken over all elements of ``a``
    using ``np.mean`` and ``np.std`` with their default arguments.

    Parameters
    ----------
    a : array-like of numbers
        Values to standardize.

    Returns
    -------
    Same shape as ``a``, with the mean removed and divided by the standard
    deviation. If the standard deviation is exactly zero (constant input)
    the centered values, which are all zeros, are returned instead of
    dividing 0 by 0 and producing NaN.
    """
    centered = a - np.mean(a)
    spread = np.std(a)
    # A zero spread would make every element 0/0 = NaN; dividing by 1 keeps
    # the all-zero centered values, which is what scikit-learn's
    # StandardScaler does for zero-variance features.
    scale = spread if spread != 0 else 1.0
    return centered / scale

In [23]:
# Z-score the sample array with the standardize() helper and show the result.
std_array = standardize(x)
print(std_array)

[-1.46385011 -0.87831007 -0.29277002  0.29277002  0.87831007  1.46385011]


In [24]:
def normalize(a):
    """Min-max scale ``a``: ``(a - min(a)) / (max(a) - min(a))``.

    The minimum and maximum are taken over all elements of ``a`` with
    ``np.min`` / ``np.max``, so lists and multi-dimensional arrays are
    accepted as well as 1-D arrays.

    Parameters
    ----------
    a : array-like of numbers
        Values to rescale. Must be non-empty (``np.min`` raises
        ``ValueError`` on an empty input).

    Returns
    -------
    Same shape as ``a``, with the smallest element mapped to 0 and the
    largest to 1. If all elements are equal the result is all zeros rather
    than NaN.
    """
    lowest = np.min(a)
    value_range = np.max(a) - lowest
    # A zero range (constant input) would give 0/0 = NaN; dividing by 1
    # yields zeros instead, which is what scikit-learn's MinMaxScaler does
    # for constant features.
    scale = value_range if value_range != 0 else 1.0
    return (a - lowest) / scale

In [25]:
# Min-max scale the sample array with the normalize() helper and show the result.
norm_array = normalize(x)
print(norm_array)

[0.  0.2 0.4 0.6 0.8 1. ]


In [28]:
# View the sample array as a pandas Series; as the cell's last expression it is displayed directly.
pd.Series(x)

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int32

In [51]:
# One single-column frame per version of the sample: raw, z-scored, min-max scaled.
df1, df2, df3 = (
    pd.DataFrame({column: values})
    for column, values in (('X', x), ('std_x', std_array), ('nor_x', norm_array))
)

# Put the three columns side by side so each row shows one value in all three scales.
df = pd.concat([df1, df2, df3], axis=1)
print(df)

   X    std_x  nor_x
0  0 -1.46385    0.0
1  1 -0.87831    0.2
2  2 -0.29277    0.4
3  3  0.29277    0.6
4  4  0.87831    0.8
5  5  1.46385    1.0


### Using scikit-learn

In [67]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Same six values as a column: one row per sample, one feature per row.
x_2d = [[value] for value in range(6)]

# Z-score scaling with scikit-learn.
std = StandardScaler()
x_std = std.fit_transform(x_2d)

# Min-max scaling with scikit-learn.
norm = MinMaxScaler()
x_norm = norm.fit_transform(x_2d)

print(x_std, "\n\n")
print(x_norm)

[[-1.46385011]
 [-0.87831007]
 [-0.29277002]
 [ 0.29277002]
 [ 0.87831007]
 [ 1.46385011]] 


[[0. ]
 [0.2]
 [0.4]
 [0.6]
 [0.8]
 [1. ]]


In [66]:
# One single-column frame per version of the sample: raw, StandardScaler output, MinMaxScaler output.
df1, df2, df3 = (
    pd.DataFrame(values, columns=[column])
    for column, values in (("X", x), ("std_X", x_std), ("norm_X", x_norm))
)

# Put the three columns side by side for a row-by-row comparison.
df = pd.concat([df1, df2, df3], axis=1)
print(df)

   X    std_X  norm_X
0  0 -1.46385     0.0
1  1 -0.87831     0.2
2  2 -0.29277     0.4
3  3  0.29277     0.6
4  4  0.87831     0.8
5  5  1.46385     1.0


## Train/test split

In [69]:
# Append a binary label column: three 0 labels followed by three 1 labels.
data = pd.concat(
    [df, pd.DataFrame({"Target": [0] * 3 + [1] * 3})],
    axis=1,
)
print(data)

   X    std_X  norm_X  Target
0  0 -1.46385     0.0       0
1  1 -0.87831     0.2       0
2  2 -0.29277     0.4       0
3  3  0.29277     0.6       1
4  4  0.87831     0.8       1
5  5  1.46385     1.0       1


In [74]:
# Separate the feature columns (everything except the label) from the label column.
X = data.drop(columns=["Target"])
y = data["Target"]

# Show each part together with its shape.
print(X, "\n", X.shape)
print(y, "\n", y.shape)

   X    std_X  norm_X
0  0 -1.46385     0.0
1  1 -0.87831     0.2
2  2 -0.29277     0.4
3  3  0.29277     0.6
4  4  0.87831     0.8
5  5  1.46385     1.0 
 (6, 3)
0    0
1    0
2    0
3    1
4    1
5    1
Name: Target, dtype: int64 
 (6,)


In [78]:
from sklearn.model_selection import train_test_split

# Hold out a test_size=0.3 share of the rows; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

# Shapes first, then the rows that landed in each part.
print(X_train.shape)
print(X_test.shape)
print(X_train)
print(X_test)

(4, 3)
(2, 3)
   X    std_X  norm_X
0  0 -1.46385     0.0
3  3  0.29277     0.6
4  4  0.87831     0.8
1  1 -0.87831     0.2
   X    std_X  norm_X
2  2 -0.29277     0.4
5  5  1.46385     1.0
