##### $\hspace{15pt}$ **Filename: dataStandardization.ipynb**
##### $\hspace{1.5pt}$ **Date Created: October 30, 2023**
##### **Date Modified: January 11, 2024**
##### $\rule{8.25in}{1pt}$
##### **Standardize data using basic `pandas` operations or using the `fit`, `transform`, and `fit_transform` methods of `sklearn.preprocessing.StandardScaler()`.**
##### $\rule{8.25in}{1pt}$

##### Load modules and packages.

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

##### Create data for demonstration.

In [None]:
# Fix the legacy global NumPy seed so the demonstration data is reproducible.
np.random.seed(1)

# Draw a 10 x 3 block of uniform [0, 1) values, rescale to [0, 100), and keep two decimals.
scaledDraws = np.round(np.random.rand(10, 3) * 100, decimals = 2)
data = pd.DataFrame(scaledDraws, columns = ["x1", "x2", "x3"])
data

Unnamed: 0,x1,x2,x3
0,41.7,72.03,0.01
1,30.23,14.68,9.23
2,18.63,34.56,39.68
3,53.88,41.92,68.52
4,20.45,87.81,2.74
5,67.05,41.73,55.87
6,14.04,19.81,80.07
7,96.83,31.34,69.23
8,87.64,89.46,8.5
9,3.91,16.98,87.81


$\hspace{1in}$

##### **Standardize data using basic `pandas` operations.**

##### Calculate the mean of each column.

In [None]:
# Average down the rows, giving one mean per column (a Series indexed by column name).
columnMeans = data.mean(axis = "index")
columnMeans

x1    43.436
x2    45.032
x3    42.166
dtype: float64

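##### Calculate the standard deviation of each column.

##### Use the population standard deviation (`ddof = 0`): `pandas` defaults to the sample standard deviation (`ddof = 1`), whereas `StandardScaler` divides by the population standard deviation.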

In [None]:
# ddof = 0 selects the population standard deviation (divide by n) instead of
# the pandas default sample estimate (ddof = 1, divide by n - 1).
columnSds = data.std(ddof = 0, axis = "index")
columnSds

x1    30.363047
x2    26.822986
x3    32.696756
dtype: float64

##### Standardize each column.

In [None]:
# Center each column on its mean, then scale by its standard deviation.
# Both Series are aligned to the frame by column label.
centeredData = data.sub(columnMeans, axis = "columns")
standardizedData_v1 = centeredData.div(columnSds, axis = "columns")
standardizedData_v1

Unnamed: 0,x1,x2,x3
0,-0.057175,1.006525,-1.289302
1,-0.434937,-1.131567,-1.007317
2,-0.81698,-0.390411,-0.076032
3,0.343971,-0.11602,0.806013
4,-0.757039,1.594826,-1.205808
5,0.777722,-0.123103,0.419124
6,-0.968151,-0.940313,1.159259
7,1.758519,-0.510458,0.827727
8,1.455849,1.656341,-1.029643
9,-1.30178,-1.045819,1.395979


$\hspace{1in}$


##### **Standardize data using the `fit` and `transform` methods of `sklearn.preprocessing.StandardScaler()`.**

##### Instantiate the StandardScaler class and fit the instance to the data.

In [None]:
# `fit` returns the estimator itself, so construction and fitting can be chained.
scaler = StandardScaler().fit(data)

# Show the fitted scaler as the cell output.
scaler

##### Display the calculated mean of each column.

In [None]:
# Per-column means stored on the scaler by the earlier `scaler.fit(data)` call;
# the values shown below equal `columnMeans` from the pandas section.
scaler.mean_

array([43.436, 45.032, 42.166])

##### Display the calculated standard deviation of each column.

In [None]:
# Per-column scaling factors stored on the scaler by the earlier `scaler.fit(data)` call;
# the values shown below equal `columnSds`, i.e. the standard deviation with ddof = 0.
scaler.scale_

array([30.36304734, 26.82298596, 32.69675617])

##### Apply the transform method to standardize each column.

In [None]:
# Apply the already-fitted scaler and wrap the resulting array in a DataFrame.
# Column labels and the row index are taken from `data` itself rather than
# retyped by hand, so the result stays aligned with the input if `data` changes.
standardizedData_v2 = pd.DataFrame(scaler.transform(data), columns = data.columns, index = data.index)
standardizedData_v2

Unnamed: 0,x1,x2,x3
0,-0.057175,1.006525,-1.289302
1,-0.434937,-1.131567,-1.007317
2,-0.81698,-0.390411,-0.076032
3,0.343971,-0.11602,0.806013
4,-0.757039,1.594826,-1.205808
5,0.777722,-0.123103,0.419124
6,-0.968151,-0.940313,1.159259
7,1.758519,-0.510458,0.827727
8,1.455849,1.656341,-1.029643
9,-1.30178,-1.045819,1.395979


$\hspace{1in}$

##### **Standardize data using the `fit_transform` method of `sklearn.preprocessing.StandardScaler()`.**

##### Instantiate the StandardScaler class and apply the fit_transform method to the data.

In [None]:
# A fresh scaler: `fit_transform` learns the per-column statistics and applies
# them in one call.
scaler = StandardScaler()

# Column labels and the row index are taken from `data` itself rather than
# retyped by hand, so the result stays aligned with the input if `data` changes.
standardizedData_v3 = pd.DataFrame(scaler.fit_transform(data), columns = data.columns, index = data.index)
standardizedData_v3

Unnamed: 0,x1,x2,x3
0,-0.057175,1.006525,-1.289302
1,-0.434937,-1.131567,-1.007317
2,-0.81698,-0.390411,-0.076032
3,0.343971,-0.11602,0.806013
4,-0.757039,1.594826,-1.205808
5,0.777722,-0.123103,0.419124
6,-0.968151,-0.940313,1.159259
7,1.758519,-0.510458,0.827727
8,1.455849,1.656341,-1.029643
9,-1.30178,-1.045819,1.395979
