## Load the standard libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load the data

In [69]:
data = pd.read_csv('Data.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Drop the missing values

In [70]:
data = data.dropna()
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Seperate X and y 

In [71]:
X = data.drop('Purchased', axis = 1)
y = data['Purchased']

## Split the data into train set and test set

In [72]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Feature Scaling Techniques

Note: Feature Scaling is only applied on Numerical columns

There are 3 types of Feature Scaling techniques
1. StandardScaler
2. RobustScaler
3. MinMaxScaler

## 1. Standard Scaling

In [39]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss

In [18]:
X_train[['Age', 'Salary']] = ss.fit_transform(X_train[['Age', 'Salary']])
X_train

Unnamed: 0,Country,Age,Salary
0,France,0.777714,0.638977
9,France,-0.311086,0.120887
2,Germany,-1.399885,-1.226144
5,France,-0.622171,-0.811673
3,Spain,-0.155543,-0.500819
8,Germany,1.71097,1.778773


## How does Standard Scaler operate?

In [19]:
li = [525, 750, 350]

li_1 = 525
li_2 = 750
li_3 = 350

In [21]:
np.mean(li)

541.6666666666666

In [23]:
np.std(li)

163.72402253657083

In [25]:
(525 - np.mean(li)) / np.std(li)

-0.10179731971185729

In [26]:
(750 - np.mean(li)) / np.std(li)

1.2724664963982193

In [27]:
(350 - np.mean(li)) / np.std(li)

-1.1706691766863613

In [51]:
X_test

Unnamed: 0,Country,Age,Salary
1,Spain,27.0,48000.0
7,France,48.0,79000.0


In [48]:
np.mean(data['Age'])

38.625

In [49]:
data['Age'].std()

8.210402808980463

In [50]:
(27.0 - np.mean(data['Age']))/ data['Age'].std()

-1.4158866830851078

In [32]:
(48 - 37.5) / 14.84

0.7075471698113207

## Feature Scaling the X_test set

In [33]:
X_test[['Age', 'Salary']] = ss.fit_transform(X_test[['Age', 'Salary']])
X_test

Unnamed: 0,Country,Age,Salary
1,Spain,-1.0,-1.0
7,France,1.0,1.0


## 2. MinMax Scaling

In [58]:
X_train

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
9,France,37.0,67000.0
2,Germany,30.0,54000.0
5,France,35.0,58000.0
3,Spain,38.0,61000.0
8,Germany,50.0,83000.0


In [59]:
X_test

Unnamed: 0,Country,Age,Salary
1,Spain,27.0,48000.0
7,France,48.0,79000.0


In [60]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

In [61]:
X_train[['Age', 'Salary']]  = mms.fit_transform(X_train[['Age', 'Salary']])
X_train

Unnamed: 0,Country,Age,Salary
0,France,0.7,0.62069
9,France,0.35,0.448276
2,Germany,0.0,0.0
5,France,0.25,0.137931
3,Spain,0.4,0.241379
8,Germany,1.0,1.0


## How does this operate?

In [62]:
li = [525, 450, 750]
li

[525, 450, 750]

In [63]:
np.max(li)

750

In [64]:
np.min(li)

450

In [65]:
(525 - np.min(li)) / (np.max(li) - np.min(li))

0.25

In [66]:
(450 - np.min(li)) / (np.max(li) - np.min(li))

0.0

In [67]:
(750 - np.min(li)) / (np.max(li) - np.min(li))

1.0

## Fit the MinMaxScaler on X_test set 

In [68]:
X_test[['Age', 'Salary']] = mms.fit_transform(X_test[['Age', 'Salary']])
X_test

Unnamed: 0,Country,Age,Salary
1,Spain,0.0,0.0
7,France,1.0,1.0


## Robust Scaling

In [73]:
X_train

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
9,France,37.0,67000.0
2,Germany,30.0,54000.0
5,France,35.0,58000.0
3,Spain,38.0,61000.0
8,Germany,50.0,83000.0


In [74]:
X_test

Unnamed: 0,Country,Age,Salary
1,Spain,27.0,48000.0
7,France,48.0,79000.0


## Apply Robust Scaling on train data

In [75]:
from sklearn.preprocessing import RobustScaler
rs = RobustScaler()
rs

In [76]:
X_train[['Age', 'Salary']] = rs.fit_transform(X_train[['Age', 'Salary']])
X_train

Unnamed: 0,Country,Age,Salary
0,France,0.928571,0.666667
9,France,-0.071429,0.25
2,Germany,-1.071429,-0.833333
5,France,-0.357143,-0.5
3,Spain,0.071429,-0.25
8,Germany,1.785714,1.583333


## How does Robust Scaling operate

In [77]:
li

[525, 450, 750]

In [78]:
np.median(li)

525.0

In [80]:
iqr = np.quantile(li, 0.75) - np.quantile(li, 0.25)
iqr

150.0

In [82]:
(525 - np.median(li)) / iqr

0.0

In [83]:
(450 - np.median(li)) / iqr

-0.5

In [85]:
(750 - np.median(li)) / iqr

1.5

## Apply the robust Scaling on test set

In [86]:
X_test[['Age', 'Salary']] = rs.fit_transform(X_test[['Age', 'Salary']])
X_test

Unnamed: 0,Country,Age,Salary
1,Spain,-1.0,-1.0
7,France,1.0,1.0
