In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [6]:
data = pd.read_csv('Data.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000,No
1,Spain,27,48000,Yes
2,Germany,30,54000,No
3,Spain,38,61000,No
4,Germany,40,64000,Yes
5,France,35,58000,Yes
6,Spain,40,52000,No
7,France,48,79000,Yes
8,Germany,50,83000,No
9,France,37,67000,Yes


#### Age and Salary column are having values on a cmplete different scale. eg: The range of Age is 0 - 100. But the range of salary is 10000+ - 1000000+

#### Since the scale of these 2 numerical variable is different hence we perform Feature Scaling

### Types of Feature Scaling

1. Standard Scaler
2. Min Max Scaler
3. Robust Scaler 
4. Decimal Scaler

### Min Max Scaler

In [3]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

In [4]:
data['Age'] = mms.fit_transform(data[['Age']])
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.73913,72000,No
1,Spain,0.0,48000,Yes
2,Germany,0.130435,54000,No
3,Spain,0.478261,61000,No
4,Germany,0.565217,64000,Yes
5,France,0.347826,58000,Yes
6,Spain,0.565217,52000,No
7,France,0.913043,79000,Yes
8,Germany,1.0,83000,No
9,France,0.434783,67000,Yes


In [7]:
## Age Column first row value is 44
## Age column min = 27 and max = 50

In [8]:
(44 - 27) / (50 - 27)

0.7391304347826086

In [9]:
## Age Column second row value is 27
## Age column min = 27 and max = 50

In [10]:
(27 - 27) / (50 - 27)

0.0

In [11]:
### Salary first row = 72000
## Min Salary = 48000 & max salary = 98000

In [12]:
## (value - min) / (max - min)

(72000 - 48000) / (98000 - 48000)

0.48

In [14]:
data['Salary'] = mms.fit_transform(data[['Salary']])
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,0.48,No
1,Spain,27,0.0,Yes
2,Germany,30,0.12,No
3,Spain,38,0.26,No
4,Germany,40,0.32,Yes
5,France,35,0.2,Yes
6,Spain,40,0.08,No
7,France,48,0.62,Yes
8,Germany,50,0.7,No
9,France,37,0.38,Yes


## Standard Scaling

In [18]:
data = pd.read_csv('Data.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000,No
1,Spain,27,48000,Yes
2,Germany,30,54000,No
3,Spain,38,61000,No
4,Germany,40,64000,Yes
5,France,35,58000,Yes
6,Spain,40,52000,No
7,France,48,79000,Yes
8,Germany,50,83000,No
9,France,37,67000,Yes


In [16]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss

In [17]:
data[['Age', 'Salary']] = ss.fit_transform(data[['Age', 'Salary']])
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.582139,0.354629,No
1,Spain,-1.784381,-1.317195,Yes
2,Germany,-1.36676,-0.899239,No
3,Spain,-0.253104,-0.411623,No
4,Germany,0.02531,-0.202645,Yes
5,France,-0.670725,-0.620601,Yes
6,Spain,0.02531,-1.038557,No
7,France,1.138967,0.842245,Yes
8,Germany,1.417381,1.120882,No
9,France,-0.392311,0.006333,Yes


In [19]:
data['Age'].mean()

39.81818181818182

In [20]:
data['Age'].std()

7.534164609539426

In [21]:
## Standard Scaling formala = (value - mean) / std

In [22]:
(44 - 39.81818181818182) / 7.534164609539426

0.5550473607284007

In [23]:
data['Salary'].mean()

66909.09090909091

In [24]:
data['Salary'].std()

15056.258137097315

In [25]:
## Salary

(52000 - 66909.09) / 15056.2581

-0.9902254531622301

## Robust Scaling

In [31]:
data = pd.read_csv('Data.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000,No
1,Spain,27,48000,Yes
2,Germany,30,54000,No
3,Spain,38,61000,No
4,Germany,40,64000,Yes
5,France,35,58000,Yes
6,Spain,40,52000,No
7,France,48,79000,Yes
8,Germany,50,83000,No
9,France,37,67000,Yes


In [27]:
from sklearn.preprocessing import RobustScaler
rs = RobustScaler()
rs

In [28]:
data[['Age', 'Salary']] = rs.fit_transform(data[['Age', 'Salary']])
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.4,0.410256,No
1,Spain,-1.3,-0.820513,Yes
2,Germany,-1.0,-0.512821,No
3,Spain,-0.2,-0.153846,No
4,Germany,0.0,0.0,Yes
5,France,-0.5,-0.307692,Yes
6,Spain,0.0,-0.615385,No
7,France,0.8,0.769231,Yes
8,Germany,1.0,0.974359,No
9,France,-0.3,0.153846,Yes


## How does the robust scaling operate?

In [30]:
## robust scaling formula : (value - median) / iqr   iqr = interquartile range

## Formula for calculating iqr = q3 - q1 (75% - 25%)

In [32]:
data['Age'].median()

40.0

In [34]:
iqr = data['Age'].quantile(0.75) - data['Age'].quantile(0.25)
iqr

10.0

In [35]:
(50 - 40) / 10

1.0

## Decimal Scaling

In [40]:
data = pd.read_csv('Data.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000,No
1,Spain,27,48000,Yes
2,Germany,30,54000,No
3,Spain,38,61000,No
4,Germany,40,64000,Yes
5,France,35,58000,Yes
6,Spain,40,52000,No
7,France,48,79000,Yes
8,Germany,50,83000,No
9,France,37,67000,Yes


In [37]:
from sklearn.preprocessing import MaxAbsScaler
mas = MaxAbsScaler()
mas

In [38]:
data[['Age', 'Salary']] = mas.fit_transform(data[['Age', 'Salary']])
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.88,0.734694,No
1,Spain,0.54,0.489796,Yes
2,Germany,0.6,0.55102,No
3,Spain,0.76,0.622449,No
4,Germany,0.8,0.653061,Yes
5,France,0.7,0.591837,Yes
6,Spain,0.8,0.530612,No
7,France,0.96,0.806122,Yes
8,Germany,1.0,0.846939,No
9,France,0.74,0.683673,Yes


In [39]:
### Tell me the formula for Max Abs Scaler - take 3 mins and do google search

## value / max

In [41]:
40 / 50

0.8

## Summary of Scaling

- Decimal Scaling = value / max
- Min Max Scaling = (value - min) / (max - min)
- Standard Scaling = (value - mean) / std
- RobustScaling = (value - median) / iqr

#### Which is the best?
RobustScaling

#### For numerical variables present in the data check the values if the values are at different scale perform Feature Scaling and only then apply Machine Learning Algorithms

#### If the values of numerical variables are already scaled then directly apply ML algorithms

# Feature Encoding 
- Converting Text columns in the data to numerical columns so that ML algorithms can be applied on the data

- Types of Feature Encoding

1. OneHotEncoding - if the column contains names of an object(eg: names of country, names of colors, names of states, names of products, etc)
2. LabelEncoding - When the column is ordinal (eg: ratings of a movie(excellent, good, bad, worse), grades of a students(A, B, C, D ,etc.), education qualification(eg: Schooling, High Schooling, Degree, Professional Course, Master's, PHD))

## Apply OneHotEncoding on the Country column

In [54]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output = False)
ohe

In [55]:
data['Country'] = ohe.fit_transform(data[['Country']])
data['Country']

0     1.0
1     0.0
2     0.0
3     0.0
4     0.0
5     1.0
6     0.0
7     1.0
8     0.0
9     1.0
10    1.0
Name: Country, dtype: float64

In [58]:
data = pd.read_csv('Data.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000,No
1,Spain,27,48000,Yes
2,Germany,30,54000,No
3,Spain,38,61000,No
4,Germany,40,64000,Yes
5,France,35,58000,Yes
6,Spain,40,52000,No
7,France,48,79000,Yes
8,Germany,50,83000,No
9,France,37,67000,Yes


In [59]:
pd.get_dummies(data)

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain,Purchased_No,Purchased_Yes
0,44,72000,1,0,0,1,0
1,27,48000,0,0,1,0,1
2,30,54000,0,1,0,1,0
3,38,61000,0,0,1,1,0
4,40,64000,0,1,0,0,1
5,35,58000,1,0,0,0,1
6,40,52000,0,0,1,1,0
7,48,79000,1,0,0,0,1
8,50,83000,0,1,0,1,0
9,37,67000,1,0,0,0,1


### Apply ML algorithms only when the numerical columns are properly scaled and all the text columns are properly encoded using either labelencoder or onehotencoder

### Missing value treatment, Feature Scaling and Feature Encoding all belong to Data preprocessing

- Perform Missing Value treatment, Feature Scaling and Feature Encoding in order.

In [60]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le

In [61]:
le.fit_transform(data[['Country']])

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0, 0])

France = 0
Spain = 2
Germany = 1

In [62]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000,No
1,Spain,27,48000,Yes
2,Germany,30,54000,No
3,Spain,38,61000,No
4,Germany,40,64000,Yes
5,France,35,58000,Yes
6,Spain,40,52000,No
7,France,48,79000,Yes
8,Germany,50,83000,No
9,France,37,67000,Yes
