In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from pandas / seaborn; consider
# narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

In [27]:
# Read the raw records from Data.csv (path is relative to the working
# directory) and display the resulting frame.
csv_path = "Data.csv"
data = pd.read_csv(csv_path)
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,,Yes
2,Germany,30.0,54000.0,No
3,Spain,,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,,Yes
6,,40.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,,83000.0,No
9,France,37.0,67000.0,Yes


In [28]:
from sklearn.impute import SimpleImputer

# Imputer that fills each missing entry with the most frequent value of its column.
si = SimpleImputer(
    strategy='most_frequent',
)
si

In [29]:
# Fill the missing entries of Country, Age and Salary with each column's most
# frequent value.
impute_cols = ['Country', 'Age', 'Salary']
data[impute_cols] = si.fit_transform(data[impute_cols])

# Country is text, so the imputer hands back one object-dtype array and Age /
# Salary are stored as object columns after the assignment above. Restore
# float64 so describe(), min/max and the scalers treat them as numbers.
data = data.astype({'Age': 'float64', 'Salary': 'float64'})
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,52000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,40.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,52000.0,Yes
6,France,40.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,40.0,83000.0,No
9,France,37.0,67000.0,Yes


In [46]:
# Take an independent (deep) copy of `data` under the name `df` and display it.
df = data.copy(deep=True)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,52000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,40.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,52000.0,Yes
6,France,40.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,40.0,83000.0,No
9,France,37.0,67000.0,Yes


# Feature Scaling
- Bringing down range of numerical variables to a common scale.

In [16]:
# Range of the Age column as a (min, max) pair.
age = data['Age']
(age.min(), age.max())

(27.0, 49.0)

In [17]:
# Range of the Salary column as a (min, max) pair.
salary = data['Salary']
(salary.min(), salary.max())

(52000.0, 83000.0)

- Age column values lie between 27 and 49.
- Salary column values lie between 52000 and 83000.

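- When a machine learning algorithm is applied to a dataset whose columns are on different scales, algorithms that rely on distances or gradient magnitudes (e.g. KNN, SVM, or linear models trained with gradient descent) end up dominated by Salary simply because its values are numerically larger, while Age contributes very little because its values are small.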

# To fix this issue we perform Feature Scaling on the dataset

### Types of Feature Scaling:
1. Standard Scaling - Standardization
2. Min Max Scaling - Normalization
3. Maximum Absolute Scaling(Max Abs Scaling)
4. Robust Scaling

# Standard Scaling - Standardization:
- We bring all the numerical variables to a similar scale by centering each variable at zero and scaling it to unit variance (mean = 0, variance = 1).
- z = (x - x_mean) / x_std

In [18]:
# Show the current contents of `data` before the StandardScaler is applied.
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,52000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,40.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,52000.0,Yes
6,France,40.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,40.0,83000.0,No
9,France,37.0,67000.0,Yes


In [19]:
from sklearn.preprocessing import StandardScaler

# Standardizer with default settings; echo it to show its configuration.
ss = StandardScaler()
ss

In [20]:
# Standardize Age and Salary (zero mean, unit variance per column).
# The result is written to a separate frame so that `data` keeps its unscaled
# values: later cells (data.info(), data.describe()) still read the original
# numbers, and re-running this cell always starts from the same input.
data_standardized = data.copy()
data_standardized[['Age', 'Salary']] = ss.fit_transform(data[['Age', 'Salary']])
data_standardized

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.76337,0.86284,No
1,Spain,-1.880152,-0.962399,Yes
2,Germany,-1.413648,-0.779875,No
3,Spain,0.141365,-0.141041,No
4,Germany,0.141365,0.132745,Yes
5,France,-0.636142,-0.962399,Yes
6,France,0.141365,-0.962399,No
7,France,1.385375,1.501674,Yes
8,Germany,0.141365,1.866722,No
9,France,-0.325139,0.406531,Yes


# 2. Min - Max Scaling - Normalization

- Used to rescale the values of each numerical variable to the range 0 to 1.
- X_scaled = (X - X_min) / (X_max - X_min)

In [31]:
# Show the current contents of `df` before the MinMaxScaler is applied.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,52000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,40.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,52000.0,Yes
6,France,40.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,40.0,83000.0,No
9,France,37.0,67000.0,Yes


In [32]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler with default settings; echo it to show its configuration.
mms = MinMaxScaler()
mms

In [33]:
# Min-max scale Age and Salary.
# The result goes into its own frame instead of overwriting `df`: the scaler
# cells further down also read `df`, and mutating it here would feed them
# already-scaled values when the notebook is run top to bottom.
df_minmax = df.copy()
df_minmax[['Age', 'Salary']] = mms.fit_transform(df[['Age', 'Salary']])
df_minmax

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.772727,0.645161,No
1,Spain,0.0,0.0,Yes
2,Germany,0.136364,0.064516,No
3,Spain,0.590909,0.290323,No
4,Germany,0.590909,0.387097,Yes
5,France,0.363636,0.0,Yes
6,France,0.590909,0.0,No
7,France,0.954545,0.870968,Yes
8,Germany,0.590909,1.0,No
9,France,0.454545,0.483871,Yes


# Max Abs Scaling:

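- Similar to Min-Max scaling, but it scales each feature by dividing it by that feature's maximum absolute value; the data is not shifted or centered, so values end up in the range -1 to 1.
- X_scaled = X / max(abs(X))
- This is useful for sparse data or data that is already centered at zero. It does not reduce the effect of outliers: a single extreme value sets the scale for the whole column.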

In [39]:
# Show the current contents of `df` before the MaxAbsScaler is applied.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,52000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,40.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,52000.0,Yes
6,France,40.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,40.0,83000.0,No
9,France,37.0,67000.0,Yes


In [37]:
from sklearn.preprocessing import MaxAbsScaler

# Max-abs scaler with default settings; echo it to show its configuration.
mas = MaxAbsScaler()
mas

In [41]:
# Max-abs scale Age and Salary (each column divided by its largest absolute value).
# The result goes into its own frame instead of overwriting `df`, so `df`
# is left unchanged for the cells that read it afterwards and re-running this
# cell always starts from the same input.
df_maxabs = df.copy()
df_maxabs[['Age', 'Salary']] = mas.fit_transform(df[['Age', 'Salary']])
df_maxabs

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.897959,0.86747,No
1,Spain,0.55102,0.626506,Yes
2,Germany,0.612245,0.650602,No
3,Spain,0.816327,0.73494,No
4,Germany,0.816327,0.771084,Yes
5,France,0.714286,0.626506,Yes
6,France,0.816327,0.626506,No
7,France,0.979592,0.951807,Yes
8,Germany,0.816327,1.0,No
9,France,0.755102,0.807229,Yes


# Robust Scaler

- It scales the features using the median and quantiles: the median is subtracted from every observation, and the result is divided by the interquartile range (IQR). The IQR is the difference between the 75th and the 25th percentile. Because the median and IQR are barely affected by extreme values, this scaler is the one to prefer when the data contains outliers.

- Rob_Scaler = (X - X_median) / IQR
- IQR = 75%ile - 25%ile

In [44]:
# Make sure both numeric columns are float-typed, then print the frame summary.
numeric_cols = ['Age', 'Salary']
data[numeric_cols] = data[numeric_cols].astype('float')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    11 non-null     object 
 1   Age        11 non-null     float64
 2   Salary     11 non-null     float64
 3   Purchased  11 non-null     object 
dtypes: float64(2), object(2)
memory usage: 480.0+ bytes


In [45]:
# Summary statistics of the numeric columns; the 25% / 50% / 75% rows are the
# quartiles that the Robust Scaler formula above refers to.
data.describe()

Unnamed: 0,Age,Salary
count,11.0,11.0
mean,39.090909,62545.454545
std,6.744695,11492.289906
min,27.0,52000.0
25%,36.0,52000.0
50%,40.0,61000.0
75%,42.0,69500.0
max,49.0,83000.0


In [47]:
# Show the current contents of `df` before the RobustScaler is applied.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,52000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,40.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,52000.0,Yes
6,France,40.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,40.0,83000.0,No
9,France,37.0,67000.0,Yes


In [48]:
from sklearn.preprocessing import RobustScaler

# Robust scaler with default settings; echo it to show its configuration.
rs = RobustScaler()
rs

In [49]:
# Robust-scale Age and Salary (subtract the median, divide by the IQR).
# The result goes into its own frame instead of overwriting `df`, so `df`
# is left unchanged and re-running this cell always starts from the same input.
df_robust = df.copy()
df_robust[['Age', 'Salary']] = rs.fit_transform(df[['Age', 'Salary']])
df_robust

Unnamed: 0,Country,Age,Salary,Purchased
0,France,0.666667,0.628571,No
1,Spain,-2.166667,-0.514286,Yes
2,Germany,-1.666667,-0.4,No
3,Spain,0.0,0.0,No
4,Germany,0.0,0.171429,Yes
5,France,-0.833333,-0.514286,Yes
6,France,0.0,-0.514286,No
7,France,1.333333,1.028571,Yes
8,Germany,0.0,1.257143,No
9,France,-0.5,0.342857,Yes


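- Every scaling technique brings the columns onto a comparable range, which removes the high-priority / low-priority effect caused by differing magnitudes. Note that only Standard and Robust scaling center the values around zero; Min-Max maps them to 0 to 1, and Max Abs divides by the largest absolute value without shifting.
- Once scaling is done, all columns are on a comparable scale.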

In [51]:
# Load seaborn's "tips" example dataset and preview its first five rows.
tips = sns.load_dataset("tips")
tips.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [53]:
# Smallest and largest value in the tip column.
tip_amounts = tips['tip']
tip_amounts.agg(['min', 'max'])

min     1.0
max    10.0
Name: tip, dtype: float64

In [54]:
# Smallest and largest value in the total_bill column.
bill_amounts = tips['total_bill']
bill_amounts.agg(['min', 'max'])

min     3.07
max    50.81
Name: total_bill, dtype: float64

In [55]:
# Standardize the tip and total_bill columns of `tips` in place, re-fitting
# the existing StandardScaler instance `ss` on these two columns.
money_cols = ['tip', 'total_bill']
tips[money_cols] = ss.fit_transform(tips[money_cols])
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,-0.314711,-1.439947,Female,No,Sun,Dinner,2
1,-1.063235,-0.969205,Male,No,Sun,Dinner,3
2,0.137780,0.363356,Male,No,Sun,Dinner,3
3,0.438315,0.225754,Male,No,Sun,Dinner,2
4,0.540745,0.443020,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,1.040511,2.115963,Male,No,Sat,Dinner,3
240,0.832275,-0.722971,Female,Yes,Sat,Dinner,2
241,0.324630,-0.722971,Male,Yes,Sat,Dinner,2
242,-0.221287,-0.904026,Male,No,Sat,Dinner,2
