#### **Written by: Yousuf Shah**
#### **Subject: Feature Normalization & Encoding**
#### **Date: 07.01.2024**
#### **Email: contact.ys09@gmail.com**

# Feature Normalization
---

In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 1. Standard Scalar Z-Score Normalization

In [2]:
# sample data
data={'score':[10, 20, 30, 40, 50]}
df=pd.DataFrame(data)

In [3]:
# z-score scaling
mean=df['score'].mean()
std=df['score'].std()
data_points=df

In [4]:
data_points

Unnamed: 0,score
0,10
1,20
2,30
3,40
4,50


In [5]:
# formulla of z-score scaling
z_score=(data_points-mean)/std

In [6]:
# showing result
z_score

Unnamed: 0,score
0,-1.264911
1,-0.632456
2,0.0
3,0.632456
4,1.264911


## 2. Min Max Scaling

In [7]:
# min max scaling and there formulla
min_max_scaling=(data_points-data_points['score'].min())/(data_points['score'].max()-data_points['score'].min())

In [8]:
# showing result
min_max_scaling

Unnamed: 0,score
0,0.0
1,0.25
2,0.5
3,0.75
4,1.0


## 3. Robust Scaling

In [9]:
# Data Points
data_points=df
# Find Median 
median=df['score'].median()
# First Quartile
Q1=df.quantile(0.25)
# Third Quartile
Q3=df.quantile(0.75)
# Inter Quartile Range
IQR=Q1-Q3
# Formulla of Robust Scaling
robust_scaling=(data_points-median)/IQR
# Showing result
robust_scaling

Unnamed: 0,score
0,1.0
1,0.5
2,-0.0
3,-0.5
4,-1.0


In [10]:
import pandas as pd
from sklearn.preprocessing import RobustScaler

# sample data with outliers
data = {
    'value': [10, 20, 30, 40, 50, 10000]
}
df = pd.DataFrame(data)

# Robust Scaling
scaler = RobustScaler()
df['robust_value'] = scaler.fit_transform(df[['value']])

df




Unnamed: 0,value,robust_value
0,10,-1.0
1,20,-0.6
2,30,-0.2
3,40,0.2
4,50,0.6
5,10000,398.6


## 4. Logrithmic scaling/ Normalization

In [11]:
#random data with outliers
data = {'big_numbers': [10000, 20000, 30000, 1000000, 50000]}
df = pd.DataFrame(data)
df['log_numbers']=np.log(df['big_numbers'])
df['log2_numbers']=np.log2(df['big_numbers'])
df['log10_numbers']=np.log10(df['big_numbers'])
df

Unnamed: 0,big_numbers,log_numbers,log2_numbers,log10_numbers
0,10000,9.21034,13.287712,4.0
1,20000,9.903488,14.287712,4.30103
2,30000,10.308953,14.872675,4.477121
3,1000000,13.815511,19.931569,6.0
4,50000,10.819778,15.60964,4.69897


# feature Encoding

## 1. One hot encoding

In [12]:
# Sample data
data = {'Color': ['Red', 'Green', 'Blue', 'Red']}
df = pd.DataFrame(data)
print(df.head())
# One hot encoding
encoded_data=pd.get_dummies(df, columns=['Color'])
print(encoded_data)

   Color
0    Red
1  Green
2   Blue
3    Red
   Color_Blue  Color_Green  Color_Red
0       False        False       True
1       False         True      False
2        True        False      False
3       False        False       True


 # 2. Label Encoding

In [13]:
from sklearn.preprocessing import LabelEncoder
# Sample data
data = {'Color': ['Red', 'Green', 'Blue', 'Red'],
       'animal':['cat','dog','elephant','lion']}

df = pd.DataFrame(data)
print(df.head())
# Label encoding
label_encoding=LabelEncoder()
df['encoded_column_color']=label_encoding.fit_transform(df['Color'])
df['encoded_column_animal']=label_encoding.fit_transform(df['Color'])

print(df.head())

   Color    animal
0    Red       cat
1  Green       dog
2   Blue  elephant
3    Red      lion
   Color    animal  encoded_column_color  encoded_column_animal
0    Red       cat                     2                      2
1  Green       dog                     1                      1
2   Blue  elephant                     0                      0
3    Red      lion                     2                      2


# 3. Ordinal Encoding

In [14]:
from sklearn.preprocessing import OrdinalEncoder
# Sample data
data = {'Size': ['Small', 'Medium', 'Large', 'Medium']}
df = pd.DataFrame(data)
print(df)
# Ordinal Encoding
ordinal_encoding=OrdinalEncoder(categories=[['Small','Medium','Large']])
df['ordinal_encoded']=ordinal_encoding.fit_transform(df[['Size']])
df.head()


     Size
0   Small
1  Medium
2   Large
3  Medium


Unnamed: 0,Size,ordinal_encoded
0,Small,0.0
1,Medium,1.0
2,Large,2.0
3,Medium,1.0
