# Data Encoding
1. Nominal / One Hot Encoding (OHE)
2. Label or Ordinal Encoding
3. Target Guided Ordinal Encoding

## Nominal / One Hot Encoding
It is a technique for representing categorical data as numerical data, a form that is more suitable for machine learning algorithms. Each category becomes its own 0/1 column.

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Build a one-column DataFrame holding a small categorical feature ('color')
df = pd.DataFrame(
    data=['red', 'blue', 'green', 'green', 'red', 'blue'],
    columns=['color'],
)
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red
5,blue


In [3]:
# Create a OneHotEncoder with its default settings (no arguments are passed).
# At this point it has not been fitted to any data.
encoder = OneHotEncoder()

In [4]:
# Fit the encoder on the 'color' column and encode it in a single step.
# The column is selected with double brackets so a DataFrame (2-D) is passed;
# .toarray() converts the result of fit_transform into an array.
encoded = (
    encoder
    .fit_transform(df[['color']])
    .toarray()
)

In [5]:
# Wrap the encoded array in a DataFrame, labelling each column with the
# feature name generated by the fitted encoder (e.g. color_blue).
encoder_df = pd.DataFrame(
    data=encoded,
    columns=encoder.get_feature_names_out(),
)
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [6]:
# seaborn is used here to obtain an example dataset
import seaborn as sns
# Load seaborn's 'tips' example dataset; it mixes numeric columns with
# categorical ones (sex, smoker, day, time) that are encoded below.
tips_df = sns.load_dataset('tips')

In [7]:
# Display the tips data to inspect its columns before encoding
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [11]:
# Alternative kept for reference (disabled): one encoder per categorical column.
# sex_encoder = OneHotEncoder()
# smoker_encoder = OneHotEncoder()
# days_encoder = OneHotEncoder()
# time_encoder = OneHotEncoder()
# A single encoder is used instead; it is fitted on all of the categorical
# columns together in the next step.
tips_encoder = OneHotEncoder()

In [12]:
# Per-column alternative kept for reference (disabled). Every call selects its
# column with double brackets so the encoder receives a 2-D DataFrame rather
# than a 1-D Series.
# encoded_sex_data = sex_encoder.fit_transform(tips_df[['sex']]).toarray()
# encoded_smoker_data = smoker_encoder.fit_transform(tips_df[['smoker']]).toarray()
# encoded_days_data = days_encoder.fit_transform(tips_df[['day']]).toarray()
# encoded_time_data = time_encoder.fit_transform(tips_df[['time']]).toarray()
# Encode all four categorical columns at once with the single shared encoder.
encoded_tips_data = (
    tips_encoder
    .fit_transform(tips_df[['sex', 'smoker', 'day', 'time']])
    .toarray()
)

In [14]:
# Wrap the encoded array in a DataFrame whose columns are named by the fitted
# encoder (e.g. sex_Female, day_Sun).
# The index is copied from tips_df so that the rows stay aligned with their
# source rows when the two frames are concatenated column-wise later on, even
# if tips_df does not carry the default 0..n-1 index (pd.concat with axis=1
# aligns rows by index label, not by position).
encoded_df = pd.DataFrame(
    encoded_tips_data,
    columns=tips_encoder.get_feature_names_out(),
    index=tips_df.index,
)
encoded_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [19]:
# Assemble the model-ready table: remove the original categorical columns from
# tips_df and place the one-hot columns to the right of the remaining
# numeric columns.
final_df = pd.concat(
    [
        tips_df.drop(columns=['sex', 'smoker', 'day', 'time']),
        encoded_df,
    ],
    axis=1,
)
final_df

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,10.34,1.66,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,21.01,3.50,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,23.68,3.31,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,24.59,3.61,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,27.18,2.00,2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,22.67,2.00,2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,17.82,1.75,2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
