# Feature Encoding

Feature encoding is the process of transforming `categorical features` into `numeric features`. This is necessary because machine learning algorithms can only handle numeric features. There are many different ways to encode categorical features, and each method has its own advantages and disadvantages. In this notebook, we will explore some of the most popular methods for encoding categorical features, such as:

1. Label encoding
2. Ordinal encoding
3. One-hot encoding
4. Binary encoding
5. Manual Encoding

In [4]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
# data load
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

# 1. Label Encoding

In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['time_encoded'] = le.fit_transform(df['time'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,time_encoded
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0
2,21.01,3.5,Male,No,Sun,Dinner,3,0
3,23.68,3.31,Male,No,Sun,Dinner,2,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0


# 2. Ordinal Encoding

In [7]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [16]:
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(categories=[['Thur', 'Fri', 'Sat', 'Sun']]) #order 
df['encoded_days'] = oe.fit_transform(df[['day']])
df['encoded_days'].value_counts()

encoded_days
2.0    87
3.0    76
0.0    62
1.0    19
Name: count, dtype: int64

# 3. One hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder is used to convert categorical variables into a one-hot numeric array.
# The parameter sparse_output=False returns a dense numpy array instead of a sparse matrix.
ohe = OneHotEncoder(sparse_output=False)

In [40]:
sex_ohe = ohe.fit_transform(df[['sex']])
df[['Female', 'Male']] = sex_ohe
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,Female,Male
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0.0,1.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0


In [46]:
titanic = sns.load_dataset('titanic')
titanic.dropna(subset=['embarked'], inplace=True)  # Drop rows with NaN in 'embarked'
titanic['embarked'].value_counts()
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [56]:
ohe = OneHotEncoder(sparse_output=False)
embarked_ohe = ohe.fit_transform(titanic[['embarked']])
titanic[['C', 'Q', 'S']] = embarked_ohe
titanic[['C', 'Q', 'S']].value_counts()

C    Q    S  
0.0  0.0  1.0    644
1.0  0.0  0.0    168
0.0  1.0  0.0     77
Name: count, dtype: int64

# 4. Binary Encoding

In [1]:
!pip install category_encoders



In [2]:
import category_encoders

In [5]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [12]:
from category_encoders import BinaryEncoder

binary_encoder = BinaryEncoder()
encoded_smoker = binary_encoder.fit_transform(df['smoker'])
df = pd.concat([df, encoded_smoker], axis=1)
df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,smoker_0,smoker_1,smoker_0.1,smoker_1.1,smoker_0.2,smoker_1.2
239,29.03,5.92,Male,No,Sat,Dinner,3,0,1,0,1,0,1
240,27.18,2.0,Female,Yes,Sat,Dinner,2,1,0,1,0,1,0
241,22.67,2.0,Male,Yes,Sat,Dinner,2,1,0,1,0,1,0
242,17.82,1.75,Male,No,Sat,Dinner,2,0,1,0,1,0,1
243,18.78,3.0,Female,No,Thur,Dinner,2,0,1,0,1,0,1


# 5. Using pandas' get_dummies function

In [25]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [26]:
df['embark_town'].value_counts()

embark_town
Southampton    644
Cherbourg      168
Queenstown      77
Name: count, dtype: int64

In [27]:
df = pd.get_dummies(df, columns=['embark_town'] , dtype=int)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,alive,alone,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,no,False,0,0,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,yes,False,1,0,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,yes,True,0,0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,yes,False,0,0,1
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,no,True,0,0,1


# 6. Manual Encoding

In [29]:
# manual encoding using pandas
df = sns.load_dataset('tips')
df['day_encoded'] = df['day'].map({'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3})
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,day_encoded
0,16.99,1.01,Female,No,Sun,Dinner,2,3
1,10.34,1.66,Male,No,Sun,Dinner,3,3
2,21.01,3.50,Male,No,Sun,Dinner,3,3
3,23.68,3.31,Male,No,Sun,Dinner,2,3
4,24.59,3.61,Female,No,Sun,Dinner,4,3
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,2
240,27.18,2.00,Female,Yes,Sat,Dinner,2,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2,2
242,17.82,1.75,Male,No,Sat,Dinner,2,2
