## Feature Encoding
Feature encoding is a technique used to transform raw data into a numerical representation that can be used by machine
learning algorithms. There are several types of feature encoding, including:
such as :

label Encoding 
Ordinal Encoding
One hot encoding 
Binary Encoding 

In [43]:
import pandas as pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [44]:
## Data load
df=sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [45]:
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [46]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [47]:
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [48]:
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [49]:
## Lets encode the time in labelencoder with sklearn

from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder

le=LabelEncoder()
df['encoded_time']=le.fit_transform(df['time'])
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0
2,21.01,3.50,Male,No,Sun,Dinner,3,0
3,23.68,3.31,Male,No,Sun,Dinner,2,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0
242,17.82,1.75,Male,No,Sat,Dinner,2,0


In [50]:
## ordinal coding the day column using specific order
oe=OrdinalEncoder(categories=[['Thur','Fri','Sat','Sun']])
df['encoded_day']=oe.fit_transform(df[['day']])
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time,encoded_day
0,16.99,1.01,Female,No,Sun,Dinner,2,0,3.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0,3.0
2,21.01,3.50,Male,No,Sun,Dinner,3,0,3.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0,3.0
4,24.59,3.61,Female,No,Sun,Dinner,4,0,3.0
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0,2.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0,2.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0,2.0
242,17.82,1.75,Male,No,Sat,Dinner,2,0,2.0


In [51]:
df[['day','encoded_day']].value_counts()

day   encoded_day
Sat   2.0            87
Sun   3.0            76
Thur  0.0            62
Fri   1.0            19
Name: count, dtype: int64

In [52]:
## one hot encoding on day column
one=OneHotEncoder()
one.fit_transform(df[['day']]).toarray()

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],


In [53]:
## example of one hot encoding
import pandas as pd
titanic=sns.load_dataset('titanic')
ohe=OneHotEncoder(sparse=False)
embarked_ohe=ohe.fit_transform(titanic[['embarked']])
embarked_ohe_df=pd.DataFrame(embarked_ohe,columns=ohe.get_feature_names_out(['embarked']))
titanic=pd.concat([titanic.reset_index(drop=True),embarked_ohe_df.reset_index(drop=True)],axis=1)
titanic.tail()



Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,embarked_C,embarked_Q,embarked_S,embarked_nan
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True,0.0,0.0,1.0,0.0
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True,0.0,0.0,1.0,0.0
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False,0.0,0.0,1.0,0.0
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True,1.0,0.0,0.0,0.0
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True,0.0,1.0,0.0,0.0


In [54]:
## Binary Encoding
!pip install category_encoders



DEPRECATION: Loading egg at c:\python312\lib\site-packages\vboxapi-1.0-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [55]:
df=sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [56]:
df.sample()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
108,18.24,3.76,Male,No,Sat,Dinner,2


In [57]:
from category_encoders import BinaryEncoder

bn=BinaryEncoder()
df_binary=bn.fit_transform(df['day'])
df_binary


Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0


In [58]:
## Using Pandas
df=sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [59]:

pd.get_dummies(df['day'])

Unnamed: 0,Thur,Fri,Sat,Sun
0,False,False,False,True
1,False,False,False,True
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True
...,...,...,...,...
239,False,False,True,False
240,False,False,True,False
241,False,False,True,False
242,False,False,True,False


In [60]:
pd.get_dummies(df['sex'])

Unnamed: 0,Male,Female
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True
...,...,...
239,True,False
240,False,True
241,True,False
242,True,False


In [62]:
# use pandas dummies
ge_dummies=pd.get_dummies(df,columns=['day'])
ge_dummies.head()

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Thur,day_Fri,day_Sat,day_Sun
0,16.99,1.01,Female,No,Dinner,2,False,False,False,True
1,10.34,1.66,Male,No,Dinner,3,False,False,False,True
2,21.01,3.5,Male,No,Dinner,3,False,False,False,True
3,23.68,3.31,Male,No,Dinner,2,False,False,False,True
4,24.59,3.61,Female,No,Dinner,4,False,False,False,True
