In [46]:
# Data Encoding >> Converting categorical to numeric column
''' 
     1. Nominal/OHE
     2. Label and ordinal encoding
     3. Target guided ordinal encoding
'''

' \n     1. Nominal/OHE\n     2. Label and ordinal encoding\n     3. Target guided ordinal encoding\n'

# 1. Nominal/OHE >> binary vectors for each category

In [47]:
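# Categories: single, married, in a relationship
# Each category gets its own position in a binary vector, e.g.:
# single       > [1,0,0]
# married      > [0,1,0]
# relationship > [0,0,1]
# (Illustrative only: OneHotEncoder sorts the categories, so the actual
#  output columns below come out as Married, Relation, Single.)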

In [48]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [49]:
# Toy frame with a single nominal (unordered) categorical column to encode.
df = pd.DataFrame(
    ['Single', 'Married', 'Single', 'Relation', 'Married', 'Single'],
    columns=['status'],
)

In [50]:
df

Unnamed: 0,status
0,Single
1,Married
2,Single
3,Relation
4,Married
5,Single


In [51]:
# One-hot encoder with default settings. Its transform output is converted
# with .toarray() where it is used, i.e. it is not a dense array by default.
encoder = OneHotEncoder()
# Bare expression so the notebook displays the encoder's repr.
encoder

OneHotEncoder()

In [52]:
# Learn the categories of `status` and encode them in one step, then densify
# the encoder's output into a regular NumPy array.
encoded_sparse = encoder.fit_transform(df[['status']])
encoded = encoded_sparse.toarray()

In [53]:
encoded

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [54]:
encoder.get_feature_names_out()

array(['status_Married', 'status_Relation', 'status_Single'], dtype=object)

In [55]:
pd.DataFrame(encoded, columns = encoder.get_feature_names_out())

Unnamed: 0,status_Married,status_Relation,status_Single
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
5,0.0,0.0,1.0


In [56]:
# New data >> 'Single'.
# The encoder was fitted on a DataFrame with a `status` column, so the new
# sample is wrapped in a DataFrame with the same column name; a bare nested
# list carries no feature names and makes scikit-learn warn on transform().
encoder.transform(pd.DataFrame({'status': ['Single']})).toarray()



array([[0., 0., 1.]])

In [57]:
# Encode a new 'Married' sample. It is passed as a DataFrame with the fitted
# column name (`status`) so transform() sees matching feature names.
encoder.transform(pd.DataFrame({'status': ['Married']})).toarray()



array([[1., 0., 0.]])

In [58]:
# Encode a new 'Relation' sample. It is passed as a DataFrame with the fitted
# column name (`status`) so transform() sees matching feature names.
encoder.transform(pd.DataFrame({'status': ['Relation']})).toarray()



array([[0., 1., 0.]])

In [59]:
# Wrap the dense one-hot array in a DataFrame labelled with the encoder's
# generated column names. Reusing df's index keeps the rows aligned with `df`
# when the two frames are concatenated column-wise, even if `df` does not
# have the default 0..n-1 index.
encoder_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

In [60]:
encoder_df


Unnamed: 0,status_Married,status_Relation,status_Single
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
5,0.0,0.0,1.0


In [61]:
df

Unnamed: 0,status
0,Single
1,Married
2,Single
3,Relation
4,Married
5,Single


In [62]:
# Place the one-hot columns next to the original `status` column
# (column-wise concatenation, rows matched on the index).
final_df = pd.concat((df, encoder_df), axis='columns')

In [63]:
final_df

Unnamed: 0,status,status_Married,status_Relation,status_Single
0,Single,0.0,0.0,1.0
1,Married,1.0,0.0,0.0
2,Single,0.0,0.0,1.0
3,Relation,0.0,1.0,0.0
4,Married,1.0,0.0,0.0
5,Single,0.0,0.0,1.0


In [64]:
# Drop the original text column plus one dummy column (status_Married); the
# remaining two indicators still identify every category.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second run finds the columns already gone and is a no-op
# rather than raising KeyError.
final_df = final_df.drop(columns=['status', 'status_Married'], errors='ignore')

In [65]:
final_df

Unnamed: 0,status_Relation,status_Single
0,0.0,1.0
1,0.0,0.0
2,0.0,1.0
3,1.0,0.0
4,0.0,0.0
5,0.0,1.0


In [71]:
# pandas shortcut for the same encoding: drop_first=True omits the first
# category (Married), leaving the Relation and Single indicator columns.
# dtype=int pins 0/1 output; pandas >= 2.0 returns booleans by default.
pd.get_dummies(df['status'], drop_first=True, dtype=int)

Unnamed: 0,Relation,Single
0,0,1
1,0,0
2,0,1
3,1,0
4,0,0
5,0,1


# 2. Label encoding >> assign a unique integer label to each category

In [73]:
df

Unnamed: 0,status
0,Single
1,Married
2,Single
3,Relation
4,Married
5,Single


In [74]:
from sklearn.preprocessing import LabelEncoder

In [75]:
# Encoder used below to turn each distinct `status` value into an integer label.
label_encoder = LabelEncoder()

In [76]:
# LabelEncoder works on a 1-D array of labels, so pass the `status` Series
# (df['status']) rather than a one-column DataFrame (df[['status']]); the 2-D
# form only works after an implicit flatten that emits a conversion warning.
label_encoder.fit_transform(df['status'])

  y = column_or_1d(y, warn=True)


array([2, 0, 2, 1, 0, 2])

# Ordinal encoding >> assign integers that follow the categories' natural order

In [77]:
from sklearn.preprocessing import OrdinalEncoder

In [81]:
# Toy frame with one ordered categorical column (education level).
df = pd.DataFrame(
    ["HS", "PG", "UG", "HS", "phD", "HS", "PG"],
    columns=["qualification"],
)

In [82]:
df

Unnamed: 0,qualification
0,HS
1,PG
2,UG
3,HS
4,phD
5,HS
6,PG


In [85]:
# Qualification levels listed from lowest to highest; the encoder numbers
# them 0, 1, 2, 3 in exactly this order.
qualification_order = ["HS", "UG", "PG", "phD"]
encoder = OrdinalEncoder(categories=[qualification_order])

In [88]:
# Fit on the one-column frame and return each row's rank as a float, one
# value per row (HS -> 0, UG -> 1, PG -> 2, phD -> 3).
encoder.fit_transform(df[['qualification']])

array([[0.],
       [2.],
       [1.],
       [0.],
       [3.],
       [0.],
       [2.]])

# 3. Target-guided encoding >> replace each category with the mean of the target for that category

In [91]:
# Toy frame: a categorical `time` column and the numeric target `total_bill`.
# The three meal times repeat in the same order three times.
df = pd.DataFrame(
    {
        'time': ['lunch', 'breakfast', 'dinner'] * 3,
        'total_bill': [120, 120, 40, 150, 160, 130, 40, 150, 160],
    }
)

In [92]:
df

Unnamed: 0,time,total_bill
0,lunch,120
1,breakfast,120
2,dinner,40
3,lunch,150
4,breakfast,160
5,dinner,130
6,lunch,40
7,breakfast,150
8,dinner,160


In [96]:
# Average total_bill for each `time` category, as a plain
# {category: mean} dictionary.
mean_price = (
    df.groupby('time')['total_bill']
    .agg('mean')
    .to_dict()
)

In [97]:
mean_price

{'breakfast': 143.33333333333334, 'dinner': 110.0, 'lunch': 103.33333333333333}

In [99]:
# Target-guided encoding: replace each `time` value with the mean total_bill
# of its category, looked up in `mean_price`. Adds a new column to `df`.
# NOTE(review): the means are computed from this same frame; when the encoded
# column feeds a model, they should presumably come from training data only
# to avoid target leakage -- confirm against the intended use.
df['time_target_encoding'] = df['time'].map(mean_price)

In [100]:
df['time_target_encoding']

0    103.333333
1    143.333333
2    110.000000
3    103.333333
4    143.333333
5    110.000000
6    103.333333
7    143.333333
8    110.000000
Name: time_target_encoding, dtype: float64

In [101]:
df

Unnamed: 0,time,total_bill,time_target_encoding
0,lunch,120,103.333333
1,breakfast,120,143.333333
2,dinner,40,110.0
3,lunch,150,103.333333
4,breakfast,160,143.333333
5,dinner,130,110.0
6,lunch,40,103.333333
7,breakfast,150,143.333333
8,dinner,160,110.0
