#### ***This notebook compares three techniques for encoding categorical variables:***

1. LabelEncoder and OneHotEncoder

2. DictVectorizer

3. Pandas get_dummies


In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

# Seed NumPy's legacy global RNG. The sampling cell below re-seeds before each
# draw (seeds 1 and 2), so this value does not influence those samples.
np.random.seed(123)


In [2]:
# Every sample is drawn immediately after re-seeding NumPy's legacy global RNG,
# so both arrays come out the same each time this cell is executed.
sample_size = 20

np.random.seed(1)
random_values1 = np.random.choice([1, 2, 3], size=sample_size)  # levels 1, 2, 3

np.random.seed(2)
random_values2 = np.random.choice([1, 2], size=sample_size)  # levels 1, 2

# The doubled newline reproduces the blank line between the two printed arrays.
print('Random values 1: ', random_values1, end='\n\n')
print('Random values 2: ', random_values2)

Random values 1:  [2 1 1 2 2 1 1 2 1 2 1 3 2 3 1 3 2 3 1 1]

Random values 2:  [1 2 2 1 1 2 1 2 1 2 1 2 2 2 2 2 2 2 1 1]


In [39]:
# Assemble the two sampled arrays into a frame whose columns are both stored
# with the pandas 'category' dtype.
df = pd.DataFrame(
    data={'Var1': random_values1, 'Var2': random_values2},
    dtype='category',
)

df

Unnamed: 0,Var1,Var2
0,2,1
1,1,2
2,1,2
3,2,1
4,2,1
5,1,2
6,1,1
7,2,2
8,1,1
9,2,2


#### ***Instantiate LabelEncoder object***
---

In [40]:
# Fit one LabelEncoder per column. Re-fitting a single shared encoder on every
# column overwrites its fitted classes each time, so only the last column's
# mapping would remain available for inspection or inverse_transform.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in df.columns}

# Replace each column's values by the integer codes of its own encoder.
df_labelEncoder = df.apply(lambda col: label_encoders[col.name].transform(col))

# Backward compatibility: `categ_le` still refers to an encoder fitted on the
# last column, which is the state the shared encoder used to end up in.
categ_le = label_encoders[df.columns[-1]]

df_labelEncoder

Unnamed: 0,Var1,Var2
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0
5,0,1
6,0,0
7,1,1
8,0,0
9,1,1


#### ***Instantiate OneHotEncoder object***
---

#### OneHotEncoder fitted on `df_labelEncoder` (the label-encoded data)

In [41]:
# One-hot encode the integer codes held in df_labelEncoder. sparse_output=False
# makes the encoder return a regular (dense) array that prints in full.
categ_ohe = OneHotEncoder(sparse_output=False)
df_onehotencoder = categ_ohe.fit_transform(df_labelEncoder)

# Names of the generated indicator columns, in output order
feature_names = categ_ohe.get_feature_names_out()

# Names first, then a blank line, then the encoded matrix
print(feature_names, df_onehotencoder, sep='\n\n')

['Var1_0' 'Var1_1' 'Var1_2' 'Var2_0' 'Var2_1']

[[0. 1. 0. 1. 0.]
 [1. 0. 0. 0. 1.]
 [1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0.]
 [0. 1. 0. 1. 0.]
 [1. 0. 0. 0. 1.]
 [1. 0. 0. 1. 0.]
 [0. 1. 0. 0. 1.]
 [1. 0. 0. 1. 0.]
 [0. 1. 0. 0. 1.]
 [1. 0. 0. 1. 0.]
 [0. 0. 1. 0. 1.]
 [0. 1. 0. 0. 1.]
 [0. 0. 1. 0. 1.]
 [1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 1.]
 [0. 1. 0. 0. 1.]
 [0. 0. 1. 0. 1.]
 [1. 0. 0. 1. 0.]
 [1. 0. 0. 1. 0.]]


#### OneHotEncoder fitted on `df` (the original data, without label encoding)

In [42]:
# Same encoding as for the label-encoded frame, but fitted on `df` directly, so
# the generated feature names carry the values stored in `df` itself.
categ_ohe = OneHotEncoder(sparse_output=False)
df_onehotencoder = categ_ohe.fit_transform(df)

# Names of the generated indicator columns, in output order
feature_names = categ_ohe.get_feature_names_out()

# Names first, then a blank line, then the encoded matrix
print(feature_names, df_onehotencoder, sep='\n\n')

['Var1_1' 'Var1_2' 'Var1_3' 'Var2_1' 'Var2_2']

[[0. 1. 0. 1. 0.]
 [1. 0. 0. 0. 1.]
 [1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0.]
 [0. 1. 0. 1. 0.]
 [1. 0. 0. 0. 1.]
 [1. 0. 0. 1. 0.]
 [0. 1. 0. 0. 1.]
 [1. 0. 0. 1. 0.]
 [0. 1. 0. 0. 1.]
 [1. 0. 0. 1. 0.]
 [0. 0. 1. 0. 1.]
 [0. 1. 0. 0. 1.]
 [0. 0. 1. 0. 1.]
 [1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 1.]
 [0. 1. 0. 0. 1.]
 [0. 0. 1. 0. 1.]
 [1. 0. 0. 1. 0.]
 [1. 0. 0. 1. 0.]]


#### ***Instantiate DictVectorizer object***
---

In [46]:
# Cast every column to strings before building the records for DictVectorizer;
# its output below shows each string value becoming a 'name=value' feature.
# NOTE(review): numeric values would presumably be kept as plain numbers
# instead of being expanded - confirm against the DictVectorizer docs.
df = df.astype(dtype=str)

In [47]:
# One {column name: value} dict per row; this list is what gets passed to
# DictVectorizer in the next cell.
df_dict = df.to_dict('records')

df_dict

[{'Var1': '2', 'Var2': '1'},
 {'Var1': '1', 'Var2': '2'},
 {'Var1': '1', 'Var2': '2'},
 {'Var1': '2', 'Var2': '1'},
 {'Var1': '2', 'Var2': '1'},
 {'Var1': '1', 'Var2': '2'},
 {'Var1': '1', 'Var2': '1'},
 {'Var1': '2', 'Var2': '2'},
 {'Var1': '1', 'Var2': '1'},
 {'Var1': '2', 'Var2': '2'},
 {'Var1': '1', 'Var2': '1'},
 {'Var1': '3', 'Var2': '2'},
 {'Var1': '2', 'Var2': '2'},
 {'Var1': '3', 'Var2': '2'},
 {'Var1': '1', 'Var2': '2'},
 {'Var1': '3', 'Var2': '2'},
 {'Var1': '2', 'Var2': '2'},
 {'Var1': '3', 'Var2': '2'},
 {'Var1': '1', 'Var2': '1'},
 {'Var1': '1', 'Var2': '1'}]

In [48]:
# sparse=False makes the vectorizer return a regular (dense) array.
categ_dicVec = DictVectorizer(sparse=False)
categ_dicVec.fit(df_dict)

# Encode the same records the vectorizer was fitted on
df_dicVec = categ_dicVec.transform(df_dict)

# Feature -> column index mapping, then the ordered feature names,
# followed by one blank line.
print(categ_dicVec.vocabulary_, categ_dicVec.feature_names_, sep='\n', end='\n\n')

df_dicVec

{'Var1=1': 0, 'Var1=2': 1, 'Var1=3': 2, 'Var2=1': 3, 'Var2=2': 4}
['Var1=1', 'Var1=2', 'Var1=3', 'Var2=1', 'Var2=2']



array([[0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0.],
       [0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.]])

#### ***Pandas get_dummies function***
---

In [49]:
# Dummy-encode every column of df as 0/1 integers. drop_first=True omits each
# column's first level, which is why the result has no Var1_1 / Var2_1 columns.
X = pd.get_dummies(
    data=df,
    prefix_sep='_',
    drop_first=True,
    dtype=int,
)

X

Unnamed: 0,Var1_2,Var1_3,Var2_2
0,1,0,0
1,0,0,1
2,0,0,1
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,0
7,1,0,1
8,0,0,0
9,1,0,1
