# one hot encoding

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('datasets/Employee.csv')
df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [3]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [4]:
df['Gender'].value_counts()

Male      2778
Female    1875
Name: Gender, dtype: int64

In [5]:
df['Education'].unique()

array(['Bachelors', 'Masters', 'PHD'], dtype=object)

In [6]:
df['Education'].value_counts()

Bachelors    3601
Masters       873
PHD           179
Name: Education, dtype: int64

In [7]:
df['City'].unique()

array(['Bangalore', 'Pune', 'New Delhi'], dtype=object)

In [8]:
df['City'].value_counts()

Bangalore    2228
Pune         1268
New Delhi    1157
Name: City, dtype: int64

In [9]:
df['EverBenched'].unique()

array(['No', 'Yes'], dtype=object)

In [10]:
df['EverBenched'].value_counts()

No     4175
Yes     478
Name: EverBenched, dtype: int64

## pandas one hot encoding

In [11]:
df2 = pd.get_dummies(df, columns=['Gender','City'])
df2.head()

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,EverBenched,ExperienceInCurrentDomain,LeaveOrNot,Gender_Female,Gender_Male,City_Bangalore,City_New Delhi,City_Pune
0,Bachelors,2017,3,34,No,0,0,0,1,1,0,0
1,Bachelors,2013,1,28,No,3,1,1,0,0,0,1
2,Bachelors,2014,3,38,No,2,0,1,0,0,1,0
3,Masters,2016,3,27,No,5,1,0,1,1,0,0
4,Masters,2017,3,24,Yes,2,1,0,1,0,0,1


In [12]:
df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


## scikit learn one hot encoding

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
encoder = OneHotEncoder()

In [15]:
encoder.fit(df[['Gender','Education']])
encoder.categories_

[array(['Female', 'Male'], dtype=object),
 array(['Bachelors', 'Masters', 'PHD'], dtype=object)]

In [16]:
arr = encoder.transform(df[['Gender','Education']]).toarray()
print(arr)

[[0. 1. 1. 0. 0.]
 [1. 0. 1. 0. 0.]
 [1. 0. 1. 0. 0.]
 ...
 [0. 1. 0. 1. 0.]
 [0. 1. 1. 0. 0.]
 [0. 1. 1. 0. 0.]]


In [17]:
arr2 = encoder.inverse_transform(arr)
print(arr2)

[['Male' 'Bachelors']
 ['Female' 'Bachelors']
 ['Female' 'Bachelors']
 ...
 ['Male' 'Masters']
 ['Male' 'Bachelors']
 ['Male' 'Bachelors']]


In [18]:
encoded_features = encoder.get_feature_names_out(['Gender','Education'])

In [19]:
df3 = pd.DataFrame()
df3[encoded_features] = pd.DataFrame(arr)

In [20]:
df3.head()

Unnamed: 0,Gender_Female,Gender_Male,Education_Bachelors,Education_Masters,Education_PHD
0,0.0,1.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0,0.0


In [21]:
df4 = pd.concat([df,df3])
df4.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot,Gender_Female,Gender_Male,Education_Bachelors,Education_Masters,Education_PHD
0,Bachelors,2017.0,Bangalore,3.0,34.0,Male,No,0.0,0.0,,,,,
1,Bachelors,2013.0,Pune,1.0,28.0,Female,No,3.0,1.0,,,,,
2,Bachelors,2014.0,New Delhi,3.0,38.0,Female,No,2.0,0.0,,,,,
3,Masters,2016.0,Bangalore,3.0,27.0,Male,No,5.0,1.0,,,,,
4,Masters,2017.0,Pune,3.0,24.0,Male,Yes,2.0,1.0,,,,,


## encoding option 2

In [22]:
df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [23]:
df5 = pd.DataFrame()
df5['Gender'] = df['Gender'].astype('category')
df5['Education'] = df['Education'].astype('category')
df5['Gender_new'] = df5['Gender'].cat.codes
df5['Education_new'] = df5['Education'].cat.codes
df5.head()

Unnamed: 0,Gender,Education,Gender_new,Education_new
0,Male,Bachelors,1,0
1,Female,Bachelors,0,0
2,Female,Bachelors,0,0
3,Male,Masters,1,1
4,Male,Masters,1,1


In [24]:
df6 = pd.DataFrame()

In [25]:
encoder2 = OneHotEncoder()
arr3 = encoder2.fit_transform(df5[['Gender_new','Education_new']]).toarray()
encoded_features2 = encoder2.get_feature_names_out(['Gender_new','Education_new'])
df6[encoded_features2] = pd.DataFrame(arr3)
df6.head()

Unnamed: 0,Gender_new_0,Gender_new_1,Education_new_0,Education_new_1,Education_new_2
0,0.0,1.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0,0.0


In [26]:
df7 = pd.DataFrame()
df7 = df.join(df6)
df7.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot,Gender_new_0,Gender_new_1,Education_new_0,Education_new_1,Education_new_2
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0,0.0,1.0,1.0,0.0,0.0
1,Bachelors,2013,Pune,1,28,Female,No,3,1,1.0,0.0,1.0,0.0,0.0
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0,1.0,0.0,1.0,0.0,0.0
3,Masters,2016,Bangalore,3,27,Male,No,5,1,0.0,1.0,0.0,1.0,0.0
4,Masters,2017,Pune,3,24,Male,Yes,2,1,0.0,1.0,0.0,1.0,0.0


In [27]:
df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


## example one hot encoding

In [28]:
enc = OneHotEncoder(handle_unknown='ignore')
X = [['Male', 'a'], ['Female', 'b'], ['Female', 'b']]
enc.fit(X)
enc.categories_

[array(['Female', 'Male'], dtype=object), array(['a', 'b'], dtype=object)]

In [29]:
arr3 = enc.transform([['Female', 'b'], ['Male', 'b'], ['Male', 'd']]).toarray()
print(arr3)

[[1. 0. 0. 1.]
 [0. 1. 0. 1.]
 [0. 1. 0. 0.]]


In [30]:
enc.inverse_transform(arr3)

array([['Female', 'b'],
       ['Male', 'b'],
       ['Male', None]], dtype=object)

In [31]:
enc.get_feature_names_out(['gender', 'group'])

array(['gender_Female', 'gender_Male', 'group_a', 'group_b'], dtype=object)