## Dataset

In [2]:
import pandas as pd

df = pd.DataFrame({
    'Country': ['Indonesia', 'USA', 'Japan', 'China', 'Indonesia', 'China', 'Indonesia', 'USA', 'China'],
    'Age': [28, 32, 31, 24, 44, 35, 54, 29, 38],
    'Salary': [3000, 15000, 12000, 14000, 5000, 14500, 5400, 17000, 16000]
})

df

Unnamed: 0,Country,Age,Salary
0,Indonesia,28,3000
1,USA,32,15000
2,Japan,31,12000
3,China,24,14000
4,Indonesia,44,5000
5,China,35,14500
6,Indonesia,54,5400
7,USA,29,17000
8,China,38,16000


# Label Encoding pada Scikit Learn

In [5]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['Country'] = label_encoder.fit_transform(df['Country'])
df

Unnamed: 0,Country,Age,Salary
0,1,28,3000
1,3,32,15000
2,2,31,12000
3,0,24,14000
4,1,44,5000
5,0,35,14500
6,1,54,5400
7,3,29,17000
8,0,38,16000


In [7]:
label_encoder.classes_

array(['China', 'Indonesia', 'Japan', 'USA'], dtype=object)

# One Hot Encoding

In [8]:
import pandas as pd

df = pd.DataFrame({
    'Country': ['Indonesia', 'USA', 'Japan', 'China', 'Indonesia', 'China', 'Indonesia', 'USA', 'China'],
    'Age': [28, 32, 31, 24, 44, 35, 54, 29, 38],
    'Salary': [3000, 15000, 12000, 14000, 5000, 14500, 5400, 17000, 16000]
})

df

Unnamed: 0,Country,Age,Salary
0,Indonesia,28,3000
1,USA,32,15000
2,Japan,31,12000
3,China,24,14000
4,Indonesia,44,5000
5,China,35,14500
6,Indonesia,54,5400
7,USA,29,17000
8,China,38,16000


In [9]:
X = df['Country'].values.reshape(-1, 1)
X

array([['Indonesia'],
       ['USA'],
       ['Japan'],
       ['China'],
       ['Indonesia'],
       ['China'],
       ['Indonesia'],
       ['USA'],
       ['China']], dtype=object)

In [12]:
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder()
X = onehot_encoder.fit_transform(X).toarray()
X

array([[1., 0., 0., 1., 1., 0., 1., 0.],
       [1., 0., 1., 0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0., 1., 1., 0.],
       [0., 1., 1., 0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 1., 0., 1., 0.],
       [0., 1., 1., 0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 1., 0., 1., 0.],
       [1., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 1., 0., 1., 0., 1., 0.]])

In [13]:
onehot_encoder.categories_

[array([0., 1.]), array([0., 1.]), array([0., 1.]), array([0., 1.])]

In [14]:
df_onehot = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
df_onehot

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
5,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
6,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
7,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
8,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [15]:
df = pd.concat([df_onehot, df], axis=1)
df

Unnamed: 0,0,1,2,3,4,5,6,7,Country,Age,Salary
0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,Indonesia,28,3000
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,USA,32,15000
2,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,Japan,31,12000
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,China,24,14000
4,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,Indonesia,44,5000
5,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,China,35,14500
6,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,Indonesia,54,5400
7,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,USA,29,17000
8,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,China,38,16000


In [18]:
df = df.drop(['Country'], axis=1)
df

Unnamed: 0,0,1,2,3,4,5,6,7,Age,Salary
0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,28,3000
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,32,15000
2,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,31,12000
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,24,14000
4,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,44,5000
5,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,35,14500
6,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,54,5400
7,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,29,17000
8,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,38,16000
