### **더미화 - 원핫 인코딩(One-Hot encoding)**
#### N개의 클래스를 N차원의 One-Hot 벡터로 표현하도록 변환
#### 고유값들을 피처로 만들고, 정답에 해당하는 열은 1로, 나머지 열은 0으로 표시
#### 변환해야 하는 값의 종류가 여러 개일 때
#### 숫자의 크기 차이가 모델에 영향을 미치는 선형 계열 모델(로지스틱 회귀, SVM, 신경망)에서는 범주형 데이터 변환 시 라벨 인코딩보다 원핫 인코딩을 사용

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Toy frame with a single categorical column; some appliances repeat so the
# encoded output contains duplicate indicator rows.
df = pd.DataFrame(
    data=dict(
        item=['TV', '냉장고', '전자레인지', '컴퓨터', 'TV', '선풍기', '선풍기', '믹서', '믹서'],
    ),
)
display(df)

Unnamed: 0,item
0,TV
1,냉장고
2,전자레인지
3,컴퓨터
4,TV
5,선풍기
6,선풍기
7,믹서
8,믹서


In [3]:
# One-hot encode the frame; each new column is named "<source column>_<value>".
pd.get_dummies(data=df)

Unnamed: 0,item_TV,item_냉장고,item_믹서,item_선풍기,item_전자레인지,item_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,1,0,0,0,0,0
5,0,0,0,1,0,0
6,0,0,0,1,0,0
7,0,0,1,0,0,0
8,0,0,1,0,0,0


In [4]:
# Same encoding, but the indicator columns are prefixed with "my" instead of
# the source column name.
pd.get_dummies(data=df, prefix="my")

Unnamed: 0,my_TV,my_냉장고,my_믹서,my_선풍기,my_전자레인지,my_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,1,0,0,0,0,0
5,0,0,0,1,0,0
6,0,0,0,1,0,0
7,0,0,1,0,0,0
8,0,0,1,0,0,0


In [5]:
# Build the "my_"-prefixed indicator columns, then place them to the right of
# the original frame so the raw value and its encoding can be compared per row.
onehot = pd.get_dummies(data=df, prefix="my")
pd.concat(objs=[df, onehot], axis="columns")

Unnamed: 0,item,my_TV,my_냉장고,my_믹서,my_선풍기,my_전자레인지,my_컴퓨터
0,TV,1,0,0,0,0,0
1,냉장고,0,1,0,0,0,0
2,전자레인지,0,0,0,0,1,0
3,컴퓨터,0,0,0,0,0,1
4,TV,1,0,0,0,0,0
5,선풍기,0,0,0,1,0,0
6,선풍기,0,0,0,1,0,0
7,믹서,0,0,1,0,0,0
8,믹서,0,0,1,0,0,0


In [6]:
# Two categorical columns; the last row is missing (NaN) in both, which the
# following cells use to show how get_dummies treats missing values.
fruit = pd.DataFrame(
    data=dict(
        name=['apple', 'banana', 'cherry', 'durian', np.nan],
        color=['red', 'yellow', 'red', 'green', np.nan],
    ),
)
fruit

Unnamed: 0,name,color
0,apple,red
1,banana,yellow
2,cherry,red
3,durian,green
4,,


In [7]:
# Encode every column of the frame at once (here both 'name' and 'color').
pd.get_dummies(data=fruit)

Unnamed: 0,name_apple,name_banana,name_cherry,name_durian,color_green,color_red,color_yellow
0,1,0,0,0,0,1,0
1,0,1,0,0,0,0,1
2,0,0,1,0,0,1,0
3,0,0,0,1,1,0,0
4,0,0,0,0,0,0,0


In [8]:
# Encode only the 'name' column of the frame; 'color' is passed through as-is.
display(pd.get_dummies(data=fruit, columns=['name']))
# Encode the 'name' Series on its own; the result has no column prefix.
display(pd.get_dummies(data=fruit.loc[:, 'name']))

Unnamed: 0,color,name_apple,name_banana,name_cherry,name_durian
0,red,1,0,0,0
1,yellow,0,1,0,0
2,red,0,0,1,0
3,green,0,0,0,1
4,,0,0,0,0


Unnamed: 0,apple,banana,cherry,durian
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,0,0,0,0


In [9]:
# dummy_na=True adds an extra indicator column that flags missing values.
pd.get_dummies(data=fruit.loc[:, 'name'], dummy_na=True)

Unnamed: 0,apple,banana,cherry,durian,NaN
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,0,1


In [10]:
# Load the Titanic dataset through seaborn and preview the first five rows.
df = sns.load_dataset(name='titanic')
display(df.head(n=5))

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [11]:
# Select the columns (features) used in the analysis.
ndf = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]

# One-hot encode the categorical columns so a model can consume them as numbers.
# dtype=int pins the indicators to 0/1 integers: without it, pandas >= 2.0
# returns boolean (True/False) columns instead of numeric ones.
onehot_sex = pd.get_dummies(ndf['sex'], dtype=int)
# dummy_na is left at its default, so any row whose 'embarked' value is missing
# gets 0 in every town_* column rather than a dedicated NaN indicator.
onehot_embarked = pd.get_dummies(ndf['embarked'], prefix='town', dtype=int)

# Append both indicator groups in one pass (sex first, then town_*), then drop
# the original text columns that the indicators replace.
ndf = pd.concat([ndf, onehot_sex, onehot_embarked], axis=1)
ndf = ndf.drop(columns=['sex', 'embarked'])

display(ndf)

Unnamed: 0,survived,pclass,age,sibsp,parch,female,male,town_C,town_Q,town_S
0,0,3,22.0,1,0,0,1,0,0,1
1,1,1,38.0,1,0,1,0,1,0,0
2,1,3,26.0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,1,0,0,0,1
4,0,3,35.0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,0,1,0,0,1
887,1,1,19.0,0,0,1,0,0,0,1
888,0,3,,1,2,1,0,0,0,1
889,1,1,26.0,0,0,0,1,1,0,0
