#### One Hot Encoding & Label Encoding
- 머신러닝은 숫자를 기반으로 값을 예측하기 때문에, 문자열은 숫자로 변환이 필요
- 이를 인코딩이라 하며, 일반적으로 One Hot Encoding 과 Label Encoding 이 많이 쓰임
    - One Hot Encoding:
        - 각 범주(문자열 값)마다 새로운 feature(열)를 생성한 후, 해당 범주에 속하는 행에는 1을, 나머지 행에는 0을 부여
    - Label Encoding:
        - 각 범주를 정수 하나에 대응시키는 방식 (앞선 feature engineering 단계에서 사용한 방식)
        - 문자열을 Label 로 Encoding 해 주는 클래스도 존재 (scikit-learn 의 `LabelEncoder`)
        - 선형 회귀와 같이 숫자의 크기에 의미를 부여하는 모델에서는, 임의로 부여된 숫자의 대소 관계 때문에 예측이 왜곡될 수 있음
        - tree 관련 머신러닝에서는 사용 가능

In [7]:
import pickle
import pandas as pd
# Restore the object pickled in 'titanic_step2_feature_engineering.pickle'
# (presumably the DataFrame produced by the previous step -- it is used as
# one below).
with open('titanic_step2_feature_engineering.pickle', 'rb') as src:
    df = pickle.load(src)

In [8]:
# Preview the first rows of the loaded frame to check its columns and values.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Initial,Family,Ticket_Initial2,Ticket_Num_Cut,HighSurvival,LowSurvival
0,1,0.0,3,0,3.0,0.0,0,0,0.0,2,0,3.0,0,0
1,2,1.0,1,1,4.0,3.0,1,1,2.0,2,1,3.0,2,0
2,3,1.0,3,1,3.0,0.0,0,0,1.0,1,2,9.0,0,0
3,4,1.0,1,1,4.0,2.0,1,0,2.0,2,3,5.0,2,0
4,5,0.0,3,0,4.0,0.0,0,0,0.0,1,3,9.0,0,2


In [9]:
# Row count of the leading slice used as `train` (hard-coded; presumably the
# number of labelled training rows -- confirm against the data source).
ntrain = 891
train = df[:ntrain]

# Keep the target as a one-column DataFrame that is independent of `df`.
y_train = train[["Survived"]].copy()

# Separate working copies, one per encoding approach.
df_onehot = df.copy()
df_label = df.copy()

In [10]:
# Build the frame to be label-encoded: everything except the passenger id
# and the "Survived" column.
df_label = df.drop(columns=["PassengerId", "Survived"]).copy()
df_label.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Initial,Family,Ticket_Initial2,Ticket_Num_Cut,HighSurvival,LowSurvival
0,3,0,3.0,0.0,0,0,0.0,2,0,3.0,0,0
1,1,1,4.0,3.0,1,1,2.0,2,1,3.0,2,0
2,3,1,3.0,0.0,0,0,1.0,1,2,9.0,0,0
3,1,1,4.0,2.0,1,0,2.0,2,3,5.0,2,0
4,3,0,4.0,0.0,0,0,0.0,1,3,9.0,0,2


In [11]:
# Build the frame to be one-hot encoded: everything except the passenger id
# and the "Survived" column.
df_onehot = df.drop(columns=["PassengerId", "Survived"]).copy()
df_onehot.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Initial,Family,Ticket_Initial2,Ticket_Num_Cut,HighSurvival,LowSurvival
0,3,0,3.0,0.0,0,0,0.0,2,0,3.0,0,0
1,1,1,4.0,3.0,1,1,2.0,2,1,3.0,2,0
2,3,1,3.0,0.0,0,0,1.0,1,2,9.0,0,0
3,1,1,4.0,2.0,1,0,2.0,2,3,5.0,2,0
4,3,0,4.0,0.0,0,0,0.0,1,3,9.0,0,2


#### 레이블 인코딩

In [15]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Label-encode column by column: each column's distinct values are replaced
# by integer codes starting at 0. The fitted encoders are not kept.
df_label = df_label.apply(lambda column: LabelEncoder().fit_transform(column))
df_label.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Initial,Family,Ticket_Initial2,Ticket_Num_Cut,HighSurvival,LowSurvival
0,2,0,3,0,0,0,0,1,1,3,0,0
1,0,1,4,3,1,1,2,1,2,3,2,0
2,2,1,3,0,0,0,1,0,3,9,0,0
3,0,1,4,2,1,0,2,1,4,5,2,0
4,2,0,4,0,0,0,0,0,4,9,0,2


#### 원핫 인코딩
- 범주가 많을 경우 feature 수가 크게 늘어나 머신러닝 계산 소요 시간이 오래 걸리고, 불필요한 feature 가 많아져 성능이 오히려 안좋아질 수 있음

In [17]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every column of the label-encoded frame: each distinct value
# of a column becomes its own indicator column named "<column>_<value>".
onehot_cols = list(df_label.columns)
df_onehot = pd.get_dummies(data=df_label, columns=onehot_cols)

df_onehot.head()

Unnamed: 0,Pclass_0,Pclass_1,Pclass_2,Sex_0,Sex_1,Age_0,Age_1,Age_2,Age_3,Age_4,...,HighSurvival_0,HighSurvival_1,HighSurvival_2,HighSurvival_3,HighSurvival_4,HighSurvival_5,HighSurvival_6,LowSurvival_0,LowSurvival_1,LowSurvival_2
0,False,False,True,True,False,False,False,False,True,False,...,True,False,False,False,False,False,False,True,False,False
1,True,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,False,False
2,False,False,True,False,True,False,False,False,True,False,...,True,False,False,False,False,False,False,True,False,False
3,True,False,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,False,True,False,False
4,False,False,True,True,False,False,False,False,False,True,...,True,False,False,False,False,False,False,False,False,True


In [18]:
import pickle
import pandas as pd
# Persist the one-hot encoded feature frame for later use.
with open('titanic_step3_feature_engineering.pickle', 'wb') as sink:
    pickle.dump(df_onehot, sink)

In [19]:
import pickle
import pandas as pd
# Persist the target frame (`y_train`) in its own pickle file.
with open('titanic_step3_feature_engineering_y.pickle', 'wb') as sink:
    pickle.dump(y_train, sink)