# 參考資料
* https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Set the data directory and read application_train.csv into app_train
dir_data = '../data/'
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)

In [3]:
# Preview the first rows of the loaded training table.
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


## 作業
將下列部分資料片段 sub_train 使用 One Hot encoding, 並觀察轉換前後的欄位數量 (使用 shape) 與欄位名稱 (使用 head) 變化

In [4]:
# Pull the weekday column out as its own single-column DataFrame
# (the column keeps its original name), print the frame's dimensions,
# and show the first rows as the cell output.
sub_train = app_train['WEEKDAY_APPR_PROCESS_START'].to_frame()
print(sub_train.shape)
sub_train.head()

(307511, 1)


Unnamed: 0,WEEKDAY_APPR_PROCESS_START
0,WEDNESDAY
1,MONDAY
2,MONDAY
3,WEDNESDAY
4,THURSDAY


In [5]:
# Tally how many columns of sub_train have each dtype.
sub_train.dtypes.value_counts()

object    1
dtype: int64

In [6]:
# Count the distinct values in every object-typed column of sub_train.
sub_train.select_dtypes(include=["object"]).nunique()

WEEKDAY_APPR_PROCESS_START    7
dtype: int64

### 使用 sklearn preprocessing 模組中的 OneHotEncoder 直接轉換編碼

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
# Build an encoder with default settings and learn the categories present
# in sub_train; the fitted encoder is displayed as the cell output.
ohe = OneHotEncoder()
ohe.fit(sub_train)

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [9]:
# Encode sub_train with the fitted encoder and convert the result to a
# dense array via .toarray() so it can be inspected directly.
ohe_week = ohe.transform(sub_train).toarray()
ohe_week

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [10]:
# Dimensions of the encoded array: (rows, encoded columns).
ohe_week.shape

(307511, 7)

### 使用 pandas 的 get_dummies 執行 One Hot Encoding

In [11]:
# Show the first rows of sub_train again as the "before" view for the pandas encoding.
sub_train.head()

Unnamed: 0,WEEKDAY_APPR_PROCESS_START
0,WEDNESDAY
1,MONDAY
2,MONDAY
3,WEDNESDAY
4,THURSDAY


In [12]:
# One-hot encode sub_train with pandas: each category becomes its own
# indicator column, named <original column>_<category value>.
sub_train_ohe = pd.get_dummies(data=sub_train)
sub_train_ohe.head()

Unnamed: 0,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1
4,0,0,0,0,1,0,0


In [13]:
# Summarize the encoded frame: column names, non-null counts, dtypes, and memory usage.
sub_train_ohe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 7 columns):
WEEKDAY_APPR_PROCESS_START_FRIDAY       307511 non-null uint8
WEEKDAY_APPR_PROCESS_START_MONDAY       307511 non-null uint8
WEEKDAY_APPR_PROCESS_START_SATURDAY     307511 non-null uint8
WEEKDAY_APPR_PROCESS_START_SUNDAY       307511 non-null uint8
WEEKDAY_APPR_PROCESS_START_THURSDAY     307511 non-null uint8
WEEKDAY_APPR_PROCESS_START_TUESDAY      307511 non-null uint8
WEEKDAY_APPR_PROCESS_START_WEDNESDAY    307511 non-null uint8
dtypes: uint8(7)
memory usage: 2.1 MB


結論:
1. sub_train 中的 'WEEKDAY_APPR_PROCESS_START' 欄位內容是星期一到星期日等七個值，用 Pandas 的 pd.get_dummies
  功能執行 One Hot encoding 轉換之後，欄位數量由 1 個變成 7 個，結果是一個含七個編碼欄位的 DataFrame。
  
2. 用 sklearn 的 OneHotEncoder 編碼得到的結果一樣，編碼後同樣產生七個欄位的編碼值；不過 transform 預設回傳的是
  稀疏矩陣 (sparse=True)，需要再呼叫 .toarray() 才會得到 numpy array 的形式，而且結果不含欄位名稱。