# Category Encoders
- https://contrib.scikit-learn.org/category_encoders/count.html
- https://towardsdatascience.com/beyond-one-hot-17-ways-of-transforming-categorical-features-into-numeric-features-57f54f199ea4

In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Show the result of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import warnings
# Ignore every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings(action='ignore') 

In [2]:
import category_encoders as ce
import pandas as pd

In [3]:
# Toy dataset: two categorical feature columns plus a binary target column.
colors = ['red', 'blue', 'red', 'blue', 'red', 'red', 'blue', 'red', 'blue', 'red', 'purple']
weathers = ['cool', 'warm', 'cool', 'cool', 'warm', 'hotter', 'cool', 'cool', 'warm', 'cool', 'cool']
outcomes = [1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1]

data = pd.DataFrame({'color': colors, 'weather': weathers, 'outcome': outcomes})
data

Unnamed: 0,color,weather,outcome
0,red,cool,1
1,blue,warm,0
2,red,cool,0
3,blue,cool,1
4,red,warm,0
5,red,hotter,1
6,blue,cool,0
7,red,cool,0
8,blue,warm,1
9,red,cool,0


## 1. Ordinal Encoder
* 범주 값을 처음 등장한 순서대로 정수(1, 2, 3, …)에 매핑하며, 출현 빈도순이 아님. 범주 간 순서에 의미가 있으면 replace, map, apply로 직접 매핑하는 편이 적합

In [4]:
# Fit an OrdinalEncoder on the toy frame `data` and display the transformed frame.
ce_ord = ce.OrdinalEncoder()
ce_ord.fit_transform(data)

Unnamed: 0,color,weather,outcome
0,1,1,1
1,2,2,0
2,1,1,0
3,2,1,1
4,1,2,0
5,1,3,1
6,2,1,0
7,1,1,0
8,2,2,1
9,1,1,0


In [6]:
# Load the consumer dataset (pickle files should only be read from a trusted source).
df = pd.read_pickle('../data/consumer_01_34.pkl')

# Labels of the columns whose dtype is 'category'; displayed as this cell's first output.
category_columns = df.columns[df.dtypes == 'category']
category_columns

# Keep only those columns and re-attach the '구매' column, aligned on the index.
purchase_column = df['구매']
df = df[category_columns].merge(purchase_column, how='left', left_index=True, right_index=True)
df.head()

Index(['성별', '지역', '직업', '학력', '주거형태', '결혼', '연령대'], dtype='object')

Unnamed: 0,성별,지역,직업,학력,주거형태,결혼,연령대,구매
0,남,서울,학생재수생,중학교재학,월세,미혼,1,1
1,남,서울,학생재수생,중학교재학,자가,미혼,1,0
2,남,서울,학생재수생,중학교재학,전세,미혼,1,1
3,남,서울,학생재수생,중학교재학,자가,미혼,1,0
4,남,서울,학생재수생,중학교재학,자가,미혼,1,0


In [7]:
# Rebind `ce_ord` to a fresh OrdinalEncoder and apply it to the consumer frame `df`
# (`df` holds the categorical columns plus '구매').
ce_ord = ce.OrdinalEncoder()
ce_ord.fit_transform(df)

Unnamed: 0,성별,지역,직업,학력,주거형태,결혼,연령대,구매
0,1,1,1,1,1,1,1,1
1,1,1,1,1,2,1,1,0
2,1,1,1,1,3,1,1,1
3,1,1,1,1,2,1,1,0
4,1,1,1,1,2,1,1,0
...,...,...,...,...,...,...,...,...
3992,2,5,6,3,2,3,6,1
3993,2,5,7,6,2,2,7,1
3994,2,5,6,3,3,2,6,1
3995,2,5,6,3,2,3,7,1


## 2. CountEncoder

In [8]:
# Fit a CountEncoder on the toy frame `data` and display the transformed frame.
ce_count = ce.CountEncoder()
ce_count.fit_transform(data)

Unnamed: 0,color,weather,outcome
0,6,7,1
1,4,3,0
2,6,7,0
3,4,7,1
4,6,3,0
5,6,1,1
6,4,7,0
7,6,7,0
8,4,3,1
9,6,7,0


### value_counts(빈도) 또는 비율(normalize=True)로 직접 인코딩

In [9]:
# Count / frequency encoding by hand: look each colour up in the column's own
# value_counts table. `map` is used instead of `replace` because `replace` on an
# object column depends on dtype downcasting that pandas has deprecated, whereas
# `map` returns a numeric Series directly. Every value of the column is a key of
# its own value_counts, so the lookup never yields NaN.
data['color'].map(data['color'].value_counts())                # absolute counts
data['color'].map(data['color'].value_counts(normalize=True))  # relative frequencies

0     6
1     4
2     6
3     4
4     6
5     6
6     4
7     6
8     4
9     6
10    1
Name: color, dtype: int64

0     0.545455
1     0.363636
2     0.545455
3     0.363636
4     0.545455
5     0.545455
6     0.363636
7     0.545455
8     0.363636
9     0.545455
10    0.090909
Name: color, dtype: float64

In [10]:
# Rebind `ce_count` to a fresh CountEncoder and apply it to the consumer frame `df`.
ce_count = ce.CountEncoder()
ce_count.fit_transform(df)

Unnamed: 0,성별,지역,직업,학력,주거형태,결혼,연령대,구매
0,2020,939,752,135,153,1379,464,1
1,2020,939,752,135,3134,1379,464,0
2,2020,939,752,135,685,1379,464,1
3,2020,939,752,135,3134,1379,464,0
4,2020,939,752,135,3134,1379,464,0
...,...,...,...,...,...,...,...,...
3992,1977,370,661,190,3134,133,647,1
3993,1977,370,654,1308,3134,2485,402,1
3994,1977,370,661,190,685,2485,647,1
3995,1977,370,661,190,3134,133,402,1


## 3. OneHotEncoder

In [11]:
# Fit a OneHotEncoder on the consumer frame `df` and display the transformed frame.
ce_one_hot = ce.OneHotEncoder()
ce_one_hot.fit_transform(df)

Unnamed: 0,성별_1,성별_2,지역_1,지역_2,지역_3,지역_4,지역_5,지역_6,지역_7,직업_1,...,결혼_2,결혼_3,연령대_1,연령대_2,연령대_3,연령대_4,연령대_5,연령대_6,연령대_7,구매
0,1,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
3,1,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3992,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,1
3993,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,1
3994,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,1
3995,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,1


## 4. SumEncoder
- 회귀분석에서 각 범주 수준의 계수(coefficient)를 모두 더하면 0이 되도록 코딩
- BackwardDifferenceEncoder(), HelmertEncoder()(Helmert coding)와 비슷한 방식

In [12]:
# Fit a SumEncoder on the consumer frame `df` and display the transformed frame.
ce_sum = ce.SumEncoder()
ce_sum.fit_transform(df)

Unnamed: 0,intercept,성별_0,지역_0,지역_1,지역_2,지역_3,지역_4,지역_5,직업_0,직업_1,...,주거형태_3,결혼_0,결혼_1,연령대_0,연령대_1,연령대_2,연령대_3,연령대_4,연령대_5,구매
0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
1,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
3,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
4,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3992,1,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3993,1,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,...,0.0,0.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
3994,1,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3995,1,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1


## 4. TargetEncoder
### 타깃(y)으로 인코딩 값을 계산하므로 fit_transform에 입력(X)과 타깃(y)을 따로 전달

In [13]:
# Target-encode the consumer frame: X is every column of `df` except '구매',
# and '구매' is passed separately as the target y.
ce_target = ce.TargetEncoder()
ce_target.fit_transform(df.drop('구매', axis=1), df['구매'])

Unnamed: 0,성별,지역,직업,학력,주거형태,결혼,연령대
0,0.297525,0.356763,0.384309,0.385185,0.281046,0.350979,0.362069
1,0.297525,0.356763,0.384309,0.385185,0.345884,0.350979,0.362069
2,0.297525,0.356763,0.384309,0.385185,0.367883,0.350979,0.362069
3,0.297525,0.356763,0.384309,0.385185,0.345884,0.350979,0.362069
4,0.297525,0.356763,0.384309,0.385185,0.345884,0.350979,0.362069
...,...,...,...,...,...,...,...
3992,0.397572,0.337838,0.304085,0.300000,0.345884,0.353383,0.340031
3993,0.397572,0.337838,0.399083,0.328746,0.345884,0.344467,0.313433
3994,0.397572,0.337838,0.304085,0.300000,0.367883,0.344467,0.340031
3995,0.397572,0.337838,0.304085,0.300000,0.345884,0.353383,0.313433


In [14]:
# Mean of '구매' within each '지역' group, for comparison with the target-encoded '지역' values.
purchase_by_region = df.groupby('지역')['구매']
purchase_by_region.mean()

지역
서울        0.356763
경기인천      0.281131
대전충청세종    0.382353
광주전라제주    0.337838
부산울산경남    0.557377
대구경북      0.191176
강원        0.269663
Name: 구매, dtype: float64

In [15]:
# Show the first five rows of the toy frame again before the manual target encoding.
data.head(n=5)

Unnamed: 0,color,weather,outcome
0,red,cool,1
1,blue,warm,0
2,red,cool,0
3,blue,cool,1
4,red,warm,0


<font color='brown'>Target_mean</font>

In [16]:
# Global mean of the target over all rows.
y_mean = data['outcome'].mean()

# Per-row mean of the target within the row's colour group.
# `map` is used instead of `replace`: replacing strings with floats via `replace`
# depends on object-dtype downcasting that pandas has deprecated, while `map`
# returns a float Series directly. Every colour is a key of the grouped means,
# so the lookup never yields NaN.
y_level_mean = data['color'].map(data.groupby('color')['outcome'].mean())

In [17]:
# Per-row number of occurrences of the row's colour.
# `map` is used instead of `replace` so the result is a numeric Series regardless
# of pandas' (deprecated) object-dtype downcasting in `replace`; a numeric dtype
# is required by the arithmetic and `np.exp` applied to this Series afterwards.
count_encoding = data['color'].map(data['color'].value_counts())

In [18]:
# Sigmoid weight per row: grows towards 1 as the colour's count increases and is
# exactly 0.5 when the count equals 1.
# NOTE(review): presumably mirrors TargetEncoder's min_samples_leaf=1 / smoothing=1
# defaults — confirm against the installed category_encoders version.
smoothing = 1
shifted_count = count_encoding - 1
weight = 1 / (1 + np.exp(-shifted_count / smoothing))

In [19]:
# Blend the per-colour target mean with the global target mean: rows whose
# colour has a large weight lean on the colour mean, the rest on the global mean.
level_part = y_level_mean * weight
global_part = y_mean * (1 - weight)
target_encoding = level_part + global_part
target_encoding

0     0.334145
1     0.497844
2     0.334145
3     0.497844
4     0.334145
5     0.334145
6     0.497844
7     0.334145
8     0.497844
9     0.334145
10    0.727273
Name: color, dtype: float64

In [20]:
# Rebind `ce_target` to a fresh TargetEncoder and encode only the 'color' column of
# the toy frame with 'outcome' as the target, for comparison with the manually
# computed `target_encoding`.
ce_target = ce.TargetEncoder()
ce_target.fit_transform(data['color'], data['outcome'])

Unnamed: 0,color
0,0.334145
1,0.497844
2,0.334145
3,0.497844
4,0.334145
5,0.334145
6,0.497844
7,0.334145
8,0.497844
9,0.334145


## 5. WeightOfEvidence

In [21]:
# Weight-of-Evidence encoding of the consumer frame with '구매' as the target.
# NOTE(review): `df` still contains '구매', so the target column is also passed inside X
# (the TargetEncoder cell drops it first) — confirm this is intended.
# NOTE(review): despite its name, `ce_leave` is bound to a WOEEncoder.
ce_leave = ce.WOEEncoder()
ce_leave.fit_transform(df, df['구매'])         

Unnamed: 0,성별,지역,직업,학력,주거형태,결혼,연령대,구매
0,-0.226623,0.043415,0.161529,0.171007,-0.293808,0.017741,0.067699,1
1,-0.226623,0.043415,0.161529,0.171007,-0.005215,0.017741,0.067699,0
2,-0.226623,0.043415,0.161529,0.171007,0.091877,0.017741,0.067699,1
3,-0.226623,0.043415,0.161529,0.171007,-0.005215,0.017741,0.067699,0
4,-0.226623,0.043415,0.161529,0.171007,-0.005215,0.017741,0.067699,0
...,...,...,...,...,...,...,...,...
3992,0.216373,-0.037517,-0.193598,-0.205865,-0.005215,0.036825,-0.029428,1
3993,0.216373,-0.037517,0.223526,-0.081145,-0.005215,-0.011365,-0.148298,1
3994,0.216373,-0.037517,-0.193598,-0.205865,0.091877,-0.011365,-0.029428,1
3995,0.216373,-0.037517,-0.193598,-0.205865,-0.005215,0.036825,-0.148298,1


In [22]:
# Rebind `ce_leave` to a fresh WOEEncoder and encode the 'color' and 'weather'
# columns of the toy frame with 'outcome' as the target.
ce_leave = ce.WOEEncoder()
ce_leave.fit_transform(data[['color', 'weather']], data['outcome']) 

Unnamed: 0,color,weather
0,-0.377294,-0.089612
1,0.133531,-0.271934
2,-0.377294,-0.089612
3,0.133531,-0.089612
4,-0.377294,-0.271934
5,-0.377294,0.0
6,0.133531,-0.089612
7,-0.377294,-0.089612
8,0.133531,-0.271934
9,-0.377294,-0.089612


# End