# 준비
라이브러리, 파라미터 세팅

In [10]:
import pandas as pd
import numpy as np
import os
import re

In [11]:
# Settings that are not referenced by the cells shown in this notebook
# (presumably an RNG seed and a hold-out ratio -- confirm before relying on them).
RANDOM_SEED = 42
TEST_SIZE = 0.2

# DATA_OUT_PATH: directory the per-model prediction CSVs are read from and the ensemble CSV is written to.
# DATA_IN_PATH: directory the test data set is read from.
DATA_OUT_PATH = "./data_out/"
DATA_IN_PATH = "./data_in/"

# 데이터 불러오기

In [12]:
# Load the test (Kaggle) data set; the file is read with the CP949 encoding.
test_data = pd.read_csv(f"{DATA_IN_PATH}ko_data.csv", encoding="cp949")
# Rename the columns to match the training data set so the same
# preprocessing can be applied to both in one pass.
test_data.columns = ["id", "document"]
test_data.head()

Unnamed: 0,id,document
0,0,정말 많이 울었던 영화입니다.
1,1,시간 낭비예요.
2,2,포스터를 저렇게밖에 만들지 못했던 제작자의 소심함에 침을 뱉고 싶다.
3,3,지금 봐도 재미있는 영화!!! 코믹과 감동!!! 그리고 요리!!!
4,4,이걸 영화로 만드는 거야?얼마나 가는지 보자.


In [13]:
# Load the predictions of the KoELECTRA (base) model.
predict_koelectra = pd.read_csv(f"{DATA_OUT_PATH}NSMC_KoELECTRA.csv")
predict_koelectra.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0


In [14]:
# Load the predictions of the KoELECTRA (base + sentiment dictionary) model.
predict_koelectra_SentiWordDict = pd.read_csv(
    f"{DATA_OUT_PATH}NSMC_KoELECTRA_SentiWordDict.csv"
)
predict_koelectra_SentiWordDict.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0


In [15]:
# Load the predictions of the KoELECTRA (base + sentiment dictionary + Korean hate speech) model.
predict_koelectra_SentiWordDict_KHS = pd.read_csv(
    f"{DATA_OUT_PATH}NSMC_KoELECTRA_SentiWordDict_KHS.csv"
)
predict_koelectra_SentiWordDict_KHS.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0


In [16]:
# Load the predictions of the KoELECTRA (base + sentiment dictionary + Naver Shopping) model.
predict_koelectra_SentiWordDict_NaverShopping = pd.read_csv(
    f"{DATA_OUT_PATH}NSMC_KoELECTRA_SentiWordDict_NaverShopping.csv"
)
predict_koelectra_SentiWordDict_NaverShopping.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0


In [17]:
# Load the predictions of the KoELECTRA (base + sentiment dictionary + Steam reviews) model.
predict_koelectra_SentiWordDict_Steam = pd.read_csv(
    f"{DATA_OUT_PATH}NSMC_KoELECTRA_SentiWordDict_Steam.csv"
)
predict_koelectra_SentiWordDict_Steam.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0


# 기본 모델 (Baseline)
### 모델 앙상블(Majority Voting) 방식

In [18]:
# Pull the id column of the test data out as a plain Python list.
test_id = test_data["id"].tolist()
len(test_id)

11187

In [19]:
# Pull the Predicted column of the KoELECTRA (base) predictions out as a plain Python list.
koelectra_predicted = predict_koelectra["Predicted"].tolist()
len(koelectra_predicted)

11187

In [20]:
# Pull the Predicted column of the sentiment-dictionary model's predictions out as a plain Python list.
koelectra_SentiWordDict_predicted = predict_koelectra_SentiWordDict["Predicted"].tolist()
len(koelectra_SentiWordDict_predicted)

11187

In [21]:
# Pull the Predicted column of the sentiment-dictionary + KHS model's predictions out as a plain Python list.
koelectra_SentiWordDict_KHS_predicted = predict_koelectra_SentiWordDict_KHS["Predicted"].tolist()
len(koelectra_SentiWordDict_KHS_predicted)

11187

In [22]:
# Pull the Predicted column of the sentiment-dictionary + Naver Shopping model's predictions out as a plain Python list.
koelectra_SentiWordDict_NaverShopping_predicted = (
    predict_koelectra_SentiWordDict_NaverShopping["Predicted"].tolist()
)
len(koelectra_SentiWordDict_NaverShopping_predicted)

11187

In [23]:
# Pull the Predicted column of the sentiment-dictionary + Steam model's predictions out as a plain Python list.
koelectra_SentiWordDict_Steam_predicted = predict_koelectra_SentiWordDict_Steam["Predicted"].tolist()
len(koelectra_SentiWordDict_Steam_predicted)

11187

In [24]:
# Assemble one DataFrame with the test ids and one prediction column per model.
# The lists are combined positionally, so every list must be in the same row order as test_id.
# The Steam model (koelectra_SentiWordDict_Steam_predicted) is not part of this ensemble;
# add a "KoELECTRA_SWD_Steam" entry below to include it.
result = pd.DataFrame(
    data={
        "Id": test_id,
        "KoELECTRA": koelectra_predicted,
        "KoELECTRA_SWD": koelectra_SentiWordDict_predicted,
        "KoELECTRA_SWD_KHS": koelectra_SentiWordDict_KHS_predicted,
        "KoELECTRA_SWD_NaverShopping": koelectra_SentiWordDict_NaverShopping_predicted,
    }
)
result

Unnamed: 0,Id,KoELECTRA,KoELECTRA_SWD,KoELECTRA_SWD_KHS,KoELECTRA_SWD_NaverShopping
0,0,1,1,1,1
1,1,0,0,0,0
2,2,1,1,0,0
3,3,1,1,1,1
4,4,0,0,0,0
...,...,...,...,...,...
11182,11182,1,1,1,1
11183,11183,0,0,0,0
11184,11184,1,1,1,1
11185,11185,0,1,1,1


In [25]:
# Majority vote over every per-model prediction column, i.e. all columns except the id
# and a previously computed ensemble column (so this cell can be re-run safely).
model_columns = [col for col in result.columns if col not in ("Id", "Ensemble_Result")]

# Strict majority: more than half of the models must predict 1.
# With 4 models the threshold is 3 (a 2-2 tie yields 0), which matches the former
# hard-coded `(sum / 3).astype('int')`; with 5 models the threshold is 3 as well.
# Deriving the threshold from the column count removes the divisor that had to be
# edited by hand whenever a model was added to or removed from `result`.
# Assumes every prediction column holds only 0/1 labels.
votes_needed = len(model_columns) // 2 + 1
positive_votes = result[model_columns].sum(axis=1)
result["Ensemble_Result"] = (positive_votes >= votes_needed).astype('int')
result.head(20)

Unnamed: 0,Id,KoELECTRA,KoELECTRA_SWD,KoELECTRA_SWD_KHS,KoELECTRA_SWD_NaverShopping,Ensemble_Result
0,0,1,1,1,1,1
1,1,0,0,0,0,0
2,2,1,1,0,0,0
3,3,1,1,1,1,1
4,4,0,0,0,0,0
5,5,1,1,1,1,1
6,6,0,0,0,0,0
7,7,1,1,1,1,1
8,8,0,0,0,0,0
9,9,1,1,1,1,1


In [26]:
# Keep only the id and the ensemble vote, exposing the vote under the column name "Predicted".
output = result.loc[:, ['Id', 'Ensemble_Result']].rename(columns={'Ensemble_Result': 'Predicted'})
output.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0


In [27]:
# Create the output directory if it does not exist yet. exist_ok=True makes this a
# no-op when the directory is already there and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Write the ensemble result as a CSV without the index column (used for Kaggle submission).
output.to_csv(DATA_OUT_PATH + "NSMC_Ensemble_KoELECTRA_SentiWordDict.csv", index=False)

### 캐글 제출 결과
**[2020.12.09]**<br>
Logistic Regression, BERT, GPT2, KoELECTRA 조합(4개) => 0.89647<br>
Logistic Regression, BERT, GPT2, KoELECTRA, CNN 조합(5개) => 0.89308<br>

**[2020.12.10]**<br>
KoELECTRA (에폭3)(0.89397),<br>
KoELECTRA_NaverShopping_Steam (에폭4)(0.89629),<br>
KoELECTRA_SWD (에폭3)(0.90112)(best),<br>
KoELECTRA_SWD_KHS (에폭3)(0.89379)<br>
조합(4개) => 0.90273<br>

KoELECTRA (에폭4)(0.89772)(best),<br>
KoELECTRA_NaverShopping_Steam (에폭3)(0.89236),<br>
KoELECTRA_SWD (에폭3)(0.90112)(best),<br>
KoELECTRA_SWD_KHS (에폭4)(0.89880)(best)<br>
조합(4개) => 0.90505

**KoELECTRA (에폭4)(best),<br>
KoELECTRA_NaverShopping_Steam (에폭4)(best),<br>
KoELECTRA_SWD (에폭3)(best),<br>
KoELECTRA_SWD_KHS (에폭4)(best)<br>
조합(4개) => 0.90577**

**[2020.12.11]**<br>
KoELECTRA (에폭4)(best),<br>
KoELECTRA_SWD (에폭3)(best),<br>
KoELECTRA_SWD_KHS (에폭4)(best),<br>
KoELECTRA_SWD_NaverShopping (에폭3),<br>
KoELECTRA_SWD_Steam (에폭3)<br>
조합(5개) => 0.90184

**KoELECTRA (에폭4)(best),<br>
KoELECTRA_SWD (에폭3)(best),<br>
KoELECTRA_SWD_KHS (에폭4)(best),<br>
KoELECTRA_SWD_NaverShopping (에폭3)<br>
조합(4개) => 0.90362**

감성사전을 적용한 모델 위주로 구성하는 것보다, 감성사전을 적용한 모델과 적용하지 않은 모델을<br>
적절히 섞어 앙상블할 때 성능이 더 좋게 나옴(캐글 점수 기준)

**[2020.12.12]**<br>
**KoELECTRA(에폭4)(best),<br>
KoELECTRA_NaverShopping(에폭4)(best),<br>
KoELECTRA_SentiWordDict(에폭3)(best),<br>
KoELECTRA_SentiWordDict_KHS(에폭4)(best)<br>
조합(4개) => 0.90631 (Best!!)**