# 준비
라이브러리, 파라미터 세팅

In [9]:
import pandas as pd
import numpy as np
import os
import re

In [10]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

TEST_SIZE = 0.2
RANDOM_SEED = 42

# 데이터 불러오기

In [11]:
# 테스트(캐글) 데이터 로드
test_data = pd.read_csv(DATA_IN_PATH + 'ko_data.csv', encoding = 'cp949')
test_data.columns = ['id','document']  # 전처리 일괄 수행을 위해 컬럼명 변경(학습 데이터셋과 동일하게)
test_data.head()

Unnamed: 0,id,document
0,0,정말 많이 울었던 영화입니다.
1,1,시간 낭비예요.
2,2,포스터를 저렇게밖에 만들지 못했던 제작자의 소심함에 침을 뱉고 싶다.
3,3,지금 봐도 재미있는 영화!!! 코믹과 감동!!! 그리고 요리!!!
4,4,이걸 영화로 만드는 거야?얼마나 가는지 보자.


In [13]:
DATA_OUT_PATH

'./data_out/'

In [16]:
# KoELECTRA 예측 데이터 로드 - 기본1
predict_koelectra_base1 = pd.read_csv(DATA_OUT_PATH + 'NSMC_Ensemble_KoELECTRA_Base_090505.csv')
predict_koelectra_base1.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0


In [17]:
# KoELECTRA 예측 데이터 로드 - 기본2
predict_koelectra_base2 = pd.read_csv(DATA_OUT_PATH + 'NSMC_Ensemble_KoELECTRA_Base_090577.csv')
predict_koelectra_base2.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0


In [18]:
# KoELECTRA 예측 데이터 로드 - 기본3
predict_koelectra_base3 = pd.read_csv(DATA_OUT_PATH + 'NSMC_Ensemble_KoELECTRA_Base_090631.csv')
predict_koelectra_base3.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0


In [42]:
# KoELECTRA 예측 데이터 로드 - 기본+감성사전
predict_koelectra_SentiWordDict1 = pd.read_csv(DATA_OUT_PATH + 'NSMC_Ensemble_KoELECTRA_SentiWordDict_090184.csv')
predict_koelectra_SentiWordDict1.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0


In [43]:
# KoELECTRA 예측 데이터 로드 - 앙상블
predict_koelectra_ensemble1 = pd.read_csv(DATA_OUT_PATH + 'NSMC_Ensemble_KoELECTRA_Ensemble_090595.csv')
predict_koelectra_ensemble1.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0


# 기본 모델 (Baseline)
### 모델 앙상블(Majority Voting) 방식

In [44]:
# 테스트 데이터의 id 부분을 리스트 처리
test_id = list(test_data['id'])
len(test_id)

11187

In [45]:
# predict_koelectra_base1 데이터의 Predicted 부분을 리스트 처리
koelectra_base_predicted1 = list(predict_koelectra_base1['Predicted'])
len(koelectra_base_predicted1)

11187

In [46]:
# predict_koelectra_base2 데이터의 Predicted 부분을 리스트 처리
koelectra_base_predicted2 = list(predict_koelectra_base2['Predicted'])
len(koelectra_base_predicted2)

11187

In [47]:
# predict_koelectra_base3 데이터의 Predicted 부분을 리스트 처리
koelectra_base_predicted3 = list(predict_koelectra_base3['Predicted'])
len(koelectra_base_predicted3)

11187

In [48]:
# predict_koelectra_SentiWordDict1 데이터의 Predicted 부분을 리스트 처리
koelectra_SentiWordDict_predicted1 = list(predict_koelectra_SentiWordDict1['Predicted'])
len(koelectra_SentiWordDict_predicted1)

11187

In [49]:
# predict_koelectra_SentiWordDict1 데이터의 Predicted 부분을 리스트 처리
koelectra_ensemble_predicted1 = list(predict_koelectra_ensemble1['Predicted'])
len(koelectra_ensemble_predicted1)

11187

In [50]:
# 판다스 데이터프레임 통해 데이터 구성하여 output에 투입
result = pd.DataFrame( data={"Id": test_id, "KoELECTRA_BASE_1": koelectra_base_predicted1 \
                             ,"KoELECTRA_BASE_2": koelectra_base_predicted2 \
                             ,"KoELECTRA_BASE_3": koelectra_base_predicted3 \
                             ,"KoELECTRA_SWD_1": koelectra_SentiWordDict_predicted1 \
                             #,"KoELECTRA_ESM_1": koelectra_ensemble_predicted1 \
                            } )
result

Unnamed: 0,Id,KoELECTRA_BASE_1,KoELECTRA_BASE_2,KoELECTRA_BASE_3,KoELECTRA_SWD_1
0,0,1,1,1,1
1,1,0,0,0,0
2,2,0,0,0,0
3,3,1,1,1,1
4,4,0,0,0,0
...,...,...,...,...,...
11182,11182,1,1,1,1
11183,11183,0,0,0,0
11184,11184,1,1,1,1
11185,11185,1,1,1,1


In [52]:
result["Ensemble_Result"] = ((result["KoELECTRA_BASE_1"]+result["KoELECTRA_BASE_2"]+result["KoELECTRA_BASE_3"]+result["KoELECTRA_SWD_1"])/3).astype('int')
#result["Ensemble_Result"] = ((result["KoELECTRA_BASE_1"]+result["KoELECTRA_BASE_2"]+result["KoELECTRA_BASE_3"]+result["KoELECTRA_ESM_1"])/3).astype('int')
result.head(20)

Unnamed: 0,Id,KoELECTRA_BASE_1,KoELECTRA_BASE_2,KoELECTRA_BASE_3,KoELECTRA_SWD_1,Ensemble_Result
0,0,1,1,1,1,1
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,1,1,1,1,1
4,4,0,0,0,0,0
5,5,1,1,1,1,1
6,6,0,0,0,0,0
7,7,1,1,1,1,1
8,8,0,0,0,0,0
9,9,1,1,1,1,1


In [53]:
output = result[['Id','Ensemble_Result']].copy()
output = output.rename({'Ensemble_Result':'Predicted'}, axis='columns')
output.head()

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0


In [54]:
# 해당 경로가 없으면 생성
if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)

# csv파일 생성
output.to_csv(DATA_OUT_PATH + "NSMC_Ensemble_KoELECTRA_Ensemble.csv", index = False)  # 앙상블 결과, 캐글 제출 용도

### 캐글 제출 결과
**[2020.12.13~]**<br>
NSMC_Ensemble_KoELECTRA_Base_090505,<br>
NSMC_Ensemble_KoELECTRA_Base_090577,<br>
NSMC_Ensemble_KoELECTRA_Base_090631,<br>
NSMC_Ensemble_KoELECTRA_SentiWordDict_090362<br>
조합(4개) => 0.90595

NSMC_Ensemble_KoELECTRA_Base_090505,<br>
NSMC_Ensemble_KoELECTRA_Base_090577,<br>
NSMC_Ensemble_KoELECTRA_Base_090631,<br>
NSMC_Ensemble_KoELECTRA_SentiWordDict_090184<br>
조합(4개) => 0.90505

# (참고) 파일 앙상블
https://www.kaggle.com/yixinsunn/ensemble-submissions

In [138]:
"""
This script is used for competitions.
It ensembles previous .csv submissions
by averaging files, with a weight for each file.
The average method can be either arithmeitc or geometric.
"""

import os, glob, re
import numpy as np
import pandas as pd

In [170]:
path = './data_out/99_SubmissionFiles_for_Ensemble/'    # to be modified according to your directory, 
                     # e.g. set it os.getcwd() if .csv files are in current directory

# load data into a dict, <(str)filename, dataframe>
data = { f.split('/')[-1]: \
        pd.read_csv(f) for f in glob.glob(path + '/*.csv') }

# an alternative option to read data, using regular expression
#data = { re.search(r'([0-9A-Za-z._-]*?.csv)', f).group():
#        pd.read_csv(f) for f in glob.glob(path + '/*.csv') }
assert(len(data) > 0)
print('Loaded files:', list(data.keys()))

Loaded files: ['NSMC_Ensemble_KoELECTRA_Ensemble_090505.csv', 'NSMC_Ensemble_KoELECTRA_Base_090631.csv', 'NSMC_Ensemble_KoELECTRA_SentiWordDict_090184.csv', 'NSMC_Ensemble_KoELECTRA_SentiWordDict_090362.csv', 'NSMC_KoELECTRA_SentiWordDict_090112.csv', 'NSMC_Ensemble_KoELECTRA_Base_090273.csv', 'NSMC_Ensemble_KoELECTRA_Base_090505.csv', 'NSMC_Ensemble_KoELECTRA_Ensemble_090595.csv', 'NSMC_Ensemble_KoELECTRA_Base_090577.csv', 'new_submission_fn_090541.csv']


In [171]:
def ensemble(data, w, method='arithmetic'):
    """
    @params: data: a dict of dataframes, <(str)filename: dataframe>
             w: a dict of weights, <(str)filename: (int or float)weight>
             method: either arithmetic mean or geometric mean.
    @return: a new dataframe for submission
    """
    columns = data[list(data.keys())[0]].columns
    submission = pd.DataFrame({columns[0]: data[list(data.keys())[0]][columns[0]]})
    assert(method in ['arithmetic', 'geometric'])
    
    if method == 'arithmetic':
        submission[columns[1]] = 0.0
        for key in data.keys():
            submission[columns[1]] += data[key][columns[1]] * w[key]
        submission[columns[1]] /= sum(w.values())
    else:
        submission[columns[1]] = 1.0
        for key in data.keys():
            submission[columns[1]] *= data[key][columns[1]] ** w[key]
        submission[columns[1]] **= 1. / sum(w.values())
    
    return submission

In [172]:
# Enter weights here
w = { key: 1 for key in data.keys() }
for key in w:
    w[key] = float(input("Enter the weight for {}: ".format(key)))

print('\nWeights for each file:', w)

filename = './data_out/new_submission.csv'
new_submission = ensemble(data, w, method='arithmetic')
new_submission.to_csv(filename, index=False)
print('New submission file {} is now created'.format(filename))

Enter the weight for NSMC_Ensemble_KoELECTRA_Ensemble_090505.csv: 3
Enter the weight for NSMC_Ensemble_KoELECTRA_Base_090631.csv: 5
Enter the weight for NSMC_Ensemble_KoELECTRA_SentiWordDict_090184.csv: 1
Enter the weight for NSMC_Ensemble_KoELECTRA_SentiWordDict_090362.csv: 1
Enter the weight for NSMC_KoELECTRA_SentiWordDict_090112.csv: 1
Enter the weight for NSMC_Ensemble_KoELECTRA_Base_090273.csv: 1
Enter the weight for NSMC_Ensemble_KoELECTRA_Base_090505.csv: 3
Enter the weight for NSMC_Ensemble_KoELECTRA_Ensemble_090595.csv: 3
Enter the weight for NSMC_Ensemble_KoELECTRA_Base_090577.csv: 3
Enter the weight for new_submission_fn_090541.csv: 3

Weights for each file: {'NSMC_Ensemble_KoELECTRA_Ensemble_090505.csv': 3.0, 'NSMC_Ensemble_KoELECTRA_Base_090631.csv': 5.0, 'NSMC_Ensemble_KoELECTRA_SentiWordDict_090184.csv': 1.0, 'NSMC_Ensemble_KoELECTRA_SentiWordDict_090362.csv': 1.0, 'NSMC_KoELECTRA_SentiWordDict_090112.csv': 1.0, 'NSMC_Ensemble_KoELECTRA_Base_090273.csv': 1.0, 'NSMC_Ense

In [173]:
# 파일 앙상블 결과 데이터 로드
result_file_esm = pd.read_csv('./data_out/new_submission.csv')
result_file_esm.tail()

Unnamed: 0,Id,Predicted
11182,11182,1.0
11183,11183,0.0
11184,11184,1.0
11185,11185,1.0
11186,11186,0.0


In [174]:
len(result_file_esm)

11187

In [175]:
predicted_reset = []
for i in result_file_esm.iloc[:,1]:
    if i > 0.5:  i = 1
    else:  i = 0
    predicted_reset.append(i)

In [176]:
len(predicted_reset)

11187

In [177]:
result_file_esm['Predicted_New'] = predicted_reset

In [178]:
result_file_esm.drop('Predicted', axis=1, inplace=True)

In [179]:
result_file_esm.columns = ['Id','Predicted']
result_file_esm

Unnamed: 0,Id,Predicted
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0
...,...,...
11182,11182,1
11183,11183,0
11184,11184,1
11185,11185,1


In [180]:
result_file_esm.to_csv('./data_out/new_submission_fn.csv', index=False)

geometric, 최고 성능에 3 + 나머지에 1 가중치 부여 => 0.89951<br>
arithmetic, 최고 성능에 3 + 나머지에 1 가중치 부여 => 0.90541