# 최종 코드

## 라이브러리

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from matplotlib import rcParams, pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
import lightgbm as lgb
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, GRU, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer
import warnings 
warnings.filterwarnings(action='ignore')

## 설정

In [3]:
# Notebook-wide display settings for matplotlib figures and pandas output.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully qualified option name: on pandas >= 1.4 the bare pattern
# 'max_columns' also matches 'styler.render.max_columns', which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Suppress all warnings for the rest of the session.
warnings.simplefilter('ignore')

In [4]:
# Input data location and build-artifact directories (relative paths).
data_dir = Path('../data/dacon-author-classification')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')

# Make sure every output directory exists, creating missing parents as needed.
dirs = [feature_dir, val_dir, tst_dir, sub_dir]
for d in dirs:
    d.mkdir(parents=True, exist_ok=True)

# Files inside the data directory.
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')
# TF Hub module URL; no cell visible in this notebook reads it.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"

# Modelling constants shared by the cells below.
target_col = 'author'   # label column in the training CSV
n_fold = 5              # number of cross-validation folds
n_class = 5             # number of target classes
seed = 42               # random seed for the CV splitter and the model

In [5]:
# Name of this run: "<algorithm>_<feature set>".
algo_name = 'lgbm'
feature_name = 'stacking_v4'
model_name = f'{algo_name}_{feature_name}'

# Artifact paths derived from the names above.
feature_file = feature_dir.joinpath(f'{feature_name}.csv')
p_val_file = val_dir.joinpath(f'{model_name}.val.csv')
p_tst_file = tst_dir.joinpath(f'{model_name}.tst.csv')
sub_file = sub_dir.joinpath(f'{model_name}.csv')

## 데이터 불러오기

In [6]:
# Read the raw train and test tables, using the first CSV column as the index.
# NOTE(review): `trn` and `tst` are rebound to lists/arrays in the stacking
# feature cell, so these DataFrames are not used by any later visible cell.
trn, tst = (pd.read_csv(csv_path, index_col=0) for csv_path in (trn_file, tst_file))

## 스태킹 피처 생성

In [7]:
# Build the stacking feature matrices: for every base model, load its saved
# validation and test predictions and concatenate them column-wise.
# (Presumably each file holds one probability column per class -- the check
# below enforces the column count.)
model_names = ['bert_n100_v2', 'cnn_emb', 'gru_glove', 'lgbm_tfidf', 'lr_tfidf', 'mta_emb', 'naiveBayse_tfidf', 'RF_tfidf', "MLPC_tfidf"]
trn = []
tst = []
feature_names = []
for model in model_names:
    # ndmin=2 keeps a single-row file two-dimensional so hstack joins columns.
    val_pred = np.loadtxt(val_dir / f'{model}.val.csv', delimiter=',', ndmin=2)
    tst_pred = np.loadtxt(tst_dir / f'{model}.tst.csv', delimiter=',', ndmin=2)
    # Fail early if a file does not carry exactly one column per class;
    # otherwise feature names and columns would silently go out of step.
    for split, pred in (('val', val_pred), ('tst', tst_pred)):
        if pred.shape[1] != n_class:
            raise ValueError(
                f'{model}.{split}.csv has {pred.shape[1]} columns, expected {n_class}'
            )
    trn.append(val_pred)
    tst.append(tst_pred)
    # One feature name per class, derived from n_class instead of a fixed list.
    feature_names += [f'{model}_class{k}' for k in range(n_class)]

trn = np.hstack(trn)
tst = np.hstack(tst)
feature_names


['bert_n100_v2_class0',
 'bert_n100_v2_class1',
 'bert_n100_v2_class2',
 'bert_n100_v2_class3',
 'bert_n100_v2_class4',
 'cnn_emb_class0',
 'cnn_emb_class1',
 'cnn_emb_class2',
 'cnn_emb_class3',
 'cnn_emb_class4',
 'gru_glove_class0',
 'gru_glove_class1',
 'gru_glove_class2',
 'gru_glove_class3',
 'gru_glove_class4',
 'lgbm_tfidf_class0',
 'lgbm_tfidf_class1',
 'lgbm_tfidf_class2',
 'lgbm_tfidf_class3',
 'lgbm_tfidf_class4',
 'lr_tfidf_class0',
 'lr_tfidf_class1',
 'lr_tfidf_class2',
 'lr_tfidf_class3',
 'lr_tfidf_class4',
 'mta_emb_class0',
 'mta_emb_class1',
 'mta_emb_class2',
 'mta_emb_class3',
 'mta_emb_class4',
 'naiveBayse_tfidf_class0',
 'naiveBayse_tfidf_class1',
 'naiveBayse_tfidf_class2',
 'naiveBayse_tfidf_class3',
 'naiveBayse_tfidf_class4',
 'RF_tfidf_class0',
 'RF_tfidf_class1',
 'RF_tfidf_class2',
 'RF_tfidf_class3',
 'RF_tfidf_class4',
 'MLPC_tfidf_class0',
 'MLPC_tfidf_class1',
 'MLPC_tfidf_class2',
 'MLPC_tfidf_class3',
 'MLPC_tfidf_class4']

In [8]:
# Target labels: read only the index and target columns of the training CSV,
# then flatten the resulting single-column frame into a 1-D array.
y = (
    pd.read_csv(trn_file, usecols=['index', target_col], index_col=0)
    .to_numpy()
    .flatten()
)

## 모델 학습

In [9]:
### LightGBM 모델 학습

In [10]:
# Stratified, shuffled K-fold splitter; seeded so the fold assignment is repeatable.
cv = StratifiedKFold(
    random_state=seed,
    shuffle=True,
    n_splits=n_fold,
)

In [11]:
# Train one LightGBM classifier per CV fold on the stacked features.
#   p_val: class probabilities for each training row, filled in from the fold
#          in which that row was held out.
#   p_tst: running mean of the per-fold class probabilities on the test set.
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(objective = 'multiclass', 
                             n_estimators = 1000, 
                             subsample_freq = 1,
                             random_state = seed,
                             n_jobs = -1,
                             colsample_bytree = 0.6000000000000001,
                             learning_rate = 0.028317262910717286,
                             min_child_samples = 10,
                             num_leaves = 127,
                             subsample = 0.7000000000000001)
    # Early stopping is requested through a callback: the `early_stopping_rounds`
    # keyword of fit() was deprecated in LightGBM 3.3 and removed in 4.0, while
    # the callback form is accepted by both older and newer releases.
    # (On LightGBM >= 4, add lgb.log_evaluation(...) to the list if
    # per-iteration metric output is wanted.)
    clf.fit(trn[i_trn], y[i_trn],
            eval_set=[(trn[i_val], y[i_val])],
            eval_metric='multiclass',
            callbacks=[lgb.early_stopping(stopping_rounds=10)])
    
    # Held-out predictions for this fold; each fold contributes 1/n_fold
    # of the final test prediction.
    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    p_tst += clf.predict_proba(tst) / n_fold

training model for CV #1
[1]	valid_0's multi_logloss: 1.49117
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 1.42059
[3]	valid_0's multi_logloss: 1.35727
[4]	valid_0's multi_logloss: 1.30072
[5]	valid_0's multi_logloss: 1.24939
[6]	valid_0's multi_logloss: 1.20131
[7]	valid_0's multi_logloss: 1.15789
[8]	valid_0's multi_logloss: 1.11771
[9]	valid_0's multi_logloss: 1.07989
[10]	valid_0's multi_logloss: 1.04502
[11]	valid_0's multi_logloss: 1.01214
[12]	valid_0's multi_logloss: 0.98185
[13]	valid_0's multi_logloss: 0.952652
[14]	valid_0's multi_logloss: 0.925185
[15]	valid_0's multi_logloss: 0.899705
[16]	valid_0's multi_logloss: 0.87588
[17]	valid_0's multi_logloss: 0.852994
[18]	valid_0's multi_logloss: 0.831611
[19]	valid_0's multi_logloss: 0.811415
[20]	valid_0's multi_logloss: 0.791904
[21]	valid_0's multi_logloss: 0.773687
[22]	valid_0's multi_logloss: 0.756379
[23]	valid_0's multi_logloss: 0.740182
[24]	valid_0's multi_logloss: 0.724677


[47]	valid_0's multi_logloss: 0.531671
[48]	valid_0's multi_logloss: 0.527019
[49]	valid_0's multi_logloss: 0.522292
[50]	valid_0's multi_logloss: 0.517936
[51]	valid_0's multi_logloss: 0.513827
[52]	valid_0's multi_logloss: 0.509924
[53]	valid_0's multi_logloss: 0.506229
[54]	valid_0's multi_logloss: 0.502617
[55]	valid_0's multi_logloss: 0.499088
[56]	valid_0's multi_logloss: 0.495729
[57]	valid_0's multi_logloss: 0.492329
[58]	valid_0's multi_logloss: 0.489297
[59]	valid_0's multi_logloss: 0.486331
[60]	valid_0's multi_logloss: 0.48338
[61]	valid_0's multi_logloss: 0.480674
[62]	valid_0's multi_logloss: 0.478085
[63]	valid_0's multi_logloss: 0.475614
[64]	valid_0's multi_logloss: 0.473341
[65]	valid_0's multi_logloss: 0.471083
[66]	valid_0's multi_logloss: 0.468849
[67]	valid_0's multi_logloss: 0.466827
[68]	valid_0's multi_logloss: 0.465085
[69]	valid_0's multi_logloss: 0.463098
[70]	valid_0's multi_logloss: 0.461215
[71]	valid_0's multi_logloss: 0.459575
[72]	valid_0's multi_loglo

[86]	valid_0's multi_logloss: 0.42893
[87]	valid_0's multi_logloss: 0.428027
[88]	valid_0's multi_logloss: 0.427162
[89]	valid_0's multi_logloss: 0.426373
[90]	valid_0's multi_logloss: 0.425573
[91]	valid_0's multi_logloss: 0.424817
[92]	valid_0's multi_logloss: 0.424134
[93]	valid_0's multi_logloss: 0.423536
[94]	valid_0's multi_logloss: 0.42284
[95]	valid_0's multi_logloss: 0.422328
[96]	valid_0's multi_logloss: 0.421664
[97]	valid_0's multi_logloss: 0.421193
[98]	valid_0's multi_logloss: 0.420574
[99]	valid_0's multi_logloss: 0.420031
[100]	valid_0's multi_logloss: 0.419497
[101]	valid_0's multi_logloss: 0.418997
[102]	valid_0's multi_logloss: 0.41863
[103]	valid_0's multi_logloss: 0.418282
[104]	valid_0's multi_logloss: 0.417869
[105]	valid_0's multi_logloss: 0.417554
[106]	valid_0's multi_logloss: 0.417125
[107]	valid_0's multi_logloss: 0.416762
[108]	valid_0's multi_logloss: 0.416438
[109]	valid_0's multi_logloss: 0.416055
[110]	valid_0's multi_logloss: 0.415635
[111]	valid_0's m

[129]	valid_0's multi_logloss: 0.412278
[130]	valid_0's multi_logloss: 0.412179
[131]	valid_0's multi_logloss: 0.412117
[132]	valid_0's multi_logloss: 0.411958
[133]	valid_0's multi_logloss: 0.411927
[134]	valid_0's multi_logloss: 0.411811
[135]	valid_0's multi_logloss: 0.411778
[136]	valid_0's multi_logloss: 0.411585
[137]	valid_0's multi_logloss: 0.411438
[138]	valid_0's multi_logloss: 0.411457
[139]	valid_0's multi_logloss: 0.411445
[140]	valid_0's multi_logloss: 0.411392
[141]	valid_0's multi_logloss: 0.411329
[142]	valid_0's multi_logloss: 0.41135
[143]	valid_0's multi_logloss: 0.411289
[144]	valid_0's multi_logloss: 0.411175
[145]	valid_0's multi_logloss: 0.411285
[146]	valid_0's multi_logloss: 0.411325
[147]	valid_0's multi_logloss: 0.411262
[148]	valid_0's multi_logloss: 0.411283
[149]	valid_0's multi_logloss: 0.411327
[150]	valid_0's multi_logloss: 0.411303
[151]	valid_0's multi_logloss: 0.411332
[152]	valid_0's multi_logloss: 0.41138
[153]	valid_0's multi_logloss: 0.411396
[1

In [12]:
# Score the held-out-fold predictions: top-1 accuracy first, then log loss
# against the one-hot encoded labels.
cv_accuracy_pct = accuracy_score(y, p_val.argmax(axis=1)) * 100
print(f'Accuracy (CV): {cv_accuracy_pct:8.4f}%')
cv_log_loss = log_loss(pd.get_dummies(y), p_val)
print(f'Log Loss (CV): {cv_log_loss:8.4f}')

Accuracy (CV):  85.3405%
Log Loss (CV):   0.4131


### bert_v1
Accuracy (CV):  84.9760%
Log Loss (CV):   0.4195

### bert_v2
Accuracy (CV):  85.0052%
Log Loss (CV):   0.4183

### No_bert
Accuracy (CV):  84.7756%
Log Loss (CV):   0.4247

In [13]:
# Saving the CV and test predictions is currently disabled; uncomment the
# lines below to write them to p_val_file and p_tst_file.
# np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
# np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

In [14]:
# Load the sample submission (first CSV column becomes the index), report its
# shape, and preview the first rows.
sub = pd.read_csv(
    sample_file,
    index_col=0,
)
print(sub.shape)
sub.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [15]:
# Overwrite every column of the submission template with the averaged test
# predictions, then preview the result.
sub[list(sub.columns)] = p_tst
sub.head()

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0119,0.5454,0.4242,0.0145,0.0039
1,0.0073,0.9764,0.0015,0.0033,0.0115
2,0.9958,0.001,0.0011,0.0011,0.001
3,0.0044,0.0054,0.9817,0.003,0.0055
4,0.9793,0.0061,0.0035,0.0069,0.0042


In [16]:
# Write the filled-in submission table (index included) to the submission file.
sub.to_csv(path_or_buf=sub_file)