In [None]:
# Reload the autoreload extension and use mode 2, which re-imports edited modules
# before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from matplotlib import pyplot as plt
from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe, fmin
from matplotlib import rcParams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from pathlib import Path
import pandas as pd
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import seaborn as sns
import warnings

In [None]:
# Notebook-wide plotting and display configuration.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully qualified option name: the bare 'max_columns' pattern is ambiguous on
# newer pandas (it also matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# NOTE: this silences every warning for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
# Mount Google Drive inside the Colab runtime; the project paths configured later in this
# notebook all live under '/content/drive'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# All project folders hang off one Google Drive directory.
base_dir = Path('/content/drive/My Drive/OpenSource/final')
data_dir, feature_dir, val_dir, tst_dir, sub_dir = (
    base_dir / folder for folder in ('data', 'feature', 'val', 'tst', 'sub')
)

# Input CSV files located in the data directory.
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Experiment constants: label column, number of CV folds, number of classes, RNG seed.
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

In [None]:
# The experiment is identified by algorithm + feature set; the combined model name is
# used to build the prediction and submission file names.
algo_name, feature_name = 'lgbm', 'tfidf'
model_name = f'{algo_name}_{feature_name}'

# File locations derived from the names above.
feature_file = feature_dir.joinpath(f'{feature_name}.csv')
p_val_file = val_dir.joinpath(f'{model_name}.val.csv')
p_tst_file = tst_dir.joinpath(f'{model_name}.tst.csv')
sub_file = sub_dir.joinpath(f'{model_name}.csv')

In [None]:
# Read the training CSV, using its first column as the row index.
trn = pd.read_csv(trn_file, index_col=0)
print(trn.shape)
# Last expression of the cell: displays the first rows as a preview.
trn.head()

(54879, 2)


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [None]:
# Read the test CSV (text only, no label column), using its first column as the row index.
tst = pd.read_csv(tst_file, index_col=0)
print(tst.shape)
# Last expression of the cell: displays the first rows as a preview.
tst.head()

(19617, 1)


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer

In [None]:
import nltk

# Fetch the NLTK data used by the vectorizer cells: the English stop-word list and the
# Punkt models behind `word_tokenize`.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource instead of the
# pickled 'punkt' models; without it `word_tokenize` raises LookupError. Downloading it
# is harmless on older releases.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Bag-of-words counts: NLTK-tokenised unigrams and bigrams, English stop words passed to
# the vectorizer, keeping only terms found in at least 100 training documents.
# NOTE(review): X_cnt is not used by any later cell visible in this notebook, and `vec`
# is reassigned by the TF-IDF cell.
english_stop_words = stopwords.words('english')
vec = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=english_stop_words,
    ngram_range=(1, 2),
    min_df=100,
)
X_cnt = vec.fit_transform(trn['text'])
print(X_cnt.shape)

(54879, 2685)


In [None]:
# TF-IDF features used for modelling: NLTK-tokenised 1- to 3-grams that occur in at
# least 50 training documents. Stop-word removal is left disabled here
# (stop_words=stopwords.words('english') was the option tried).
tfidf_options = dict(tokenizer=word_tokenize, ngram_range=(1, 3), min_df=50)
vec = TfidfVectorizer(**tfidf_options)
# Vocabulary and IDF weights are fitted on the training text only, then reused for test.
X = vec.fit_transform(trn['text'])
X_tst = vec.transform(tst['text'])
print(X.shape, X_tst.shape)

(54879, 12250) (19617, 12250)


In [None]:
# Target labels as a NumPy array. Read the column through the configured `target_col`
# instead of a hard-coded attribute so the label name is defined in one place.
y = trn[target_col].values
y.shape

(54879,)

### Hyperparameter Tuning

In [None]:
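# NOTE: the hyperparameter search in this section is disabled (commented out). The training
# cell further down hard-codes LightGBM parameters that appear to be the result of this search.
#X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=.2, random_state=seed)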

In [None]:
#params = {
#    "objective": "multiclass",
#    "n_estimators": 1000,
#    "subsample_freq": 1,
#    "random_state": seed,
#    "n_jobs": -1,
#}

#space = {
#    "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
#    "num_leaves": hp.choice("num_leaves", [15, 31, 63, 127]),
#    "colsample_bytree": hp.quniform("colsample_bytree", .5, .9, 0.1),
#    "subsample": hp.quniform("subsample", .5, .9, 0.1),
#    "min_child_samples": hp.choice('min_child_samples', [10, 25, 100])
#}

In [None]:

#def objective(hyperparams):
#    model = lgb.LGBMClassifier(**params, **hyperparams)
#    model.fit(X=X_trn, y=y_trn,
#              eval_set=[(X_val, y_val)],
#              eval_metric="multi_logloss",
#              early_stopping_rounds=10,
#              verbose=False)
#    score = model.best_score_["valid_0"]["multi_logloss"]

#    return {'loss': score, 'status': STATUS_OK, 'model': model}

#trials = Trials()
#best = fmin(fn=objective, space=space, trials=trials,
#            algo=tpe.suggest, max_evals=10, verbose=1)

#hyperparams = space_eval(space, best)
#n_best = trials.best_trial['result']['model'].best_iteration_
#params.update(hyperparams)
#print(params)

### Model Training

In [None]:
# Stratified K-fold splitter with `n_fold` shuffled folds; seeded so the splits are reproducible.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [None]:
# Out-of-fold class probabilities for the training rows and fold-averaged class
# probabilities for the test rows.
p_val = np.zeros((X.shape[0], n_class))
p_tst = np.zeros((X_tst.shape[0], n_class))

# LightGBM settings shared by every fold. `random_state` uses the notebook-wide `seed`
# (same value as the previously hard-coded 42) so there is a single source of truth.
lgb_params = dict(
    objective='multiclass',
    n_estimators=1000,
    subsample_freq=1,
    random_state=seed,
    n_jobs=-1,
    colsample_bytree=0.6000000000000001,
    learning_rate=0.028317262910717286,
    min_child_samples=10,
    num_leaves=127,
    subsample=0.7000000000000001,
)

for i_cv, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
    clf = lgb.LGBMClassifier(**lgb_params)
    # Early stopping is requested through the callback API: the `early_stopping_rounds`
    # keyword of `fit` was removed in LightGBM 4.0, while the callback works on old and
    # new releases alike. Training stops after 10 rounds without improvement on the
    # held-out fold.
    clf.fit(X[i_trn], y[i_trn],
            eval_set=[(X[i_val], y[i_val])],
            eval_metric='multiclass',
            callbacks=[lgb.early_stopping(stopping_rounds=10)])
    # Each training row is predicted exactly once, by the model that did not see it.
    p_val[i_val, :] = clf.predict_proba(X[i_val])
    # Average the test predictions over the folds. The divisor must be the number of
    # folds, not the number of classes (the two only coincide because both are 5).
    p_tst += clf.predict_proba(X_tst) / n_fold

[1]	valid_0's multi_logloss: 1.54809
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's multi_logloss: 1.52663
[3]	valid_0's multi_logloss: 1.50574
[4]	valid_0's multi_logloss: 1.48633
[5]	valid_0's multi_logloss: 1.46816
[6]	valid_0's multi_logloss: 1.45107
[7]	valid_0's multi_logloss: 1.43451
[8]	valid_0's multi_logloss: 1.41779
[9]	valid_0's multi_logloss: 1.40154
[10]	valid_0's multi_logloss: 1.3868
[11]	valid_0's multi_logloss: 1.37264
[12]	valid_0's multi_logloss: 1.35867
[13]	valid_0's multi_logloss: 1.34465
[14]	valid_0's multi_logloss: 1.33137
[15]	valid_0's multi_logloss: 1.31851
[16]	valid_0's multi_logloss: 1.30659
[17]	valid_0's multi_logloss: 1.2945
[18]	valid_0's multi_logloss: 1.28333
[19]	valid_0's multi_logloss: 1.27231
[20]	valid_0's multi_logloss: 1.26184
[21]	valid_0's multi_logloss: 1.25118
[22]	valid_0's multi_logloss: 1.24081
[23]	valid_0's multi_logloss: 1.2303
[24]	valid_0's multi_logloss: 1.22042
[25]	valid_0's multi_logloss: 1.211
[2

In [None]:
# Score the out-of-fold predictions: accuracy of the arg-max class, then multi-class
# log loss against the one-hot encoded labels.
cv_pred_labels = np.argmax(p_val, axis=1)
cv_accuracy = accuracy_score(y, cv_pred_labels)
print(f'Accuracy (CV): {cv_accuracy * 100:8.4f}%')

y_one_hot = pd.get_dummies(y)
cv_log_loss = log_loss(y_one_hot, p_val)
print(f'Log Loss (CV): {cv_log_loss:8.4f}')

Accuracy (CV):  80.5098%
Log Loss (CV):   0.5359


In [None]:
# Persist the out-of-fold and test probabilities as comma-separated text with 6 decimals.
# The target folders are created first so that a missing directory cannot make the save
# fail after the (long) training run.
for out_file, preds in ((p_val_file, p_val), (p_tst_file, p_tst)):
    out_file.parent.mkdir(parents=True, exist_ok=True)
    np.savetxt(out_file, preds, fmt='%.6f', delimiter=',')

In [None]:
# Read the sample submission (first column as row index) to reuse its index and columns.
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
# Last expression of the cell: displays the first rows as a preview.
sub.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [None]:
# Overwrite every column of the sample submission with the fold-averaged test
# probabilities. This relies on p_tst having one column per submission column, in the
# same order, and one row per submission row.
sub[sub.columns] = p_tst
sub.head()

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0245,0.5343,0.4226,0.0164,0.0023
1,0.0201,0.9497,0.0036,0.0151,0.0114
2,0.9564,0.016,0.0203,0.0048,0.0026
3,0.004,0.0007,0.9913,0.001,0.003
4,0.9298,0.0243,0.0223,0.0152,0.0083


In [None]:
# Write the submission CSV; create the output folder first so a missing directory does
# not make the final save fail.
sub_file.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(sub_file)