In [30]:
import pickle
import lightgbm as lgb
import numpy as np
from sklearn.metrics import accuracy_score

# Load the preprocessed dataset. 'rb' opens the file in binary mode,
# which is what pickle needs to read its serialized bytes.
# NOTE(review): pickle.load can execute arbitrary code from the file —
# only load 'processed.pickle' if it comes from a trusted source.
with open('processed.pickle', mode='rb') as file_handle:
    payload = pickle.load(file_handle)

# The pickle holds a 3-item sequence: vocabulary, feature matrix, labels.
vocabulary, features, labels = payload

# Split the data into train and test sets: the first half of the rows is
# used for training, the second half for testing (no shuffling is applied).
total_number = len(labels)
middle_index = total_number // 2
train_features, test_features = (
    features[:middle_index, :],
    features[middle_index:, :],
)
train_labels, test_labels = labels[:middle_index], labels[middle_index:]

# Train the LightGBM model.
# Hyperparameters for a binary classifier evaluated with log loss.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,  # -1 leaves tree depth unrestricted (LightGBM convention)
    'verbose': -1,    # negative verbosity silences LightGBM's info/warning logs
}

# Wrap the training half in LightGBM's Dataset container.
d_train = lgb.Dataset(train_features, label=train_labels)

# Fit the model with 100 boosting rounds.
model = lgb.train(params, d_train, num_boost_round=100)

In [31]:
# Training accuracy: threshold the model outputs at 0.5 to obtain 0/1 labels.
train_pred = model.predict(train_features)
train_pred_int = (train_pred >= 0.5).astype(int)
# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
# Accuracy itself is symmetric, so the printed value is unchanged, but the
# canonical order keeps this safe to copy for asymmetric metrics.
train_accuracy = accuracy_score(train_labels, train_pred_int)
print('train accuracy: %4.4f' % train_accuracy)

# Predict on the test split; the test accuracy is computed from test_pred
# in a later cell.
test_pred = model.predict(test_features)

train accuracy: 1.0000


In [15]:
# Peek at the first five ground-truth labels of the test split.
test_labels[:5]

[0, 0, 0, 0, 1]

In [16]:
# Show the model's raw predictions for the test features
# (NumPy abbreviates long arrays with "..." when displaying them).
test_pred

array([2.30384754e-05, 4.06763161e-05, 3.15156401e-05, ...,
       2.32577518e-05, 3.24311430e-02, 8.38624126e-05])

In [17]:
# Preview the first ten test predictions after thresholding at 0.5
# (1 where the prediction is >= 0.5, otherwise 0).
(test_pred[:10] >= 0.5).astype(int)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [32]:
# Test accuracy: threshold the test predictions at 0.5 to obtain 0/1 labels.
test_pred_int = (test_pred >= 0.5).astype(int)
# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric, so the printed value is the same as before.
test_accuracy = accuracy_score(test_labels, test_pred_int)
print('test accuracy: %4.4f' % test_accuracy)

# Find the features that mattered most to the model, using the
# importance_type='split' scores reported by LightGBM.
importance = model.feature_importance(importance_type='split')

# Pair each score with its vocabulary entry (feature i <-> vocabulary[i]) and
# order the pairs from highest to lowest score. sorted() is stable, so
# entries with equal scores keep their original vocabulary order.
feature_importance = sorted(
    ((importance[i], vocabulary[i]) for i in range(len(vocabulary))),
    key=lambda pair: pair[0],
    reverse=True,
)

# Print the 20 most important features.
for score, word in feature_importance[:20]:
    print('score %d word: %s' % (score, word))

test accuracy: 0.9756
score 177 word: call
score 106 word: for
score 98 word: to
score 93 word: your
score 84 word: txt
score 80 word: you
score 79 word: now
score 68 word: and
score 61 word: me
score 59 word: cost
score 54 word: free
score 54 word: service
score 53 word: chat
score 53 word: help
score 51 word: sms
score 51 word: uk
score 50 word: claim
score 48 word: reply
score 48 word: text
score 46 word: will
