In [3]:
import pickle
import lightgbm as lgb
import numpy as np
from sklearn.metrics import accuracy_score

# Load the preprocessed data; 'rb' opens the file in binary mode, which pickle needs.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# load 'processed.pickle' when it comes from a trusted source.
with open('processed.pickle', 'rb') as file_handle:
    # The pickled object is unpacked into exactly three values.
    vocabulary, features, labels = pickle.load(file_handle)

# Split the data in half by position: the first half is used for training,
# the second half for testing.
# NOTE(review): no shuffling happens here, so this presumes the stored row order
# is not grouped by label -- confirm against the preprocessing step.
total_number = len(labels)
middle_index = total_number // 2
train_features, test_features = features[:middle_index, :], features[middle_index:, :]
train_labels, test_labels = labels[:middle_index], labels[middle_index:]

# Wrap the training split in a LightGBM Dataset.
d_train = lgb.Dataset(train_features, label=train_labels)

# Binary classification with gradient-boosted decision trees, scored by log loss.
params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    learning_rate=0.1,
    num_leaves=31,
    max_depth=-1,  # non-positive value: tree depth is not limited
    verbose=-1,    # negative value: keep LightGBM's logging quiet
)

# Fit the model for 100 boosting rounds.
model = lgb.train(params, d_train, num_boost_round=100)

In [4]:
# Training accuracy: score the training rows, then turn the predicted values
# into 0/1 class labels with a 0.5 cut-off.
train_pred = model.predict(train_features)
train_pred_int = (train_pred >= 0.5).astype(int)
# accuracy_score is declared as (y_true, y_pred). Accuracy itself is symmetric,
# but keeping the documented order avoids wrong results if this metric is ever
# swapped for an asymmetric one such as precision or recall.
train_accuracy = accuracy_score(train_labels, train_pred_int)
print('train accuracy: %4.4f' % train_accuracy)

# Score the held-out test rows; their accuracy is computed in a later cell.
test_pred = model.predict(test_features)

train accuracy: 1.0000


In [5]:
# Peek at the first five ground-truth test labels.
test_labels[:5]

[0, 0, 0, 0, 1]

In [6]:
# Display the raw model outputs for the test rows (numpy abbreviates long arrays).
test_pred

array([3.14753013e-05, 6.11391423e-05, 3.33930763e-05, ...,
       2.31273016e-05, 9.45571436e-02, 7.95253768e-05])

In [7]:
# Apply the 0.5 cut-off and show the first ten resulting 0/1 predictions.
(test_pred >= 0.5).astype(int)[:10]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [8]:
# Test accuracy: turn the test-set model outputs into 0/1 labels with a 0.5 cut-off.
test_pred_int = (test_pred >= 0.5).astype(int)
# accuracy_score is declared as (y_true, y_pred). Accuracy itself is symmetric,
# but keeping the documented order avoids wrong results if this metric is ever
# swapped for an asymmetric one such as precision or recall.
test_accuracy = accuracy_score(test_labels, test_pred_int)
print('test accuracy: %4.4f' % test_accuracy)

# Find the features that matter most to the model: pair every vocabulary entry
# with the number of tree splits that use it, ordered from most to least used.
importance = model.feature_importance(importance_type='split')
feature_importance = sorted(
    ((importance[i], vocabulary[i]) for i in range(len(vocabulary))),
    key=lambda pair: pair[0],
    reverse=True,  # the sort is stable, so tied scores keep vocabulary order
)

# Print the 20 most important features.
for score, word in feature_importance[:20]:
    print('score %d word: %s' % (score, word))

test accuracy: 0.9717
score 158 word: call
score 117 word: for
score 99 word: your
score 95 word: now
score 88 word: to
score 80 word: txt
score 67 word: you
score 61 word: cost
score 54 word: me
score 53 word: chat
score 53 word: claim
score 52 word: free
score 52 word: help
score 52 word: service
score 52 word: with
score 51 word: reply
score 49 word: sms
score 49 word: text
score 46 word: or
score 46 word: uk
