In [5]:
import pickle
from sklearn.linear_model import LogisticRegression

# Load the preprocessed dataset. 'rb' opens the file for reading in binary
# mode, which is what pickle requires.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load 'processed.pickle' if it comes from a trusted source.
with open('processed.pickle', 'rb') as file_handle:
    loaded_objects = pickle.load(file_handle)

# The pickle holds exactly three objects, unpacked in this order.
vocabulary, features, labels = loaded_objects

In [6]:
# Split the data into training and evaluation halves.
# The first 50% of the rows is used for training and the remainder for
# evaluation; the original row order is kept (no shuffling).
total_number = len(labels)
middle_index = total_number // 2

# The same two slice objects are applied to features and labels so that
# both stay aligned row-for-row.
first_half = slice(None, middle_index)
second_half = slice(middle_index, None)

train_features, train_labels = features[first_half, :], labels[first_half]
test_features, test_labels = features[second_half, :], labels[second_half]

In [7]:
# The TF-IDF features are stored in compressed form as a sparse matrix;
# display the first five rows to see how the object is represented.
train_features[:5]

<5x8713 sparse matrix of type '<class 'numpy.float64'>'
	with 64 stored elements in Compressed Sparse Row format>

* 위 출력은 희소행렬이 내부적으로 저장되는 형태이며, 0이 아닌 값(여기서는 64개)만 저장한다.

In [8]:
# Baseline classifier. A different model can be substituted here to try to
# improve performance.
classifier = LogisticRegression()
classifier.fit(train_features, train_labels)

# Mean accuracy on each split, computed before printing.
baseline_train_score = classifier.score(train_features, train_labels)
baseline_test_score = classifier.score(test_features, test_labels)
print('train accuracy: %4.4f' % baseline_train_score)
print('test accuracy: %4.4f' % baseline_test_score)

# Find which vocabulary entries influenced the decision the most.
# Each entry is ranked by the absolute value of its coefficient, so the sign
# of the weight is discarded and only its magnitude is reported.
weights = classifier.coef_[0, :]
pairs = sorted(
    ((abs(value), vocabulary[index]) for index, value in enumerate(weights)),
    key=lambda entry: entry[0],
    reverse=True,
)

# Print the 20 entries with the largest coefficient magnitude.
for magnitude, term in pairs[:20]:
    print('score %4.4f word: %s' % (magnitude, term))

train accuracy: 0.9659
test accuracy: 0.9480
score 4.0001 word: txt
score 3.4521 word: call
score 3.2910 word: free
score 2.7635 word: to
score 2.5109 word: claim
score 2.4553 word: www
score 2.4490 word: stop
score 2.4278 word: uk
score 2.2604 word: text
score 2.2016 word: 150p
score 2.1835 word: service
score 2.1580 word: mobile
score 1.9421 word: my
score 1.9399 word: prize
score 1.9165 word: chat
score 1.8700 word: me
score 1.8646 word: reply
score 1.8335 word: from
score 1.8333 word: 18
score 1.8222 word: or


## 성능 향상을 위해 선택적으로 적용할 수 있는 실험
* idf 점수를 빼고 모델 학습
* 성능이 향상되었다면 어떤 이유로 향상되었을지 생각해 보자

## 생각해 봤나요?

* 아래와 같은 메일제목은?

스팸 분류에 영향을 미치는 키워드가 너무 반복적으로 사용되었을 경우, IDF를 반영하면 해당 키워드의 가중치가 떨어져서 순수 빈도로 측정했을 때보다  
성능이 떨어질 수도 있다.

# 과제
* 위 모델에 LightGBM을 적용하여 정답률을 향상시키세요

In [None]:
# LightGBM parameter-tuning guide:
# https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html

In [None]:
!pip install lightgbm

In [16]:
import lightgbm as lgb
import numpy as np
from sklearn.metrics import accuracy_score

In [17]:
# Train a LightGBM model.
# Booster settings, passed to lgb.train unchanged.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,
    'verbose': -1
}

# Number of boosting iterations to run.
num_boost_rounds = 100

# Wrap the training split in LightGBM's Dataset container.
d_train = lgb.Dataset(train_features, label=train_labels)

# Fit the booster for the configured number of iterations.
model = lgb.train(params, train_set=d_train, num_boost_round=num_boost_rounds)

In [18]:
# Training accuracy.
# model.predict returns one score per row; scores of 0.5 or more are mapped
# to class 1 and everything else to class 0.
train_pred = model.predict(train_features)
train_pred_int = (train_pred >= 0.5).astype(int)
# accuracy_score's signature is (y_true, y_pred), so the labels go first.
# Accuracy itself is symmetric, but keeping the documented order avoids wrong
# results if an asymmetric metric is substituted later.
train_accuracy = accuracy_score(train_labels, train_pred_int)
print('train accuracy: %4.4f' % train_accuracy)

# Test-set predictions; they are thresholded and scored in a later cell.
test_pred = model.predict(test_features)

train accuracy: 1.0000


In [19]:
# Show the first five test labels for comparison with the predictions.
test_labels[:5]

[0, 0, 0, 0, 1]

In [20]:
# Show the raw values returned by model.predict for the test split
# (floating-point scores, not yet converted to 0/1 labels).
test_pred

array([2.30384754e-05, 4.06763161e-05, 3.15156401e-05, ...,
       2.32577518e-05, 3.24311430e-02, 8.38624126e-05])

In [27]:
# Threshold the scores at 0.5 to obtain 0/1 labels and preview the first ten.
(test_pred >= 0.5).astype(int)[:10]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [29]:
# Test accuracy: scores of 0.5 or more are mapped to class 1, the rest to 0.
test_pred_int = (test_pred >= 0.5).astype(int)
# accuracy_score's signature is (y_true, y_pred), so the labels go first.
# Accuracy itself is symmetric, but keeping the documented order avoids wrong
# results if an asymmetric metric is substituted later.
test_accuracy = accuracy_score(test_labels, test_pred_int)
print('test accuracy: %4.4f' % test_accuracy)

test accuracy: 0.9756


In [30]:
# Find the features that mattered most during training.
# importance_type='split' is requested from the booster; the scores are
# printed as integers below.
importance = model.feature_importance(importance_type='split')

# Pair every vocabulary index with its importance score and rank the pairs
# from the highest score to the lowest.
feature_importance = sorted(
    ((importance[idx], vocabulary[idx]) for idx in range(len(vocabulary))),
    key=lambda entry: entry[0],
    reverse=True,
)

# Print the 20 most important features.
for split_count, term in feature_importance[:20]:
    print('score %d word: %s' % (split_count, term))

score 177 word: call
score 106 word: for
score 98 word: to
score 93 word: your
score 84 word: txt
score 80 word: you
score 79 word: now
score 68 word: and
score 61 word: me
score 59 word: cost
score 54 word: free
score 54 word: service
score 53 word: chat
score 53 word: help
score 51 word: sms
score 51 word: uk
score 50 word: claim
score 48 word: reply
score 48 word: text
score 46 word: will
