In [1]:
########################################################
# <실습 15> 개체명(NER) 인식 모델 학습 및 모델 저장 
########################################################

In [2]:
import sys
sys.path.append('c:/chatbot')

import tensorflow as tf
from tensorflow.keras import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np
from utils.Preprocess import Preprocess

In [3]:
# 학습 파일 불러오기
def read_file(file_name):
    sents = []
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for idx, l in enumerate(lines):
            if l[0] == ';' and lines[idx + 1][0] == '$':
                this_sent = []
            elif l[0] == '$' and lines[idx - 1][0] == ';':
                continue
            elif l[0] == '\n':
                sents.append(this_sent)
            else:
                this_sent.append(tuple(l.split()))
    return sents

In [4]:
# 전처리 객체 생성
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin',
               userdic='../../utils/user_dic.tsv')

# 학습용 말뭉치 데이터를 불러옴
corpus = read_file('ner_train.txt')
corpus

[[('1', '가락지빵', 'NNG', 'B_FOOD'),
  ('2', '주문', 'NNP', 'O'),
  ('3', '하', 'VV', 'O'),
  ('4', '고', 'EC', 'O'),
  ('5', '싶', 'VX', 'O'),
  ('6', '어요', 'EC', 'O')],
 [('1', '가락지빵', 'NNG', 'B_FOOD'),
  ('2', '먹', 'VV', 'O'),
  ('3', '고', 'EC', 'O'),
  ('4', '싶', 'VX', 'O'),
  ('5', '어요', 'EC', 'O')],
 [('1', '가락지빵', 'NNG', 'B_FOOD'),
  ('2', '하', 'VV', 'O'),
  ('3', '고', 'EC', 'O'),
  ('4', '싶', 'VX', 'O'),
  ('5', '어요', 'EC', 'O')],
 [('1', '가락지빵', 'NNG', 'B_FOOD'),
  ('2', '원', 'NNB', 'O'),
  ('3', '하', 'XSA', 'O'),
  ('4', '어요', 'EC', 'O')],
 [('1', '가락지빵', 'NNG', 'B_FOOD'),
  ('2', '어떻', 'VA', 'O'),
  ('3', '게', 'EC', 'O'),
  ('4', '하', 'VV', 'O'),
  ('5', '아야', 'EC', 'O'),
  ('6', '되', 'VV', 'O'),
  ('7', '요', 'EF', 'O'),
  ('8', '?', 'SF', 'O')],
 [('1', '가락지빵', 'NNG', 'B_FOOD'),
  ('2', '어떻', 'VA', 'O'),
  ('3', '게', 'EC', 'O'),
  ('4', '하', 'VV', 'O'),
  ('5', '아야', 'EC', 'O'),
  ('6', '되', 'VV', 'O'),
  ('7', '어', 'EC', 'O')],
 [('1', '가락지빵', 'NNG', 'B_FOOD'),
  ('2', '하', 'VV', 

In [5]:
# 말뭉치 데이터에서 단어와 BIO 태그만 불러와 학습용 데이터셋 생성
sentences, tags = [], []
for t in corpus:
    tagged_sentence = []
    sentence, bio_tag = [], []
    for w in t:
        tagged_sentence.append((w[1], w[3]))
        sentence.append(w[1])
        bio_tag.append(w[3])

    sentences.append(sentence)
    tags.append(bio_tag)
sentences

[['가락지빵', '주문', '하', '고', '싶', '어요'],
 ['가락지빵', '먹', '고', '싶', '어요'],
 ['가락지빵', '하', '고', '싶', '어요'],
 ['가락지빵', '원', '하', '어요'],
 ['가락지빵', '어떻', '게', '하', '아야', '되', '요', '?'],
 ['가락지빵', '어떻', '게', '하', '아야', '되', '어'],
 ['가락지빵', '하', '고', '싶', '은데'],
 ['가락지빵', '알리', '어', '주', '어'],
 ['가락지빵', '가르치', '어', '주', '어'],
 ['가락지빵', '하', '고', '싶', '다'],
 ['가락지빵', '하', '고', '싶', '은데'],
 ['가락지빵', '하', 'ㄹ게요'],
 ['가락지빵', '돕', '아', '주', '어'],
 ['가락지빵', '하', 'ㄹ께', '요'],
 ['가락지빵', '도와주', '시', '어요'],
 ['가락지빵', '하', 'ㄹ', '수', '있', '나요', '?'],
 ['가락지빵', '가능', '한', '가요'],
 ['가락지빵', '문의', '드리', '어요'],
 ['가락지빵', '당일', '가능', '하', 'ㄴ', '가요'],
 ['가락지빵', '하', '아', '주', '시', '어요'],
 ['가락지빵', '하', '려구요'],
 ['가락지빵', '되', '었', '나요'],
 ['가락지빵', '되', '나요'],
 ['가락지빵', '하', 'ㄹ', '수', '있', '을까요', '?'],
 ['가락지빵', '싶', '어요'],
 ['가락지빵', '언제', '가능'],
 ['가락지빵', '언제', '가능', '하', 'ㄹ까요'],
 ['가락지빵', '하', 'ㄴ가요', '?'],
 ['가락지빵', '어떻', '게', '하', '나요'],
 ['가락지빵', '어떻', '게', '되', '나요'],
 ['가락지빵', '어떻', '게', '하', '나요'],
 ['가락지빵', '하'

In [6]:
tags

[['B_FOOD', 'O', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O'],
 ['B_FOOD', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 ['B_FOOD', 'O', 'O', 'O', 'O'],
 [

In [8]:
N = 100
print("샘플 크기 : \n", len(sentences))
print(f"{N}번째 샘플 단어 시퀀스 : \n", sentences[N])
print(f"{N}번째 샘플 bio 태그 : \n", tags[N])
print("샘플 단어 시퀀스 최대 길이 : ", max(len(l) for l in sentences))
print("샘플 단어 시퀀스 평균 길이 : ", (sum(map(len, sentences))/len(sentences)))

샘플 크기 : 
 61999
100번째 샘플 단어 시퀀스 : 
 ['가락지빵', '주문', '오케이', '?']
100번째 샘플 bio 태그 : 
 ['B_FOOD', 'O', 'O', 'O']
샘플 단어 시퀀스 최대 길이 :  168
샘플 단어 시퀀스 평균 길이 :  8.796238649010467


In [11]:
# 토크나이저 정의
tag_tokenizer = preprocessing.text.Tokenizer(lower=False) # 소문자 변환 안함
tag_tokenizer.fit_on_texts(tags)

In [12]:
tag_tokenizer.word_index

{'O': 1,
 'B_DT': 2,
 'B_FOOD': 3,
 'I': 4,
 'B_OG': 5,
 'B_PS': 6,
 'B_LC': 7,
 'NNP': 8,
 'B_TI': 9}

In [13]:
# 단어 사전 및 태그 사전 크기
vocab_size = len(p.word_index) + 1
print("단어 사전 크기 :", vocab_size)
tag_size = len(tag_tokenizer.word_index) + 1
print("BIO 태그 사전 크기 :", tag_size)  # 10

단어 사전 크기 : 17751
BIO 태그 사전 크기 : 10


In [14]:
# 학습용 단어 시퀀스 생성
x_train = [p.get_wordidx_sequence(sent) for sent in sentences]
x_train

[[1, 3, 2, 10, 11, 9],
 [1, 233, 10, 11, 9],
 [1, 2, 10, 11, 9],
 [1, 99, 2, 9],
 [1, 28, 22, 2, 50, 21, 18, 1],
 [1, 28, 22, 2, 50, 21, 12],
 [1, 2, 10, 11, 41],
 [1, 135, 12, 8, 12],
 [1, 145, 12, 8, 12],
 [1, 2, 10, 11, 42],
 [1, 2, 10, 11, 41],
 [1, 2, 57],
 [1, 227, 20, 8, 12],
 [1, 2, 69, 18],
 [1, 144, 4, 9],
 [1, 2, 16, 26, 25, 24, 1],
 [1, 6, 152, 37],
 [1, 120, 66, 9],
 [1, 95, 6, 2, 23, 37],
 [1, 2, 20, 8, 4, 9],
 [1, 2, 151],
 [1, 21, 54, 24],
 [1, 21, 24],
 [1, 2, 16, 26, 25, 113, 1],
 [1, 11, 9],
 [1, 117, 6],
 [1, 117, 6, 2, 48],
 [1, 2, 29, 1],
 [1, 28, 22, 2, 24],
 [1, 28, 22, 21, 24],
 [1, 28, 22, 2, 24],
 [1, 2, 33, 136],
 [1, 2, 102],
 [1, 2, 20, 8, 12],
 [1, 225],
 [1, 2, 50, 2, 36],
 [1, 6, 2, 48],
 [1, 224, 4, 184, 6, 2, 23, 37],
 [1, 27, 8, 49],
 [1, 95, 3],
 [1, 13, 3],
 [1, 14, 3, 2, 10, 11, 51, 1],
 [1, 14, 155, 53, 4, 115, 5, 30, 172, 74, 301, 43, 58, 51, 1, 3, 179, 66, 36],
 [1, 183, 314, 6, 2, 29, 1],
 [1, 14, 3, 2, 10, 11, 9],
 [1, 299, 303, 3],
 [1, 148,

In [15]:
y_train = tag_tokenizer.texts_to_sequences(tags)
y_train

[[3, 1, 1, 1, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1],
 [3, 1, 1, 1, 1, 1, 1, 1],
 [3, 1, 1, 1, 1, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1],
 [3, 1, 1, 1],
 [3, 1, 1, 1, 1, 1, 1],
 [3, 1, 1, 1],
 [3, 1, 1, 1],
 [3, 1, 1, 1, 1, 1],
 [3, 1, 1, 1, 1, 1],
 [3, 1, 1],
 [3, 1, 1, 1],
 [3, 1, 1],
 [3, 1, 1, 1, 1, 1, 1],
 [3, 1, 1],
 [3, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1],
 [3, 1, 1],
 [3, 1, 1, 1, 1],
 [3, 1],
 [3, 1, 1, 1, 1],
 [3, 1, 1, 1],
 [3, 1, 1, 1, 1, 1, 1, 1],
 [3, 1, 8, 1],
 [3, 1, 1],
 [3, 1, 1],
 [3, 1, 1, 1, 1, 1, 1, 1],
 [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [3, 1, 1, 1, 1, 1, 1],
 [3, 1, 1, 1, 1, 1, 1],
 [3, 1, 1, 1],
 [3, 1, 1, 1, 1, 1, 1, 1, 1],
 [3, 1, 1, 1, 1, 1, 1],
 [3, 1, 1],
 [3, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [3, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [3, 1, 1, 1, 1, 1],
 

In [17]:
index_to_ner = tag_tokenizer.index_word
index_to_ner[0] ='PAD'
index_to_ner

{1: 'O',
 2: 'B_DT',
 3: 'B_FOOD',
 4: 'I',
 5: 'B_OG',
 6: 'B_PS',
 7: 'B_LC',
 8: 'NNP',
 9: 'B_TI',
 0: 'PAD'}

In [18]:
# 시퀀스 패딩 처리
max_len = 40
x_train = preprocessing.sequence.pad_sequences(x_train, padding='post', maxlen=max_len)
y_train = preprocessing.sequence.pad_sequences(y_train, padding='post', maxlen=max_len)
x_train.shape, x_train

((61999, 40),
 array([[  1,   3,   2, ...,   0,   0,   0],
        [  1, 233,  10, ...,   0,   0,   0],
        [  1,   2,  10, ...,   0,   0,   0],
        ...,
        [  1,   1,   1, ...,   0,   0,   0],
        [  1,   1,   1, ...,   0,   0,   0],
        [  1,   1, 360, ...,   0,   0,   0]]))

In [19]:
y_train.shape, y_train

((61999, 40),
 array([[3, 1, 1, ..., 0, 0, 0],
        [3, 1, 1, ..., 0, 0, 0],
        [3, 1, 1, ..., 0, 0, 0],
        ...,
        [2, 2, 3, ..., 0, 0, 0],
        [2, 2, 3, ..., 0, 0, 0],
        [2, 2, 3, ..., 0, 0, 0]]))

In [20]:
# 학습데이터와 테스트 데이터를 8:2 비율로 분리
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=.2,
                                                    random_state=1234)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((49599, 40), (12400, 40), (49599, 40), (12400, 40))

In [21]:
## 출력 데이터 원-핫 인코딩
y_train = tf.keras.utils.to_categorical(y_train, num_classes=tag_size)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=tag_size)
y_train.shape, y_train

((49599, 40, 10),
 array([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 1., 0., ..., 0., 0., 0.],
         [0., 1., 0., ..., 0., 0., 0.],
         ...,
         [1., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 1., 0., ..., 0., 0., 0.],
         [0., 1., 0., ..., 0., 0., 0.],
         ...,
         [1., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 1., ..., 0., 0., 0.],
         [0., 0., 1., ..., 0., 0., 0.],
         [0., 0., 1., ..., 0., 0., 0.],
         ...,
         [1., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.]],
 
        ...,
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 1., 0., ..., 0., 0., 0.],
         [0., 1., 0., ..., 0., 0., 0.],
         ...,
         [1., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., 

In [22]:
y_test.shape

(12400, 40, 10)

In [24]:
print("학습 샘플 시퀀스 형상 : ", x_train.shape)
print("학습 샘플 레이블 형상(원핫) : ", y_train.shape)
print("테스트 샘플 시퀀스 형상 : ", x_test.shape)
print("테스트 샘플 레이블 형상(원핫) : ", y_test.shape)

학습 샘플 시퀀스 형상 :  (49599, 40)
학습 샘플 레이블 형상(원핫) :  (49599, 40, 10)
테스트 샘플 시퀀스 형상 :  (12400, 40)
테스트 샘플 레이블 형상(원핫) :  (12400, 40, 10)


In [25]:
## 모델 정의
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

In [26]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=30, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25)))
model.add(TimeDistributed(Dense(tag_size, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.01), metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 30)            532530    
_________________________________________________________________
bidirectional (Bidirectional (None, 40, 400)           369600    
_________________________________________________________________
time_distributed (TimeDistri (None, 40, 10)            4010      
Total params: 906,140
Trainable params: 906,140
Non-trainable params: 0
_________________________________________________________________


In [27]:
%%time
# 388/388 [==============================] - 136s 352ms/step - loss: 0.0027 - accuracy: 0.9957
# CPU times: total: 2h 4min 46s
# Wall time: 23min 55s
model.fit(x_train, y_train, batch_size=128, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: total: 2h 6min 42s
Wall time: 27min 14s


<tensorflow.python.keras.callbacks.History at 0x27008c3edc0>

In [28]:
print("평가 결과 : ", model.evaluate(x_test, y_test)[1])
model.save('ner_model.h5')

평가 결과 :  0.9863013625144958


In [29]:
# 시퀀스를 NER 태그로 변환
def sequences_to_tag(sequences):  # 예측값을 index_to_ner을 사용하여 태깅정보로 변경하는 함수
    result = []
    for sequence in sequences:  # 전체 시퀀스로부터 시퀀스를 하나 꺼낸다.
        temp = []
        for pred in sequence:   # 시퀀스로부터 예측값을 하나씩 꺼낸다.
            pred_index = np.argmax(pred)
            temp.append(index_to_ner[pred_index].replace("PAD", "O")) # 'PAD'는 'O'로 변경
        result.append(temp)
    return result

In [30]:
# F1 스코어 계산을 위해 사용
from seqeval.metrics import f1_score, classification_report

# 테스트 데이터셋의 NER 예측
y_predicted = model.predict(x_test)
y_predicted.shape, y_predicted

((12400, 40, 10),
 array([[[6.74847115e-07, 2.44787754e-03, 1.15410576e-05, ...,
          8.63708496e-01, 1.67204362e-07, 5.75588963e-08],
         [2.85335915e-08, 9.56277311e-01, 1.61579665e-05, ...,
          1.35823757e-05, 3.16858262e-07, 9.72309806e-08],
         [6.42953850e-08, 9.98359263e-01, 6.89022181e-06, ...,
          1.54132940e-05, 1.17588954e-07, 6.08342702e-07],
         ...,
         [3.08556309e-08, 9.99968171e-01, 1.82839121e-05, ...,
          2.78052767e-06, 1.74631332e-08, 6.93520235e-07],
         [1.99969818e-08, 9.99940395e-01, 1.21227649e-05, ...,
          7.07269737e-06, 3.62432999e-08, 1.99298995e-07],
         [9.60663855e-02, 1.49774432e-01, 9.25184786e-02, ...,
          9.35657173e-02, 9.20851380e-02, 9.14699361e-02]],
 
        [[2.34287290e-06, 9.99172211e-01, 1.23490699e-05, ...,
          8.56863480e-05, 8.52910148e-07, 2.30052865e-06],
         [3.96092794e-08, 9.99948859e-01, 2.77551044e-05, ...,
          9.63516231e-06, 2.03419912e-08, 2.3237

In [31]:
pred_tags = sequences_to_tag(y_predicted)  # 예측된 NER
test_tags = sequences_to_tag(y_test)       # 실제 NER
print(pred_tags)  # 예측

[['B_LC', 'O', 'O', 'O', 'B_DT', 'I', 'I', 'B_TI', 'I', 'I', 'I', 'I', 'B_LC', 'I', 'I', 'B_LC', 'O', 'B_LC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_DT', 'I', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B_FOOD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B_DT', 'B_DT', 'B_DT', 'B_DT', 'B_FOOD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B_FOOD', 'O', 'NNP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [32]:
print(test_tags) # 실제

[['B_OG', 'I', 'I', 'O', 'B_DT', 'I', 'I', 'B_TI', 'I', 'I', 'I', 'I', 'B_LC', 'I', 'I', 'B_LC', 'B_LC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_DT', 'I', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B_OG', 'O', 'B_OG', 'O', 'O', 'O', 'O', 'B_OG', 'O', 'O', 'O', 'O', 'O', 'O', 'B_OG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B_FOOD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B_DT', 'B_DT', 'B_DT', 'B_DT', 'B_FOOD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B_FOOD', 'O', 'NNP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [33]:
# F1 평가 결과
print(classification_report(test_tags, pred_tags))
print("F1-score: {:.1%}".format(f1_score(test_tags, pred_tags)))



              precision    recall  f1-score   support

          NP       1.00      1.00      1.00       303
           _       0.55      0.55      0.55       658
         _DT       1.00      1.00      1.00     13683
       _FOOD       1.00      1.00      1.00     11655
         _LC       0.76      0.56      0.65       314
         _OG       0.49      0.60      0.54       460
         _PS       0.73      0.50      0.60       396
         _TI       0.79      0.72      0.75        61

   micro avg       0.97      0.97      0.97     27530
   macro avg       0.79      0.74      0.76     27530
weighted avg       0.97      0.97      0.97     27530

F1-score: 97.0%
