In [1]:
# 필요한 모듈 임포트
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
# 학습 파일 불러오기
def read_file(file_name):
    sents = []
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for idx, l in enumerate(lines):
            if l[0] == ';' and lines[idx + 1][0] == '$':
                this_sent = []
            elif l[0] == '$' and lines[idx - 1][0] == ';':
                continue
            elif l[0] == '\n':
                sents.append(this_sent)
            else:
                this_sent.append(tuple(l.split()))
    return sents

In [3]:
# 데이터 불러오기
corpus = read_file('./train.txt')

In [4]:
# 말뭉치 데이터에서 단어와 BIO 태그만 불러와 학습용 데이터 생성
sentences, tags = [], []
for t in corpus:
    tagged_sentence = []
    sentence, bio_tag = [], []
    for w in t:
        tagged_sentence.append((w[1], w[3]))
        sentence.append(w[1])
        bio_tag.append(w[3])
    sentences.append(sentence)
    tags.append(bio_tag)
print("샘플 크기: \n", len(sentences))
print("0번째 샘플 문장 시퀀스: \n", sentences[0])
print("0번째 샘플 bio 태그: \n", bio_tag[0])
print("샘플 문장 시퀀스 최대 길이: \n", max(len(l) for l in sentences))
print("샘플 문장 시퀀스 평균 길이: \n", (sum(map(len, sentences))/len(sentences)))

샘플 크기: 
 3555
0번째 샘플 문장 시퀀스: 
 ['한편', ',', 'AFC', '챔피언스', '리그', 'E', '조', '에', '속하', 'ㄴ', '포항', '역시', '대회', '8강', '진출', '이', '불투명', '하', '다', '.']
0번째 샘플 bio 태그: 
 O
샘플 문장 시퀀스 최대 길이: 
 168
샘플 문장 시퀀스 평균 길이: 
 34.03909985935302


In [5]:
# 토크나이저 정의
sent_tokenizer = preprocessing.text.Tokenizer(oov_token='OOV') # 첫 번째 인덱스에 OOV tkdyd
sent_tokenizer.fit_on_texts(sentences)
tag_tokenizer = preprocessing.text.Tokenizer(lower=False) # 태그 정보는 lower=Falser 소문자로 변환 X
tag_tokenizer.fit_on_texts(tags)

In [6]:
# 단어 사전 및 태그 사전 크기
vocab_size = len(sent_tokenizer.word_index) + 1
tag_size = len(tag_tokenizer.word_index) + 1
print("BIO 태그 사전 크기: ", tag_size)
print("단어 사전 크기: ", vocab_size)

BIO 태그 사전 크기:  8
단어 사전 크기:  13834


In [7]:
# 학습용 단어 시퀀스 생성
x_train = sent_tokenizer.texts_to_sequences(sentences)
y_train = sent_tokenizer.texts_to_sequences(tags)
print(x_train[0])
print(y_train[0])

[183, 11, 4276, 884, 162, 931, 402, 10, 2608, 7, 1516, 608, 145, 1361, 414, 4, 6347, 2, 8, 3]
[3632, 3632, 3632, 3632, 3632, 1, 1760, 3632, 3632, 3632, 3632, 3632, 3632, 3632, 3632, 3632, 3632, 3632, 3632, 3632]


In [8]:
# index to word / index to NER 정의
index_to_word = sent_tokenizer.index_word
index_to_ner = tag_tokenizer.index_word
index_to_ner[0] = 'PAD'

In [9]:
# 시퀀스 패딩 처리
max_len = 40
x_train = preprocessing.sequence.pad_sequences(x_train, padding='post', maxlen=max_len)
y_train = preprocessing.sequence.pad_sequences(y_train, padding='post', maxlen=max_len)

In [10]:
# 학습 데이터와 테스트 데이터를 8:2 비율로 분리
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=.2, random_state=0)

In [11]:
# 출력 데이터를 원-핫 인코딩
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)
print("학습 샘플 시퀀스 형상: ", x_train.shape)
print("학습 샘플 헤이블 형상: ", y_train.shape)
print("테스트 샘플 시퀀스 형상: ", x_test.shape)
print("테스트 샘플 시퀀스 형상: ", y_test.shape)

학습 샘플 시퀀스 형상:  (2844, 40)
학습 샘플 헤이블 형상:  (2844, 40, 3633)
테스트 샘플 시퀀스 형상:  (711, 40)
테스트 샘플 시퀀스 형상:  (711, 40, 3633)


In [19]:
# 모델 정의(Bi-LSTM)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=30, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25)))
model.add(TimeDistributed(Dense(tag_size, activation='softmax')))
model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss='sparse_categorical_crossentropy', 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
model.fit(x_train, y_train, batch_size=128, epochs=10)

print("평가 결과: ", model.evaluate(x_test, y_test)[1])

Epoch 1/10


ValueError: in user code:

    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:855 train_function  *
        return step_function(self, iterator)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:845 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:838 run_step  **
        outputs = model.train_step(data)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:800 train_step
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:460 update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/keras/utils/metrics_utils.py:86 decorated
        update_op = update_state_fn(*args, **kwargs)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py:177 update_state_fn
        return ag_update_state(*args, **kwargs)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py:664 update_state  **
        matches = ag_fn(y_true, y_pred, **self._fn_kwargs)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py:3477 sparse_categorical_accuracy
        y_true = array_ops.squeeze(y_true, [-1])
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/util/deprecation.py:535 new_func
        return func(*args, **kwargs)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:4471 squeeze
        return gen_array_ops.squeeze(input, axis, name)
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/ops/gen_array_ops.py:10192 squeeze
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/framework/op_def_library.py:748 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py:599 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:3557 _create_op_internal
        ret = Operation(
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:2041 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    /Users/jin-yulim/miniforge3/envs/tf25/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:1883 _create_c_op
        raise ValueError(str(e))

    ValueError: Can not squeeze dim[2], expected a dimension of 1, got 3633 for '{{node Squeeze}} = Squeeze[T=DT_FLOAT, squeeze_dims=[-1]](IteratorGetNext:1)' with input shapes: [?,40,3633].
