In [1]:
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, TimeDistributed
from tensorflow.keras.optimizers import Adam

nltk.download('treebank')
nltk.download('universal_tagset')


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [2]:
# 토큰화에 품사 태깅이 된 데이터 받아오기
tagged_sentences = nltk.corpus.treebank.tagged_sents(tagset='universal')
print("품사 태깅이 된 문장 개수: ", len(tagged_sentences))

# 문장과 태그를 분리
sentences, sentence_tags = [], []

for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(list(sentence))
    sentence_tags.append(list(tags))

# 단어와 태그에 대한 인덱스화
words = [word.lower() for sentence in sentences for word in sentence]
tags = [tag for tag_seq in sentence_tags for tag in tag_seq]

word2idx = {w: i + 2 for i, w in enumerate(list(set(words)))}
word2idx['PAD'] = 0  # 패딩용 인덱스
word2idx['OOV'] = 1  # 사전에 없는 단어(Out-Of-Vocabulary) 용 인덱스
tag2idx = {t: i + 1 for i, t in enumerate(list(set(tags)))}
tag2idx['PAD'] = 0  # 패딩용 인덱스

idx2word = {i: w for w, i in word2idx.items()}
idx2tag = {i: t for t, i in tag2idx.items()}

# 문장을 정수 인덱스로 변환
X = [[word2idx.get(w.lower(), word2idx['OOV']) for w in s] for s in sentences]
y = [[tag2idx[t] for t in ts] for ts in sentence_tags]

# 패딩 추가
max_len = 100
X = pad_sequences(X, maxlen=max_len, padding='post')
y = pad_sequences(y, maxlen=max_len, padding='post')

# 출력 값은 one-hot 인코딩
y = [to_categorical(i, num_classes=len(tag2idx)) for i in y]

# 학습 및 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


품사 태깅이 된 문장 개수:  3914


In [None]:
embedding_dim = 128
hidden_units = 64
vocab_size = len(word2idx)
tag_size = len(tag2idx)

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(units=hidden_units, return_sequences=True)))
model.add(TimeDistributed(Dense(tag_size, activation='softmax')))

# model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.0005), metrics=['accuracy'])

model.summary()



In [None]:
import tensorflow as tf

# # Eager Execution 활성화
# tf.config.run_functions_eagerly(True)  # 디버깅 시에만 활성활

# 모델 학습
history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=20, validation_split=0.1, verbose=1)





Epoch 1/20




[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 355ms/step - accuracy: 0.1310 - loss: 2.1506 - val_accuracy: 0.2030 - val_loss: 0.8247
Epoch 2/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 353ms/step - accuracy: 0.2116 - loss: 0.6032 - val_accuracy: 0.2416 - val_loss: 0.2588
Epoch 3/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 349ms/step - accuracy: 0.2481 - loss: 0.1698 - val_accuracy: 0.2476 - val_loss: 0.1755
Epoch 4/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 342ms/step - accuracy: 0.2496 - loss: 0.0857 - val_accuracy: 0.2485 - val_loss: 0.1559
Epoch 5/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 351ms/step - accuracy: 0.2510 - loss: 0.0595 - val_accuracy: 0.2485 - val_loss: 0.1595
Epoch 6/20
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 441ms/step - accuracy: 0.2487 - loss: 0.0484 - val_accuracy: 0.2495 - val_loss: 0.1472
Epoch 7/20
[1m89/89[0m [32m━━━

In [5]:
def predict_sentence(sentence):
    words = sentence.split()
    x_test = pad_sequences([[word2idx.get(w.lower(), word2idx['OOV']) for w in words]], maxlen=max_len, padding='post')
    y_pred = model.predict(x_test)
    y_pred = np.argmax(y_pred, axis=-1)
    tags = [idx2tag[i] for i in y_pred[0] if i != 0]  # PAD 값은 제외
    return list(zip(words, tags))

test_sentence = "This is a simple test sentence."
print(predict_sentence(test_sentence))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
[('This', 'DET'), ('is', 'VERB'), ('a', 'DET'), ('simple', 'ADJ'), ('test', 'NOUN'), ('sentence.', 'NOUN')]
