In [17]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [18]:
def read_file(file_name):
    sents = []
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for idx, l in enumerate(lines):
            if l[0] == ';' and lines[idx + 1][0] == '$':
                this_sent = [] 
            elif l[0] == '$' and lines[idx - 1][0] == ';':
                continue
            elif l[0] == '\n':
                sents.append(this_sent)
            else: this_sent.append(tuple(l.split()))
    return sents

In [19]:
corpus = read_file('./train.txt')

In [20]:
corpus

[[('1', '한편', 'NNG', 'O'),
  ('1', ',', 'SP', 'O'),
  ('2', 'AFC', 'SL', 'O'),
  ('2', '챔피언스', 'NNG', 'O'),
  ('2', '리그', 'NNG', 'O'),
  ('3', 'E', 'SL', 'B_OG'),
  ('3', '조', 'NNG', 'I'),
  ('3', '에', 'JKB', 'O'),
  ('4', '속하', 'VV', 'O'),
  ('4', 'ㄴ', 'ETM', 'O'),
  ('5', '포항', 'NNP', 'O'),
  ('6', '역시', 'MAJ', 'O'),
  ('7', '대회', 'NNG', 'O'),
  ('8', '8강', 'NNG', 'O'),
  ('9', '진출', 'NNG', 'O'),
  ('9', '이', 'JKS', 'O'),
  ('10', '불투명', 'NNG', 'O'),
  ('10', '하', 'VV', 'O'),
  ('10', '다', 'EC', 'O'),
  ('11', '.', 'SF', 'O')],
 [('1', '2003', 'SN', 'B_DT'),
  ('1', '년', 'NNB', 'I'),
  ('2', '6', 'SN', 'I'),
  ('2', '월', 'NNB', 'I'),
  ('3', '14', 'SN', 'I'),
  ('3', '일', 'NNB', 'I'),
  ('4', '사직', 'NNG', 'O'),
  ('5', '두산', 'NNP', 'O'),
  ('5', '전', 'NNG', 'O'),
  ('6', '이후', 'NNG', 'O'),
  ('7', '박명환', 'NNP', 'B_PS'),
  ('7', '에게', 'JKB', 'O'),
  ('8', '당하', 'VV', 'O'),
  ('8', '았', 'EP', 'O'),
  ('8', '던', 'ETM', 'O'),
  ('9', '10', 'SN', 'O'),
  ('9', '연패', 'NNG', 'O'),
  ('10', 

In [21]:
sentences, tags = [], []
for t in corpus:
    tagged_sentence = []
    sentence, bio_tag = [], []
    for w in t:
        tagged_sentence.append((w[1], w[3]))
        sentence.append(w[1])
        bio_tag.append(w[3])
    sentences.append(sentence)
    tags.append(bio_tag)

print(f'sample size : {len(sentences)}')
print(f'sample sentence 0: {sentences[0]}')
print(f'sample tags 0: {tags[0]}')
print(f'max length of sample sentence : {max(len(l) for l in sentences)}')
print(f'avg length of sample sentence : {np.mean([len(l) for l in sentences])}')

sample size : 3555
sample sentence 0: ['한편', ',', 'AFC', '챔피언스', '리그', 'E', '조', '에', '속하', 'ㄴ', '포항', '역시', '대회', '8강', '진출', '이', '불투명', '하', '다', '.']
sample tags 0: ['O', 'O', 'O', 'O', 'O', 'B_OG', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
max length of sample sentence : 168
avg length of sample sentence : 34.03909985935302


In [22]:
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict

In [23]:
word_index = defaultdict(lambda: len(word_index))
word_index['OOV'] = 0

for s in sentences:
    for w in s:
        word_index[w.lower()]

tag_index = defaultdict(lambda: len(tag_index))
tag_index['OOV'] = 0
for t in tags:
    for w in t:
        tag_index[w]

print(f'vocab size : {len(word_index)}')
print(f'tag size : {len(tag_index)}')
print(f'word index : {word_index}')
print(f'tag index : {tag_index}')

vocab size : 13833
tag size : 8
word index : defaultdict(<function <lambda> at 0x0000029147EA8D30>, {'OOV': 0, '한편': 1, ',': 2, 'afc': 3, '챔피언스': 4, '리그': 5, 'e': 6, '조': 7, '에': 8, '속하': 9, 'ㄴ': 10, '포항': 11, '역시': 12, '대회': 13, '8강': 14, '진출': 15, '이': 16, '불투명': 17, '하': 18, '다': 19, '.': 20, '2003': 21, '년': 22, '6': 23, '월': 24, '14': 25, '일': 26, '사직': 27, '두산': 28, '전': 29, '이후': 30, '박명환': 31, '에게': 32, '당하': 33, '았': 34, '던': 35, '10': 36, '연패': 37, '사슬': 38, '을': 39, '거의': 40, '5': 41, '만': 42, '끊': 43, '는': 44, '의미': 45, '있': 46, '승리': 47, '었': 48, 'ap': 49, '통신': 50, '은': 51, '8': 52, '(': 53, '이하': 54, '한국': 55, '시간': 56, ')': 57, '올라주원': 58, '유잉': 59, '비롯': 60, '아': 61, '애드리언': 62, '댄틀리': 63, '팻': 64, '라일리': 65, '감독': 66, '캐시': 67, '러시': 68, 'tv': 69, '해설가': 70, '딕': 71, '바이텔': 72, '디트로이트': 73, '피스톤스': 74, '의': 75, '구단주': 76, '윌리엄': 77, '데이비드슨': 78, '등': 79, '2008': 80, '명예': 81, '전당': 82, '헌액': 83, '자': 84, '로': 85, '결정': 86, '되': 87, '다고': 88, '보': 89, '아도': 90, '개막': 9

### Train-Test Split

In [29]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

In [25]:
x_train = [[word_index[w.lower()] for w in s] for s in sentences]
y_train = [[tag_index[t] for t in t] for t in tags]
print(f'x_train : {x_train[0]}')
print(f'y_train : {y_train[0]}')

x_train : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
y_train : [1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [26]:
index_to_word = {v: k for k, v in word_index.items()}
index_to_tag = {v: k for k, v in tag_index.items()}
index_to_tag[0] = 'PAD'

max_len = 40
x_train = pad_sequence([torch.tensor(x) for x in x_train], batch_first=True, padding_value=word_index['OOV'])
y_train = pad_sequence([torch.tensor(y) for y in y_train], batch_first=True, padding_value=tag_index['OOV'])

if x_train.size(1) > max_len:
    x_train = x_train[:, :max_len]
    y_train = y_train[:, :max_len]

print(f'x_train : {x_train[0]}')
print(f'y_train : {y_train[0]}')

x_train : tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0])
y_train : tensor([1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [27]:
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=.2, random_state=0)

### One-Hot Encoding

In [28]:
tag_size = len(tag_index)
y_train = F.one_hot(y_train, num_classes=tag_size).float()
y_test = F.one_hot(y_test, num_classes=tag_size).float()

print(f'x_train : {x_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'x_test : {x_test.shape}')
print(f'y_test : {y_test.shape}')

x_train : torch.Size([2844, 40])
y_train : torch.Size([2844, 40, 8])
x_test : torch.Size([711, 40])
y_test : torch.Size([711, 40, 8])


In [30]:
class BiLSTMNER(nn.Module):
    def __init__(self, vocab_size, tag_size, embedding_dim, hidden_dim):
        super(BiLSTMNER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True)
        self.fc = nn.Linear(hidden_dim, tag_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

In [31]:
model = BiLSTMNER(len(word_index), len(tag_index), 100, 128)
print(model)

BiLSTMNER(
  (embedding): Embedding(13833, 128)
  (lstm): LSTM(128, 128, bidirectional=True)
  (fc): Linear(in_features=256, out_features=8, bias=True)
)


In [32]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

EPOCHS = 10
BATCH_SIZE = 32

for epoch in range(EPOCHS):
    for i in range(0, len(x_train), BATCH_SIZE):
        x_batch = x_train[i:i + BATCH_SIZE]
        y_batch = y_train[i:i + BATCH_SIZE]

        output = model(x_batch)
        loss = criterion(output.view(-1, len(tag_index)), y_batch.view(-1, len(tag_index)))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch {epoch + 1}/{EPOCHS}, Loss : {loss.item():.4f}')

Epoch 1/10, Loss : 0.0712
Epoch 2/10, Loss : 0.0508
Epoch 3/10, Loss : 0.0387
Epoch 4/10, Loss : 0.0296
Epoch 5/10, Loss : 0.0234
Epoch 6/10, Loss : 0.0198
Epoch 7/10, Loss : 0.0181
Epoch 8/10, Loss : 0.0155
Epoch 9/10, Loss : 0.0123
Epoch 10/10, Loss : 0.0079


### Test

In [36]:
model.eval()
with torch.no_grad():
    output = model(x_test)
    output = output.view(-1, len(tag_index))
    y_test = y_test.view(-1, len(tag_index))
    loss = criterion(output, y_test)
    print(f'Test Loss : {loss.item():.4f}')

Test Loss : 0.0896


In [37]:
from sklearn.metrics import f1_score, classification_report

print(classification_report(y_test.argmax(dim=1).numpy(), output.argmax(dim=1).numpy()))
print(f1_score(y_test.argmax(dim=1).numpy(), output.argmax(dim=1).numpy(), average='macro'))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8445
           1       0.93      0.94      0.94     16722
           2       0.51      0.54      0.52       625
           3       0.53      0.48      0.50      1358
           4       0.59      0.58      0.58       390
           5       0.46      0.45      0.46       455
           6       0.60      0.54      0.57       376
           7       0.84      0.46      0.60        69

    accuracy                           0.91     28440
   macro avg       0.68      0.62      0.65     28440
weighted avg       0.91      0.91      0.91     28440

0.6462836757438168
