In [1]:
pip install seqeval==1.0.0

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


## Gazetteer Model
- 2~10그램 단위(총 9종류)로 가제티어(gazetteer) 벡터를 매겨봄
- 따라서 기존 임베딩에 7 * 9 = 63차원 벡터를 추가

In [2]:
from config import GazetteModelConfig

# Build the run configuration, overriding the epoch count, learning rate and dropout.
config = GazetteModelConfig(epoch=30, lr=0.003, dropout=0.1)
# Gazetteer feature tool (index 0); load_data() calls its
# transform_end2end_complex_ngrams() method for every sentence.
# NOTE(review): presumably this instantiates the first entry of config['features'] — confirm in config.py.
gz = config.feature_tool(0)

# Show the effective configuration for this run.
print(config)

{'mode': 'train', 'root_dir': '.', 'train_file': 'ner_train.txt', 'dev_file': 'ner_dev.txt', 'word_vocab_file': 'vocab/word_vocab.txt', 'tag_vocab_file': 'vocab/tag_vocab.txt', 'trained_model_name': 'epoch_5.pt', 'output_dir_path': './output', 'word_vocab_size': 2160, 'number_of_tags': 14, 'hidden_size': 200, 'dropout': 0.1, 'embedding_size': 200, 'max_length': 150, 'batch_size': 64, 'epoch': 30, 'features': ['feature.gazetteer.GazetteFeature'], 'lr': 0.003, 'gazette_feature_length': 7, 'ngrams': [2, 3, 4, 5, 6, 7, 8, 9, 10]}


In [3]:
import os, sys
import torch
import torch.nn as nn
from torchcrf import CRF
from seqeval.metrics import classification_report

class RNN_CRF(nn.Module):
    """Syllable-level tagger: embedding + gazetteer features -> 2-layer Bi-GRU -> linear -> CRF.

    Args:
        config: mapping read for the keys "word_vocab_size", "embedding_size",
            "hidden_size", "number_of_tags", "gazette_feature_length",
            "ngrams" and "dropout".
    """

    def __init__(self, config):
        super().__init__()

        # Total number of syllables in the vocabulary.
        self.eumjeol_vocab_size = config["word_vocab_size"]
        # Dimension of each syllable embedding.
        self.embedding_size = config["embedding_size"]
        # Hidden size of the GRU (per direction).
        self.hidden_size = config["hidden_size"]
        # Number of tags to classify into.
        self.number_of_tags = config["number_of_tags"]

        # The GRU input is the syllable embedding concatenated with one
        # gazetteer feature vector per configured n-gram size.
        gazette_width = config["gazette_feature_length"] * len(config["ngrams"])
        self.gru_input_size = self.embedding_size + gazette_width

        # Lookup table turning each syllable index into its embedding vector;
        # index 0 is the padding index.
        self.embedding = nn.Embedding(
            num_embeddings=self.eumjeol_vocab_size,
            embedding_dim=self.embedding_size,
            padding_idx=0,
        )

        self.dropout = nn.Dropout(config["dropout"])

        # Two stacked bidirectional GRU layers.
        self.bi_gru = nn.GRU(
            input_size=self.gru_input_size,
            hidden_size=self.hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )

        # CRF layer over the tag set.
        self.crf = CRF(num_tags=self.number_of_tags, batch_first=True)

        # Projects the GRU output down to one score per tag:
        # (batch_size, max_length, hidden_size*2) -> (batch_size, max_length, number_of_tags)
        self.hidden2num_tag = nn.Linear(
            in_features=self.hidden_size * 2,
            out_features=self.number_of_tags,
        )

    def forward(self, inputs, gazettes, labels=None):
        """Compute the training loss or decode tag sequences.

        Args:
            inputs: syllable indices, (batch_size, max_length).
            gazettes: gazetteer features concatenated to the embeddings along
                the last dimension.
            labels: gold tag indices; when given, the loss is returned.

        Returns:
            With ``labels``: the CRF log-likelihood (reduction "mean")
            multiplied by -1. Without ``labels``: the result of
            ``self.crf.decode`` on the emission scores.
        """
        # (batch_size, max_length) -> (batch_size, max_length, embedding_size)
        embedded = self.embedding(inputs)
        gru_inputs = torch.cat([embedded, gazettes], dim=-1)

        # (batch_size, curr_max_length, hidden_size*2); the final hidden state is unused.
        gru_outputs, _ = self.bi_gru(gru_inputs)

        # (batch_size, curr_max_length, hidden_size*2) -> (batch_size, curr_max_length, number_of_tags)
        emissions = self.hidden2num_tag(self.dropout(gru_outputs))

        if labels is None:
            return self.crf.decode(emissions=emissions)

        log_likelihood = self.crf(emissions=emissions, tags=labels, reduction="mean")
        return log_likelihood * -1.0

In [4]:
from preprocess.replacement import replace_digit
from tqdm import tqdm
import numpy as np


def load_vocab(f_name):
    """Load a vocabulary file (one symbol per line) into two lookup tables.

    The path is resolved against the module-level ``config['root_dir']``.
    Indices 0 and 1 are reserved for "<PAD>" and "<UNK>"; symbols from the
    file are numbered from 2 upwards in file order.

    Args:
        f_name: vocabulary file path relative to ``config['root_dir']``.

    Returns:
        A ``(symbol2idx, idx2symbol)`` tuple of dicts.
    """
    # The context manager guarantees the handle is closed, even if reading fails.
    with open(os.path.join(config['root_dir'], f_name), 'r', encoding='utf8') as vocab_file:
        print("{} vocab file loading...".format(f_name))
        lines = vocab_file.readlines()

    # Dictionaries pre-populated with the default symbols.
    symbol2idx, idx2symbol = {"<PAD>": 0, "<UNK>": 1}, {0: "<PAD>", 1: "<UNK>"}

    # File symbols start right after the default entries.
    for index, line in enumerate(tqdm(lines), start=len(symbol2idx)):
        symbol = line.strip()
        symbol2idx[symbol] = index
        idx2symbol[index] = symbol

    return symbol2idx, idx2symbol

def convert_data2feature(data, symbol2idx, max_length=None):
    """Convert a whitespace-separated symbol string into a fixed-length index vector.

    Args:
        data: string of symbols separated by whitespace.
        symbol2idx: dict mapping symbol -> index; must contain "<UNK>" if any
            symbol in ``data`` is missing from it.
        max_length: length of the returned vector. Longer inputs are
            truncated, shorter ones are right-padded with 0. When ``None``,
            the vector is exactly as long as the number of symbols.

    Returns:
        1-D ``numpy.ndarray`` of dtype int64 with ``max_length`` entries.
    """
    words = data.split()
    if max_length is None:
        max_length = len(words)

    # np.int was removed in NumPy 1.24; int64 also matches the torch.long
    # tensors these features are later converted to.
    feature = np.zeros(shape=(max_length,), dtype=np.int64)

    for idx, word in enumerate(words[:max_length]):
        # Unknown symbols fall back to the "<UNK>" index.
        feature[idx] = symbol2idx[word] if word in symbol2idx else symbol2idx["<UNK>"]
    return feature

# 파라미터로 입력받은 파일로부터 tensor객체 생성
def load_data(config, f_name, word2idx, tag2idx):
    """Read a tab-separated data file and build the input, gazetteer and tag tensors.

    Each non-blank line is ``id \\t sentence \\t tags`` or, for unlabeled
    data, ``id \\t sentence``. Example of a labeled line (without the id):
        세 종 대 왕 은 <SP> 조 선 의 ... \\t B_PS I_PS I_PS I_PS O <SP> B_LC I_LC O ...
    Every line is first passed through ``replace_digit``. Gazetteer features
    come from the module-level ``gz`` feature tool.

    Args:
        config: mapping read for "root_dir", "max_length" and "ngrams".
        f_name: data file path relative to ``config['root_dir']``.
        word2idx: symbol -> index mapping for sentence tokens.
        tag2idx: symbol -> index mapping for tags.

    Returns:
        ``(indexing_inputs, gazettes, indexing_tags)`` where the first and
        last are ``torch.long`` tensors and ``gazettes`` is ``torch.float``.
        Lines without a tag column get an all-zero tag row.

    Raises:
        ValueError: if a non-blank line has neither 2 nor 3 tab-separated fields.
    """
    # Read everything up front so the file handle is closed before the slow
    # feature extraction starts.
    with open(os.path.join(config['root_dir'], f_name), 'r', encoding='utf8') as file:
        lines = file.readlines()

    # Per-sentence features collected for the returned tensors.
    indexing_inputs, indexing_tags = [], []
    gazettes = []

    print("{} file loading...".format(f_name))

    for line_no, line in enumerate(tqdm(lines), start=1):
        line = replace_digit(line)
        fields = line.strip().split('\t')
        if fields == ['']:
            # Blank line (e.g. trailing newline at end of file): nothing to load.
            continue

        if len(fields) == 3:
            _, sentence, tags = fields
        elif len(fields) == 2:
            # Unlabeled line: use an empty tag string so the tag row is all
            # zeros instead of silently reusing the previous line's tags.
            _, sentence = fields
            tags = ""
        else:
            raise ValueError(
                "{}:{}: expected 2 or 3 tab-separated fields, got {}".format(
                    f_name, line_no, len(fields)))

        indexing_inputs.append(convert_data2feature(sentence, word2idx, config["max_length"]))
        gazettes.append(
            gz.transform_end2end_complex_ngrams(sentence, config["max_length"], config["ngrams"]))
        indexing_tags.append(convert_data2feature(tags, tag2idx, config["max_length"]))

    # Stack the per-sentence arrays first: building a tensor from a Python
    # list of ndarrays is very slow.
    indexing_inputs = torch.tensor(np.array(indexing_inputs), dtype=torch.long)
    gazettes = torch.tensor(gazettes, dtype=torch.float)
    indexing_tags = torch.tensor(np.array(indexing_tags), dtype=torch.long)

    return indexing_inputs, gazettes, indexing_tags

# tensor 객체를 리스트 형으로 바꾸기 위한 함수
def tensor2list(input_tensor):
    """Return the tensor's values as (nested) Python lists.

    The tensor is copied to the CPU and detached from the autograd graph
    before being converted through NumPy.
    """
    host_array = input_tensor.cpu().detach().numpy()
    return host_array.tolist()

In [5]:
from torch.utils.data import (DataLoader, TensorDataset, RandomSampler)
import torch.optim as optim

def train(config):
    """Train an RNN_CRF model, saving a checkpoint and evaluating after every epoch.

    Args:
        config: mapping read for "word_vocab_file", "tag_vocab_file",
            "train_file", "dev_file", "batch_size", "lr", "epoch" and
            "output_dir_path" (plus whatever RNN_CRF and load_data read).

    Side effects:
        Writes ``epoch_<n>.pt`` into ``config["output_dir_path"]`` after each
        epoch and calls ``do_test`` on the dev set.
    """
    # Use the GPU when one is present, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the model.
    model = RNN_CRF(config).to(device)

    # Build the vocabularies.
    word2idx, idx2word = load_vocab(config["word_vocab_file"])
    tag2idx, idx2tag = load_vocab(config["tag_vocab_file"])

    # Load the data.
    train_input_features, train_gazettes, train_tags = load_data(config, config["train_file"], word2idx, tag2idx)
    test_input_features, test_gazettes, test_tags = load_data(config, config["dev_file"], word2idx, tag2idx)

    # Wrap the tensors in datasets / loaders; only the training set is shuffled.
    train_features = TensorDataset(train_input_features, train_gazettes, train_tags)
    train_dataloader = DataLoader(train_features, shuffle=True, batch_size=config["batch_size"])

    test_features = TensorDataset(test_input_features, test_gazettes, test_tags)
    test_dataloader = DataLoader(test_features, shuffle=False, batch_size=config["batch_size"])

    # Optimizer used to train the model.
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(config["output_dir_path"], exist_ok=True)

    for epoch in range(config["epoch"]):
        # do_test() switches the model to eval mode, so re-enable training mode each epoch.
        model.train()
        losses = []
        for step, batch in enumerate(train_dataloader):
            # Move the batch to the same device as the model.
            input_features, gazettes, labels = (t.to(device) for t in batch)

            # Calling the module (rather than .forward) keeps nn.Module hooks working.
            loss = model(input_features, gazettes, labels)

            # Reset gradients, back-propagate, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            if (step + 1) % 50 == 0:
                print("{} step processed.. current loss : {}".format(step + 1, loss_value))
            losses.append(loss_value)

        print("Average Loss : {}".format(np.mean(losses)))

        # Save a checkpoint for this epoch.
        torch.save(model.state_dict(), os.path.join(config["output_dir_path"], "epoch_{}.pt".format(epoch + 1)))

        do_test(model, test_dataloader, idx2word, idx2tag)


def test(config):
    """Load a saved checkpoint and evaluate it on the dev set.

    Args:
        config: mapping read for "word_vocab_file", "tag_vocab_file",
            "output_dir_path", "trained_model_name", "dev_file" and
            "batch_size" (plus whatever RNN_CRF and load_data read).
    """
    # Use the GPU when one is present, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the model.
    model = RNN_CRF(config).to(device)

    # Build the vocabularies.
    word2idx, idx2word = load_vocab(config["word_vocab_file"])
    tag2idx, idx2tag = load_vocab(config["tag_vocab_file"])

    # Load the saved weights; map_location lets a checkpoint saved on a GPU
    # be restored on whichever device is in use now.
    checkpoint_path = os.path.join(config["output_dir_path"], config["trained_model_name"])
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # Load the data.
    test_input_features, test_gazettes, test_tags = load_data(config, config["dev_file"], word2idx, tag2idx)

    # Wrap the tensors in a dataset / loader.
    test_features = TensorDataset(test_input_features, test_gazettes, test_tags)
    test_dataloader = DataLoader(test_features, shuffle=False, batch_size=config["batch_size"])

    # Run the evaluation. do_test expects (idx2word, idx2tag) in that order;
    # passing them swapped would decode the labels with the word vocabulary.
    do_test(model, test_dataloader, idx2word, idx2tag)
    
def make_output_file(input_features, predicts, idx2word, outfile):
    """Append one ``<tokens> \\t <tags>`` line per sentence to ``outfile``.

    Args:
        input_features: iterable of per-sentence token-index sequences
            (lists of ints or 1-D tensors).
        predicts: iterable of per-sentence tag-string sequences, aligned
            with ``input_features``.
        idx2word: dict mapping token index -> token string.
        outfile: path of the UTF-8 file to append to.
    """
    with open(outfile, 'a+', encoding='utf8') as file:
        for input_feature, predict in zip(input_features, predicts):
            tokens = []
            for idx in input_feature:
                # int() accepts both plain ints and 0-dim tensors.
                token = idx2word[int(idx)]
                if token == '<PAD>':
                    # Everything from the first padding token on is filler.
                    break
                tokens.append(token)

            natural_input = ' '.join(tokens)
            natural_predict = ' '.join(predict)
            file.write('{} \t {}\n'.format(natural_input, natural_predict))


def do_test(model, test_dataloader, idx2word, idx2tag):
    """Evaluate ``model`` on ``test_dataloader``, print a report and write 'pred.txt'.

    Args:
        model: module called as ``model(input_features, gazettes)`` and
            expected to return one predicted tag-index sequence per sentence.
        test_dataloader: yields ``(input_features, gazettes, labels)`` batches.
        idx2word: dict mapping token index -> token string.
        idx2tag: dict mapping tag index -> tag string.

    Side effects:
        Overwrites 'pred.txt' with the predictions of this evaluation and
        prints seqeval's classification report. Positions whose gold tag is
        "<SP>" or "<PAD>" are excluded from the report.
    """
    model.eval()
    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device
    outfile = 'pred.txt'
    ignored_tags = ("<SP>", "<PAD>")

    # make_output_file() appends, so start each evaluation from an empty file;
    # otherwise predictions from earlier epochs/runs would pile up.
    with open(outfile, 'w', encoding='utf8'):
        pass

    predicts, answers = [], []
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in test_dataloader:
            input_features, gazettes, labels = (t.to(device) for t in batch)

            # Predicted tag indices, one sequence per sentence.
            output = model(input_features, gazettes)

            batch_inputs = tensor2list(input_features)
            batch_tag_lines = []
            for token_ids, answer, predicted in zip(batch_inputs, tensor2list(labels), output):
                # Collect gold/predicted tags for scoring, skipping positions
                # whose gold tag is a space or padding marker.
                for gold_idx, pred_idx in zip(answer, predicted):
                    gold_tag = idx2tag[gold_idx]
                    if gold_tag in ignored_tags:
                        continue
                    answers.append(gold_tag.replace("_", "-"))
                    predicts.append(idx2tag[pred_idx].replace("_", "-"))

                # For the output file keep one tag per real (non-padding) token.
                n_tokens = next(
                    (i for i, idx in enumerate(token_ids) if idx2word[idx] == '<PAD>'),
                    len(token_ids))
                batch_tag_lines.append([idx2tag[e] for e in predicted[:n_tokens]])

            # Append this batch's sentences to the output file.
            make_output_file(batch_inputs, batch_tag_lines, idx2word, outfile)

    # Report precision / recall / F1 per entity type.
    print(classification_report(answers, predicts))

In [6]:
##########################################################
#                                                        #
#        평가 기준이 되는 지표는 Macro F1 Score          #
#           제출 포맷은 id \t predict_tag                #
#            25 \t B_PS I_PS <SP> O O O ...              #
#                                                        #
##########################################################


import os
if __name__ == "__main__":
    # Create the directory that train() saves checkpoints into and test()
    # loads them from, so both always agree with the configured path.
    os.makedirs(config["output_dir_path"], exist_ok=True)

    # Any mode other than "train" runs the evaluation path.
    if config["mode"] == "train":
        train(config)
    else:
        test(config)


100%|██████████| 2158/2158 [00:00<00:00, 1570589.63it/s]
100%|██████████| 12/12 [00:00<00:00, 87229.89it/s]
  0%|          | 9/7319 [00:00<01:24, 86.49it/s]

vocab/word_vocab.txt vocab file loading...
vocab/tag_vocab.txt vocab file loading...
ner_train.txt file loading...


100%|██████████| 7319/7319 [01:24<00:00, 86.69it/s]
  1%|          | 9/995 [00:00<00:11, 88.54it/s]

ner_dev.txt file loading...


100%|██████████| 995/995 [00:11<00:00, 88.02it/s]


50 step processed.. current loss : 20.04500961303711
100 step processed.. current loss : 12.02828598022461
Average Loss : 29.068561811032502




              precision    recall  f1-score   support

          DT       0.67      0.54      0.60       622
          LC       0.68      0.43      0.53       535
          OG       0.67      0.31      0.42       971
          PS       0.76      0.36      0.49       739
          TI       0.50      0.34      0.40        95

   micro avg       0.68      0.39      0.50      2962
   macro avg       0.66      0.40      0.49      2962
weighted avg       0.69      0.39      0.50      2962

50 step processed.. current loss : 10.462146759033203
100 step processed.. current loss : 11.924659729003906
Average Loss : 10.214529070646867
              precision    recall  f1-score   support

          DT       0.75      0.69      0.72       622
          LC       0.77      0.53      0.63       535
          OG       0.70      0.48      0.57       971
          PS       0.63      0.66      0.64       739
          TI       0.73      0.61      0.67        95

   micro avg       0.70      0.58      0.6

              precision    recall  f1-score   support

          DT       0.80      0.75      0.78       622
          LC       0.70      0.75      0.72       535
          OG       0.73      0.65      0.69       971
          PS       0.81      0.69      0.74       739
          TI       0.78      0.74      0.76        95

   micro avg       0.76      0.70      0.73      2962
   macro avg       0.76      0.72      0.74      2962
weighted avg       0.76      0.70      0.73      2962

50 step processed.. current loss : 0.4570493698120117
100 step processed.. current loss : 0.2832145690917969
Average Loss : 0.3691930115222931
              precision    recall  f1-score   support

          DT       0.80      0.74      0.77       622
          LC       0.76      0.71      0.73       535
          OG       0.71      0.64      0.68       971
          PS       0.76      0.71      0.73       739
          TI       0.81      0.77      0.79        95

   micro avg       0.75      0.70      0.7



              precision    recall  f1-score   support

          DT       0.80      0.74      0.77       622
          LC       0.72      0.69      0.70       535
          OG       0.74      0.62      0.68       971
          PS       0.77      0.68      0.72       739
          TI       0.78      0.73      0.75        95

   micro avg       0.76      0.68      0.71      2962
   macro avg       0.76      0.69      0.72      2962
weighted avg       0.76      0.68      0.72      2962

50 step processed.. current loss : 0.47033822536468506
100 step processed.. current loss : 0.338726282119751
Average Loss : 0.5946187796800033
              precision    recall  f1-score   support

          DT       0.77      0.74      0.76       622
          LC       0.79      0.64      0.71       535
          OG       0.70      0.67      0.68       971
          PS       0.73      0.70      0.71       739
          TI       0.80      0.71      0.75        95

   micro avg       0.74      0.69      0.7