In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import pickle
import torch
from gluonnlp.data import SentencepieceTokenizer
from model.net import KobertCRF
from data_utils.utils import Config
from data_utils.vocab_tokenizer import Tokenizer
from data_utils.pad_sequence import keras_pad_fn
from pathlib import Path

2022-06-02 00:50:48.284083: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/envs/ocr_new/lib/python3.8/site-packages/cv2/../../lib64:
2022-06-02 00:50:48.284123: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [2]:
class DecoderFromNamedEntitySequence():
    """Turn predicted NER tag ids back into entity words and an annotated sentence.

    Args:
        tokenizer: object exposing ``decode_token_ids(list_of_input_ids)``; the first
            element of its return value is used as the token strings of the sentence.
        index_to_ner: mapping from a predicted tag id to its tag string
            (tags containing ``"B-"`` open an entity, ``"I-"`` continue one).
    """

    def __init__(self, tokenizer, index_to_ner):
        self.tokenizer = tokenizer
        self.index_to_ner = index_to_ner

    def __call__(self, list_of_input_ids, list_of_pred_ids):
        """Decode the first sequence of the batch.

        Args:
            list_of_input_ids: batch of token-id sequences; only element 0 is decoded.
            list_of_pred_ids: batch of predicted tag-id sequences; only element 0 is used.

        Returns:
            tuple ``(list_of_ner_word, decoding_ner_sentence)`` where
            ``list_of_ner_word`` is a list of ``{"word", "tag", "prob"}`` dicts
            (``prob`` is always ``None``) and ``decoding_ner_sentence`` is the
            sentence text with entities wrapped as ``<word:TAG>``.
        """
        input_token = self.tokenizer.decode_token_ids(list_of_input_ids)[0]
        pred_ner_tag = [self.index_to_ner[pred_id] for pred_id in list_of_pred_ids[0]]

        list_of_ner_word = self._collect_entities(input_token, pred_ner_tag)
        decoding_ner_sentence = self._annotate_sentence(input_token, pred_ner_tag)
        return list_of_ner_word, decoding_ner_sentence

    @staticmethod
    def _collect_entities(input_token, pred_ner_tag):
        """Group tokens into entities by following the B-/I- tag sequence."""
        list_of_ner_word = []
        entity_word, entity_tag = "", ""

        for i, pred_ner_tag_str in enumerate(pred_ner_tag):
            if "B-" in pred_ner_tag_str:
                # A new entity starts: emit the one in progress first. This must
                # happen even when both entities share the same tag, otherwise the
                # earlier of two adjacent same-type entities is lost.
                if entity_tag != "":
                    list_of_ner_word.append({"word": entity_word.replace("▁", " "), "tag": entity_tag, "prob": None})
                entity_tag = pred_ner_tag_str[-3:]  # the tag type is the last three characters
                entity_word = input_token[i]
            elif "I-" + entity_tag in pred_ner_tag_str:
                entity_word += input_token[i]
            else:
                if entity_word != "" and entity_tag != "":
                    list_of_ner_word.append({"word": entity_word.replace("▁", " "), "tag": entity_tag, "prob": None})
                entity_word, entity_tag = "", ""

        # Emit an entity that is still open when the tag sequence ends.
        if entity_word != "" and entity_tag != "":
            list_of_ner_word.append({"word": entity_word.replace("▁", " "), "tag": entity_tag, "prob": None})

        return list_of_ner_word

    @staticmethod
    def _annotate_sentence(input_token, pred_ner_tag):
        """Rebuild the sentence text, wrapping each entity as ``<word:TAG>``."""
        decoding_ner_sentence = ""
        is_prev_entity = False
        prev_entity_tag = ""
        is_there_B_before_I = False
        last_index = len(pred_ner_tag) - 1

        for i, (token_str, pred_ner_tag_str) in enumerate(zip(input_token, pred_ner_tag)):
            if i == 0 or i == last_index:  # remove [CLS], [SEP]
                continue
            token_str = token_str.replace('▁', ' ')  # replace the '▁' marker with a space

            if 'B-' in pred_ner_tag_str:
                if is_prev_entity:
                    decoding_ner_sentence += ':' + prev_entity_tag + '>'

                # Keep a leading space outside the opening bracket.
                # startswith() is also safe for an empty token.
                if token_str.startswith(' '):
                    decoding_ner_sentence += ' <' + token_str[1:]
                else:
                    decoding_ner_sentence += '<' + token_str
                is_prev_entity = True
                prev_entity_tag = pred_ner_tag_str[-3:]  # the first (B-) prediction decides the tag
                is_there_B_before_I = True

            elif 'I-' in pred_ner_tag_str:
                decoding_ner_sentence += token_str

                if is_there_B_before_I:  # an I- tag only counts when a B- tag came before it
                    is_prev_entity = True
            else:
                if is_prev_entity:
                    decoding_ner_sentence += ':' + prev_entity_tag + '>' + token_str
                    is_prev_entity = False
                    is_there_B_before_I = False
                else:
                    decoding_ner_sentence += token_str

        # Close an entity that runs up to the last kept token; without this the
        # sentence would end with an unterminated '<word'.
        if is_prev_entity:
            decoding_ner_sentence += ':' + prev_entity_tag + '>'

        return decoding_ner_sentence

In [28]:
def input_test(model_dir='./experiments/base_model_with_crf',
               tok_path="./ptr_lm_model/tokenizer_78b3253a26.model",
               checkpoint_path='/opt/ml/ocr/ner/ner_kobert/experiments/base_model_with_crf_val/best-epoch-12-step-1000-acc-0.961.bin'):
    """Interactively tag sentences typed at the prompt and print the annotated text.

    Loads the config, vocab, tag mapping and checkpoint, then reads lines with
    ``input()`` until the user enters ``end``. For each line the decoded
    sentence (entities wrapped as ``<word:TAG>``) is printed.

    Args:
        model_dir: directory holding ``config.json``, ``vocab.pkl`` and
            ``ner_to_index.json``.
        tok_path: path of the SentencePiece model passed to ``SentencepieceTokenizer``.
        checkpoint_path: path of the checkpoint whose ``'model_state_dict'`` entry
            is loaded into the model.

    Returns:
        None.
    """
    model_dir = Path(model_dir)
    model_config = Config(json_path=model_dir / 'config.json')

    # load vocab & tokenizer
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    # NOTE(security): pickle.load can execute arbitrary code; only use a trusted vocab.pkl.
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # load ner_to_index.json and build the reverse (id -> tag string) mapping
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
    index_to_ner = {v: k for k, v in ner_to_index.items()}

    # model
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # load the checkpoint on CPU first; the model is moved to the target device below
    model_dict = model.state_dict()
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        # checkpoint keys may contain "module." (presumably from a DataParallel wrapper — TODO confirm)
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not int model_dict".format(new_key_name))
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys)
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)

    while True:
        input_text = input('input> ')
        if input_text == 'end':
            break

        list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
        x_input = torch.tensor(list_of_input_ids).long().to(device)
        # inference only: skip autograd bookkeeping
        with torch.no_grad():
            list_of_pred_ids = model(x_input)

        _, decoding_ner_sentence = decoder_from_res(list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
        print("output>", decoding_ner_sentence)
        print("")

In [30]:
# Start the interactive prompt loop; entering 'end' at the prompt stops it.
input_test()

output>  <김 나 영:PER> .

output>  김 . 나 . 영.

output> 

output>  안녕하세요, <김나영:PER>입니다.

output> 

output>  안녕하세요, <김 나 영:PER> 입니다.

output> 

output>  안녕하세요저는<김나영:PER>입니다반갑습니다.

output> 

output>  반갑습니다안녕하세요저는<김나영:PER>입니다그래서<서울:LOC>시반갑습니다



KeyboardInterrupt: Interrupted by user

## Namecard Test

In [4]:
# Path of the JSON file whose 'annotations' entries are read in the next cell.
json_path = '/opt/ml/ocr/info_val.json'

In [5]:
import json

# Read the annotation file explicitly as UTF-8 so the text does not depend on
# the locale's default encoding.
with open(json_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Per-image annotation records; each one is expected to carry ann['ocr']['word'].
anns = json_data['annotations']

In [6]:
# One string per annotation: every OCR word text followed by a space,
# with a single '.' appended at the end.
all_words_list = [
    ''.join(word_box['text'] + ' ' for word_box in annotation['ocr']['word']) + '.'
    for annotation in anns
]

In [7]:
def namecard_test(list_of_text=None,
                  model_dir='./experiments/base_model_with_crf',
                  tok_path="./ptr_lm_model/tokenizer_78b3253a26.model",
                  checkpoint_path='/opt/ml/ocr/ner/ner_kobert/experiments/base_model_with_crf_val/best-epoch-12-step-1000-acc-0.961.bin'):
    """Tag every text with the KoBERT-CRF model and collect the entity words per tag.

    Each decoded sentence is printed, then its ``<word:TAG>`` spans are parsed and
    the words are appended to the list of the matching tag.

    Args:
        list_of_text: iterable of input strings. When ``None`` the notebook-level
            ``all_words_list`` is used, as before.
        model_dir: directory holding ``config.json``, ``vocab.pkl`` and
            ``ner_to_index.json``.
        tok_path: path of the SentencePiece model passed to ``SentencepieceTokenizer``.
        checkpoint_path: path of the checkpoint whose ``'model_state_dict'`` entry
            is loaded into the model.

    Returns:
        dict mapping each of the ten tag names (PER, LOC, ORG, POH, DAT, TIM, DUR,
        MNY, PNT, NOH) to the list of words tagged with it, in encounter order.
    """
    if list_of_text is None:
        list_of_text = all_words_list

    model_dir = Path(model_dir)
    model_config = Config(json_path=model_dir / 'config.json')

    # load vocab & tokenizer
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    # NOTE(security): pickle.load can execute arbitrary code; only use a trusted vocab.pkl.
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # load ner_to_index.json and build the reverse (id -> tag string) mapping
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
    index_to_ner = {v: k for k, v in ner_to_index.items()}

    # model
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # load the checkpoint on CPU first; the model is moved to the target device below
    model_dict = model.state_dict()
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        # checkpoint keys may contain "module." (presumably from a DataParallel wrapper — TODO confirm)
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not int model_dict".format(new_key_name))
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys)
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)

    tag_result = {
        'PER': [],
        'LOC': [],
        'ORG': [],
        'POH': [],
        'DAT': [],
        'TIM': [],
        'DUR': [],
        'MNY': [],
        'PNT': [],
        'NOH': []
    }

    for input_text in list_of_text:

        list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
        x_input = torch.tensor(list_of_input_ids).long().to(device)
        # inference only: skip autograd bookkeeping
        with torch.no_grad():
            list_of_pred_ids = model(x_input)

        _, decoding_ner_sentence = decoder_from_res(list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
        print("output>", decoding_ner_sentence)
        print("--------------------------------------------------------------------------------------------------------")
        print("")

        # Cut the sentence at every '<' and '>' so each "<word:TAG>" span becomes
        # its own "word:TAG" segment.
        for segment in decoding_ner_sentence.replace('>', '<').split('<'):
            if ':' in segment:
                # Split on the LAST ':' only, so a word that itself contains ':'
                # (e.g. a URL with a scheme) is kept whole.
                word, tag = segment.rsplit(':', 1)
                if tag in tag_result:
                    tag_result[tag].append(word)

    return tag_result

### NER Tagset

- 10개의 태그  
  - PER: 사람이름
  - LOC: 지명
  - ORG: 기관명
  - POH: 기타
  - DAT: 날짜
  - TIM: 시간
  - DUR: 기간
  - MNY: 통화
  - PNT: 비율
  - NOH: 기타 수량표현
- 개체의 범주
  - 개체이름: 사람이름(PER), 지명(LOC), 기관명(ORG), 기타(POH)
  - 시간표현: 날짜(DAT), 시간(TIM), 기간 (DUR)
  - 수량표현: 통화(MNY), 비율(PNT), 기타 수량표현(NOH)

In [8]:
# Run the tagger over all_words_list; result maps each tag name to the list of words tagged with it.
result = namecard_test()

  score = torch.where(mask[i].unsqueeze(1), next_score, score)


output>  내 비장의 무기는 아직 손안에 있다.그것은 희망이다 <제이준코스메틱:ORG> 주식회사 부품1팀 <서:PER>도가 <표용환:PER> 웹 <www.irisgoodtour.com:POH> 메일 <uo6ay83k[UNK]:POH><4nsys.com:POH> 유선 (<+55:POH>)626-55-845 핸드폰 (<+55:POH>)4379-51-4960 팩스 (<+55:POH>)2038-79-284 (06150) <서울특별시:LOC> <강남구:LOC> <테헤란로 329:LOC> (<역삼동:LOC>) 1215호 <830:NOH>-66-69259 <:NOH>.
--------------------------------------------------------------------------------------------------------

output>  <큐온컴퍼니:ORG> 가난은 많은 뿌리를 갖고 있습니다. 그러난 큰 뿌리는 무식이다. 비상기획관 <김대성:PER> 한국어강사 <Web:POH>) <www.tnyint.co.kr P:POH>hone) (<+386:POH>)<623-88-5891 Fax:POH>) (<+386)8111-2097-7085:POH> <Tel:POH>) (<+386:POH>)<004-17-828:POH> <Email:POH>) <5rtrpk[UNK]:POH><until.co.kr:POH> (3<5229:NOH>) <대전광역시:LOC> <서구:LOC> 둔지<로:POH> <36:NOH> <4층:NOH> <404호:NOH> <619:NOH>-08-86934 <:POH>.
--------------------------------------------------------------------------------------------------------

output>  <에스브이 컴퍼니:ORG> 고통에서 도피하지 말라. 고통의 밑바닥이 얼마나 감미로운 가를 맛보라. <최우석:PER> 기획조정실 PD H. <www.necktieh

- 10개의 태그  
  - PER: 사람이름
  - LOC: 지명
  - ORG: 기관명
  - POH: 기타
  - DAT: 날짜
  - TIM: 시간
  - DUR: 기간
  - MNY: 통화
  - PNT: 비율
  - NOH: 기타 수량표현

In [17]:
# Report how many words were collected for each tag.
print("Num of Tags")
for tag_name, tagged_words in result.items():
    print(f"{tag_name} : {len(tagged_words)}")

Num of Tags
PER : 540
LOC : 1859
ORG : 387
POH : 3186
DAT : 68
TIM : 0
DUR : 34
MNY : 2
PNT : 0
NOH : 964


In [27]:
# Show at most the first 50 collected words of every tag.
print('Example')
for tag_name, tagged_words in result.items():
    print(f"{tag_name} : {tagged_words[:50]}")

Example
PER : ['서', '표용환', '김대성', '최우석', '박성희', '반한달', '윤정원', '김영도', '문정원', '이진형', '홍경문', '정규진', '이선유', '김한수', '이수경', '김준규', '그로스 앤 홀', '김현일', '김은진', '크린마스터', '김상균', '홍승정', '최은아', '오경수', '김인성', '이정림', '김규한', '최광희', '오순자', '강정수', '이지윤', '양서원', '정혜진', '김준석', '블링코', '김도전', '이누리', '유재원', '김재남', '김정연', '최은정', '임동근', '이용후', '서', '김기환', '김지숙', '윤솔비', '안상현', '김정호', '송영숙']
LOC : ['서울특별시', '강남구', '테헤란로 329', '역삼동', '대전광역시', '서구', '충청남도', '공주시', '백제문화로 2148-21', '서울특별시', '강남구', '테헤란로 329 (', '삼동', '서울특별시', '강남구', '봉은사로112길 22 3층', '삼성동', '경상북도', '경주시', '외외1길', '서울특별시', '영등포구', '영신로 220 1408호', '서울특별시', '강서구', '등촌로55길 21', '서울특별시', '강남구', '테헤란로 328', '동우빌딩 10층', '서울특별시', '강남구', '선릉로 427', '서울특별시', '영등포구', '의사당대로 8 태', '서울특별', '강남구', '테헤란로 518 4층', '대치동', '서울특별시', '서대문구', '연희로 83', '은혜빌딩', '', '서울특별시', '송파구', '법원로 128 A동 408호 861-38-30081', '서울특별시', '송파구']
ORG : ['제이준코스메틱', '큐온컴퍼니', '에스브이 컴퍼니', '헤이수', 'ICT혁신', '불링누리', '새론디자인', '샵마', '혜[UNK]틀연구소', 'Email', '수린산업', '에지리', '신한국인쇄소', '스타일로직 재료연구소', '법무