# Author: Yoonhyuck WOO / JBNU_Industrial Information system Engineering
# Date; 3. 7. 2022 - 3. 17. 2022
# Title: Korean_NER
# Professor: Seung-Hoon Na

In [2]:
import os
import json
import numpy as np
from functools import partial
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, FlaubertModel
# from transformers import BertModel
from sklearn.metrics import classification_report

In [57]:
from DataPreprocessing import Data_NER, ner_collate_fn
from modeling_ner import Bert_NER

ModuleNotFoundError: No module named 'DataPreprocessing'

In [201]:
from ner_collate_fn import collate_fn_custom

In [3]:
PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)
bert = BertModel.from_pretrained(PRETAINED_MODEL_NAME)
# flaubert = FlaubertModel.from_pretrained(language_model_dir, output_loading_info=True)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# BertTokenizer
 - return tensors = "pt" : Finally, you want the tokenizer to return the actual tensors that are fed to the model.
 
# BertModel
 - Attention_mask : 1 where you care and 0 where you don't care.
 - Input_ids : the IDs of the sentence morpheme.
 - Token_type_ids : for the question problem, but it's enough to set it to zero now.

reference: https://huggingface.co/docs/transformers/model_doc/bert

# Example

# Loading data

In [4]:
PATH_dir = 'C:\\Users\\LG\\Desktop\\github\\JBNU-2022-SPRING\\English world class tagging & Korean_Named Entity Recognition\\Ko_En_NER_POStag_data\Ko_NER_POS'
PATH_ko_train = os.path.join(PATH_dir, 'prepro_train.json')
PATH_ko_test = os.path.join(PATH_dir, 'prepro_test.json')
PATH_ko_dev = os.path.join(PATH_dir, 'prepro_dev.json')
total_tag = os.path.join(PATH_dir, 'total_tag.json')

In [5]:
with open(PATH_ko_train) as f: 
    dataset_train = json.load(f)
with open(PATH_ko_test) as f: 
    dataset_test = json.load(f)
with open(PATH_ko_dev) as f: 
    dataset_dev = json.load(f)
with open(total_tag, 'r') as f:
    tag_converter = json.load(f)

In [6]:
print('train', len(dataset_train))
print('test', len(dataset_test))
print('dev', len(dataset_dev))
# tag_converter.id_to_tag

train 4250
test 500
dev 250


# collate_fn
 - Tie it in batches.
 - If the data set is variable length, it cannot be tied right away and causes an error, so you have to make a 'collate_fn' and hand it over.
  - partial_collate_fn: when collating, with padding
  
  
  - reference: https://hulk89.github.io/pytorch/2019/09/30/pytorch_dataset/

# collate_fn_custom

In [120]:
def collate_fn_custom(data):
    
    input_sent = [sample[0] for sample in data]
    labels = [sample[2] for sample in data]
    
    padded_inputs = torch.nn.utils.rnn.pad_sequence(input_sent, batch_first = True)
    
    return {'input': padded_inputs.contiguous(),
            'label': torch.stack(labels).contiguous()}

# 0309

In [7]:
from ner_collate_fn import make_same_len

In [8]:
tag_to_id ={'PAD': 0, 'B-<휠': 1, 'B-OG': 2, 'I-조선': 3, 'I-PS': 4, 'B-LC': 5, 'B-1': 6, 'I-<휠': 7, 'I-LC': 8, 'B-PS': 9, 'I-TI': 10, 
            'B-목소': 11, 'O': 12, 'I-목소': 13, 'I-1': 14, 'I-': 15, 'B-조선': 16, 'I-OG': 17, 'B-': 18, 'I-DT': 19, 'B-TI': 20, 'B-DT': 21}

In [9]:
def tag_to_id2(batch):
    total = []
    convert_to = []
    for sample in batch:
        for j in sample:
            j = tag_to_id.get(j)
            convert_to.append(j)
        total.append(convert_to)
        convert_to = []
        
    return total


In [11]:
a = [['PAD'],['B-OG','I-조선'],['I-PS','PAD','I-OG','O']]
tag_to_id2(a)

[[0], [2, 3], [4, 0, 17, 12]]

In [14]:
print(make_same_len(a))
tag_to_id2(make_same_len(a))

[['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], ['PAD', 'B-OG', 'I-조선', 'PAD', 'PAD', 'PAD'], ['PAD', 'I-PS', 'PAD', 'I-OG', 'O', 'PAD']]


[[0, 0, 0, 0, 0, 0], [0, 2, 3, 0, 0, 0], [0, 4, 0, 17, 12, 0]]

In [403]:
convert_to = []
def tag_to_id(data):
    for j in data:
        j = tag_to_id.get(j)
        convert_to.append(j)
    convert_to = torch.tensor(convert_to)
        
        
    return convert_to


In [13]:
def make_same_len(batch):
    each_len_list = [len(sample) for sample in batch]
    max_len = max(each_len_list)
    
    padded_batch = []
    pad_id = 0
    special_token = 'PAD'

    for sample in batch:
        padded_batch.append(['PAD'] + sample + ['PAD'] * (max_len - len(sample)) + ['PAD'])
    
    return padded_batch


In [15]:
b = [[1],[2,2],[3,3,3],[4,4,4,4],[5,5,5,5,5],[6,6,6,6,6,6]]
make_same_len(b)

[['PAD', 1, 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'],
 ['PAD', 2, 2, 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'],
 ['PAD', 3, 3, 3, 'PAD', 'PAD', 'PAD', 'PAD'],
 ['PAD', 4, 4, 4, 4, 'PAD', 'PAD', 'PAD'],
 ['PAD', 5, 5, 5, 5, 5, 'PAD', 'PAD'],
 ['PAD', 6, 6, 6, 6, 6, 6, 'PAD']]

In [370]:
convert_to = []
def make_same_len(batch):
    each_len_list = [len(sample) for sample in batch]
    max_len = max(each_len_list)
    
    padded_batch = []
    pad_id = 0
    special_token = 'PAD'

    for sample in batch:
        padded_batch.append(['CLS'] + sample+ ['PAD'] * (max_len - len(sample)) + ['SEP'])
    
        for tag in padded_batch:
            print(tag)
            for j in tag:
                j = tag_to_id.get(j)
                convert_to.append(j)
                
        padded_batch = convert_to
#     print('before',padded_batch)
#     for tag in padded_batch:
#         print(tag)
#         print('='*90)
#         for j in tag:
# #             print(j)
#             j = tag_to_id.get(j)
#             convert_to.append(j)
        
#         padded_batch = convert_to
# #     conver_to = []
#     print('aftre')
    return padded_batch


In [374]:
a = [['PAD'],['B-OG','I-조선'],[2,2,2]]
make_same_len(a)

[['PAD', 'PAD', 'PAD', 'PAD', 'PAD'],
 ['PAD', 'B-OG', 'I-조선', 'PAD', 'PAD'],
 ['PAD', 2, 2, 2, 'PAD']]

In [16]:
def ner_fn_custom(tokenizer, data):
    
    input_sent = [sample[0] for sample in data]
    labels = [sample[2] for sample in data]

    batch_inputs = tokenizer(input_sent, padding = True, return_tensors = "pt")
    batch_labels = make_same_len(labels)
    batch_labels = tag_to_id2(batch_labels)
    batch_labels = torch.tensor(batch_labels)
    
    return batch_inputs, batch_labels
#     return {'input': padded_inputs.contiguous(),
#             'label': torch.stack(labels).contiguous()}

In [34]:
def ner_fn_custom(tokenizer, data):
    
    input_sent = [sample[0] for sample in data]
    labels = [sample[2] for sample in data]

    batch_inputs = tokenizer(input_sent, padding = True, return_tensors = "pt")
    batch_labels = make_same_len(labels)
   
    
    return batch_inputs, batch_labels
#     return {'input': padded_inputs.contiguous(),
#             'label': torch.stack(labels).contiguous()}

In [260]:
batch_size = 10

In [36]:
batch_size = 1
# partial (func, /, *args, **keywords)  positional argument: args, keyword argument: keywords

# partial_collate_fn = partial(collate_fn_custom, tokenizer, tag_converter)
# partial_collate_fn = collate_fn_custom

partial_collate_fn = partial(ner_fn_custom, tokenizer)

'''
partial_collate_fn = partial(collate_fn_custom, tokenizer, tag_converter)
'''
# ner_collate_fn: padding & making batch?
'''
def ner_collate_fn(tokenizer, tag_converter):
    
'''

'\ndef ner_collate_fn(tokenizer, tag_converter):\n    \n'

In [37]:
dataloader_train = DataLoader(
    dataset_train,
    batch_size = batch_size,
    shuffle = True,
    collate_fn = partial_collate_fn)

dataloader_test = DataLoader(
    dataset_test,
    batch_size = batch_size,
    shuffle = False,
    collate_fn = partial_collate_fn)

dataloader_dev = DataLoader(
    dataset_dev,
    batch_size = batch_size,
    shuffle = False,
    collate_fn = partial_collate_fn)

# BERT_NER
 - Dropout: method to solve overfitting, one of the problems in deep learning learning. In summary, how to prevent some units of the hidden layer from being overfitted by inoperative<br>
    ->the outputs are scaled by a factor of (1 / 1-p)  during training

# *args, **kargs
 - *args(non-keworded arguments): Tuple type of argument without limitation.
 - **kargs(keworded arguments):  Dictionary type of argument without limitation.

In [507]:
class Bert_NER(nn.Module):
    
    def __init__(self, bert):
        super().__init__()        
        self.bert = bert
        self.dorpout = nn.Dropout(p = 0.1)
        self.linear = nn.Linear(768, 22)
        self.softmax = nn.Softmax(dim = 2) # A dimension along which Softmax will be computed
        
    
    def forward(self, **kwargs):
        emb = self.bert(**kwargs)
        e = self.dropout(emb['last_hidden_state'])
        w = self.linear(e)
        
        return w

In [508]:
len(tag_to_id)

22

In [509]:
tag_num = len(tag_to_id) # 22
model = Bert_NER(22)

In [510]:
CELoss = nn.CrossEntropyLoss(ignore_index = 0)
optimizer = AdamW(model.parameters(), lr = 1.0e-3)

In [502]:
# model.cuda(6)
# device = model.bert.device
# print(device)

In [38]:
for iteration, batch in enumerate(dataloader_test):
    batch_inputs = {k: v for k, v in list (batch[0].items())}
    batch_labels = batch[1]
    
    print('before')
    print(batch[1])
    for i in batch[1]:
        for j in i:
            j = tag_to_id.get(j)
            convert_to.append(j)

    batch_labels = convert_to
    convert_to = []
#     batch_labels = tag_to_id.get(batch_labels)
    
    print('='*33,'batch_inputs','='*33)
    print(batch_inputs)
    print('='*33,'after_labels','='*33)
    print(batch_labels)

before
[['PAD', 'B-PS', 'O', 'B-DT', 'I-DT', 'O', 'B-LC', 'O', 'O', 'B-OG', 'I-OG', 'O', 'O', 'O', 'O', 'O', 'B-OG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PS', 'I-PS', 'I-PS', 'I-PS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PAD']]
{'input_ids': tensor([[   101,   9319, 119249,  20309,   9043,    129,   9641,    113,  48556,
           9485,  18784,    114,   9532,  12692,  20626,  16439,   9056,  10739,
          16985, 118936,  15001,  63218,  12605,   9590,   9637,   9612,  16605,
          86015,  24178,   9056,  48387,  12605,   8843,    122,    118,    125,
           9202,   9109,  12508,   9076,    126,   9998,   9251,   9428,  51431,
           9559,  12605, 119351,  30134,   9202,  16985,  10739,  12945,   9202,
          17655,   9359,   9633,   8865,  77884,   9322,   9529,   9056,    119,
            102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

KeyboardInterrupt: 

# START!~

# Sample

In [47]:
for epoch in range(3): # 숫자 ->train_epoch
    model.train()
    
    for iteration, batch in enumerate(dataloader_train):
        batch_inputs = {k: v for k, v in list (batch[0].items())}
        batch_labels = batch[1]

        output = model(**batch_inputs)
        print(output)
        loss = CELoss(output.view(-1, output.size(-1)), batch_labels.view(-1))

#         optimizer.zero_grad()
#         loss.backward()

#         optimizer.step()

#         if (iteration + 1) % 10 == 0:
#             print(f'{iteration:3} - loss: {loss.item()}')

#     # todo 매 에포크가 끝나면 dev 데이터로 성능 비교하기
#     # Early Stopping 적용하기

TypeError: 'int' object is not callable

# 0309_1600

In [512]:
for epoch in range(3): # 숫자 ->train_epoch
#     model.train()
    
    for iteration, batch in enumerate(dataloader_train):
        batch_inputs = {k: v for k, v in list (batch[0].items())}
        batch_labels = batch[1]  
        
        emb = bert(**batch_inputs)
        e = nn.Dropout(0.1)
        r = e(emb['last_hidden_state'])
        w = nn.Linear(768, 22)
        output = w(r)
        
#         for i in batch[1]:
#             for j in i:
#                 j = tag_to_id.get(j)
#                 convert_to.append(j)

#             batch_labels = convert_to
#             convert_to = []
#             batch_labels = torch.tensor(batch_labels)

#         batch_labels.view(-1)
#         q = nn.Softmax(dim = 2) # A dimension along which Softmax will be computed
        
        loss = CELoss(output.view(-1, output.size(-1)), batch_labels.view(-1))

        optimizer.zero_grad()
        loss.backward()

        optimizer.step()

        if (iteration + 1) % 10 == 0:
            print(f'{iteration:3} - loss: {loss.item()}')

    # todo 매 에포크가 끝나면 dev 데이터로 성능 비교하기
    # Early Stopping 적용하기

KeyboardInterrupt: 

In [80]:
for epoch in range(5): # 숫자 ->train_epoch
    model.train()

    for data in dataloader_train:
        print(data['input'], data['label'])
#         loss = CELoss(data['input'].view(-1, output.size(-1)), data['label'].view(-1))

#         optimizer.zero_grad()
#         loss.backward()

#         optimizer.step()

#         if (iteration + 1) % 10 == 0:
#             print(f'{iteration:3} - loss: {loss.item()}')

    # todo 매 에포크가 끝나면 dev 데이터로 성능 비교하기
    # Early Stopping 적용하기

each_len_list [3]


ValueError: too many dimensions 'str'

# Mine

In [None]:
for epoch in range(10): # 숫자 ->train_epoch
    model.train()

    for iteration, batch in enumerate(dataloader_train):
        batch_inputs = {k: v.cuda(device) for k, v in list(batch[0].items())}
        batch_labels = batch[1].cuda(device)

        output = model(**batch_inputs)
        loss = CELoss(output.view(-1, output.size(-1)), batch_labels.view(-1))

        optimizer.zero_grad()
        loss.backward()

        optimizer.step()

        if (iteration + 1) % 10 == 0:
            print(f'{iteration:3} - loss: {loss.item()}')

    # todo 매 에포크가 끝나면 dev 데이터로 성능 비교하기
    # Early Stopping 적용하기

# Sample

In [None]:
model.eval()

gold_list = []
pred_list = []

with torch.no_grad():
    for iteration, batch in enumerate(dataloader_test):
        batch_inputs = {k: v.cuda(device) for k, v in list(batch[0].items())}
        batch_labels = batch[1].cuda(device)
        
        output = model(**batch_inputs)
        loss = CELoss(output.view(-1, output.size(-1)), batch_labels.view(-1))
        
        print('loss:', loss.item())
        pred_ids = torch.argmax(output, dim=-1)
        
        for g, p in zip(batch_labels, pred_ids):
            gold_mask = g != tag_converter.pad_id
            
            gold = tag_converter.convert_ids_to_tags(g[gold_mask].tolist())
            pred = tag_converter.convert_ids_to_tags(p[gold_mask].tolist())
            gold_list.append(gold)
            pred_list.append(pred)
            
            print(gold)
            print(pred)

# Mine

In [226]:
model.eval()

gold_list = []
pred_list = []

with torch.no_grad():
    for iteration, batch in enumerate(dataloader_test):
        batch_inputs = {k: v for k, v in list(batch[0].items())}
        batch_labels = batch[1]
        
        output = model(**batch_inputs)
        loss = CELoss(output.view(-1, output.size(-1)), batch_labels.view(-1))
        
        print('loss:', loss.item())
        pred_ids = torch.argmax(output, dim=-1)
        
        for g, p in zip(batch_labels, pred_ids):
            gold_mask = g != tag_converter.pad_id
            
            gold = tag_converter.convert_ids_to_tags(g[gold_mask].tolist())
            pred = tag_converter.convert_ids_to_tags(p[gold_mask].tolist())
            gold_list.append(gold)
            pred_list.append(pred)
            
            print(gold)
            print(pred)

TypeError: train() takes 1 positional argument but 2 were given

In [None]:
gold_list_flat = []
pred_list_flat = []
for g, p in zip(gold_list, pred_list):
    gold_list_flat += g
    pred_list_flat += p

In [None]:
print(classification_report(gold_list_flat, pred_list_flat, digits=5, labels=list(tag_converter.tag_to_id.keys())[1:]))

In [None]:
def get_chunk_type(tag_name):
    tag_class = tag_name.split('-')[0]
    tag_type = tag_name.split('-')[-1]
    return tag_class, tag_type

In [None]:
def get_chunks(seq):
    default = "O"

    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk 1
        if tok == default and chunk_type is not None:
            # Add a chunk.
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None

        # End of a chunk + start of a chunk!
        elif tok != default:
            tok_chunk_class, tok_chunk_type = get_chunk_type(tok)
            if chunk_type is None:
                chunk_type, chunk_start = tok_chunk_type, i
            elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok_chunk_type, i
        else:
            pass

    # end condition
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)

    return chunks

In [None]:
def evaluate_ner_F1(total_answers, total_preds):
    num_match = num_preds = num_answers = 0

    for answers, preds in zip(total_answers, total_preds):

        answer_seg_result = set(get_chunks(answers))
        pred_seg_result = set(get_chunks(preds))

        num_match += len(answer_seg_result & pred_seg_result)
        num_answers += len(answer_seg_result)
        num_preds += len(pred_seg_result)

    precision = 100.0 * num_match / num_preds
    recall = 100.0 * num_match / num_answers
    F1 = 2 * precision * recall / (precision + recall)

    return precision, recall, F1

In [None]:
evaluate_ner_F1(gold_list, pred_list)