### 개체명 인식 (Named-Entity-Recognition, NER)

In [1]:
'''
개체명 인식(NER)은 NLP 분야 중 하나이며, 사람, 장소 등과 같이 이름을 가진 개체 자체를 인식하는 작업입니다.
이번 글에서는 다중 언어에 대한 개체명 인식을 다루고자 합니다. 

이때 주로 사용하는 언어 모델은 바로 XLM-RoBERTa(XLM-R) 모델이며, XLM-R은 cross-lingual일 때 주로 사용합니다.

이번 글에서는 xtreme 데이터셋의 PAN-X 서브셋 중 독일어, 프랑스어, 이탈리아어 그리고 영어, 총 4개 언어의 데이터를 사용합니다.

LOC는 위치, PER는 사람, ORG는 조직에 대한 개체명을 의미합니다. 접두사 B-는 개체명의 시작 토큰을, I-는 같은 개체명에 속하며 이어지는 토큰을 나타내고, O는 어떤 개체명에도 속하지 않는 토큰을 의미합니다.
'''

from datasets import get_dataset_config_names

xtreme_subsets = get_dataset_config_names('xtreme')

from datasets import load_dataset 

load_dataset('xtreme', name='PAN-X.de')

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset xtreme (/root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)
100%|██████████| 3/3 [00:00<00:00, 856.45it/s]


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [2]:
from collections import defaultdict 
from datasets import DatasetDict 

# Down-sample every PAN-X split: shuffle with a fixed seed, then keep the
# leading `frac` share of the rows for each language.

langs = ['de', 'fr', 'it', 'en']
fracs = [0.629, 0.229, 0.084, 0.059]

# Maps language code -> DatasetDict holding the sampled splits.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    ds = load_dataset('xtreme', name=f'PAN-X.{lang}')

    for split in ds:
        split_ds = ds[split]
        n_keep = int(frac * split_ds.num_rows)  # truncate to a whole number of rows
        panx_ch[lang][split] = split_ds.shuffle(seed=0).select(range(n_keep))

Found cached dataset xtreme (/root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)
100%|██████████| 3/3 [00:00<00:00, 1068.16it/s]
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-ffa5433ed3c9724a.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-4039083f0558eec0.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-10c85988554a1179.arrow
Found cached dataset xtreme (/root/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)
100%|██████████| 3/3 [00:00<00:00, 764.97it/s]
Loading cach

In [3]:
import pandas as pd 

# Show how many training examples were kept per language after sampling.
pd.DataFrame({lang: [panx_ch[lang]['train'].num_rows] for lang in langs}, index=['Number of training examples'])

Unnamed: 0,de,fr,it,en
Numbert of training examples,12580,4580,1680,1180


In [4]:
panx_ch['de']['train'][1]

{'tokens': ['Sie',
  'geht',
  'hinter',
  'Walluf',
  'nahtlos',
  'in',
  'die',
  'Bundesautobahn',
  '66',
  'über',
  '.'],
 'ner_tags': [0, 0, 0, 3, 0, 0, 0, 3, 4, 0, 0],
 'langs': ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']}

In [5]:
'''
panx_ch['de']['train'][0]: 독일어(de) 학습 데이터에 존재하는 첫 번째 데이터를 불러온다는 것을 의미합니다.
ner_tags는 [O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC] 로 구성되어 있으며, 각각 index는 0~6의 값을 가집니다.
langs는 언어가 어떤 언어인지 의미합니다. 
'''

element = panx_ch['de']['train'][0]
for key, value in element.items():
    print(f'{key}: {value}')

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [6]:
for key, value in panx_ch['de']['train'].features.items():
    print(f'{key}: {value}')

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [7]:
tags = panx_ch['de']['train'].features['ner_tags'].feature 
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [8]:
def create_tag_names(batch):
    """Build a ``ner_tags_str`` column with the string name of every NER tag id.

    Each entry of ``batch['ner_tags']`` is converted with ``tags.int2str``,
    where ``tags`` is the module-level ``ClassLabel`` feature.

    Returns:
        dict with the single key ``'ner_tags_str'`` mapping to the list of
        converted tag names, in the same order as ``batch['ner_tags']``.
    """
    tag_names = []
    for tag_id in batch['ner_tags']:
        tag_names.append(tags.int2str(tag_id))
    return {'ner_tags_str': tag_names}

panx_de = panx_ch['de'].map(create_tag_names)

Loading cached processed dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-35865684f50f4704.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-16ce8618502a8695.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-13c6a081b8ccf94a.arrow


In [9]:
de_example = panx_de['train'][0]
pd.DataFrame([de_example['tokens'], de_example['ner_tags_str']], ['Tokens', 'Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [10]:
# LOC, ORG, PER가 데이터셋에 각각 얼마만큼 들어가 있는지 확인하는 코드 입니다. 

from collections import Counter 

# Count entity occurrences per split, keyed by entity type (the part of the
# tag after the dash).
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset['ner_tags_str']:
        for tag in row:
            # Only tags starting with 'B' are counted, so every entity is
            # counted once no matter how many I- tags follow it.
            if tag.startswith('B'):
                tag_type = tag.split('-')[1]
                split2freqs[split][tag_type] += 1 
# One row per split, one column per entity type.
pd.DataFrame.from_dict(split2freqs, orient='index')

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


### XLM-R 토큰화

In [11]:
from transformers import AutoTokenizer 

'''
감성 분석과 같은 모델을 사용할 때에는 bert-base-uncased를 사용하였으나 개체명 인식과 같은 경우에는 대문자가 중요하기 때문에
bert-base-cased를 사용합니다. cased로 사용하게 되면 단어의 대소문자를 구분해줍니다.
'''

bert_name_or_path = 'bert-base-cased'
xlmr_name_or_path = 'xlm-roberta-base'

bert_tokenizer = AutoTokenizer.from_pretrained(bert_name_or_path)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_name_or_path)

In [12]:
text = 'Jack Sparrow loves New York!'
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

In [13]:
bert_tokens

['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']

In [14]:
# BERT 토크나이저와 가장 큰 차이점은 문장의 시작과 끝을 [CLS], [SEP]가 아닌 <s>, </s>로 표시한다는 것입니다.
# 또한 단어 중간에 이어지는 subword 앞에 ##를 붙이는 대신, 단어가 시작되는 토큰 앞에 ▁(U+2581)를 붙입니다.
xlmr_tokens

['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '</s>']

### SentencePiece 토크나이저

In [15]:
# u'\u2581'는 일반 밑줄(_)이 아니라, 단어의 시작(공백)을 표시하는 '▁' 문자를 의미합니다.
"".join(xlmr_tokens).replace(u'\u2581', ' ')

'<s> Jack Sparrow loves New York!</s>'

In [16]:
import torch.nn as nn 
from transformers import XLMRobertaConfig 
from transformers.modeling_outputs import TokenClassifierOutput 
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

'''
감성 분석의 경우 [CLS] 토큰을 이용하여 긍정인지 혹은 부정인지 분류했습니다.
감성 분석은 모든 문장을 고려한 후 하나의 값만 도출하면 되기 때문에 그렇게 진행하였으나, 
NER 같은 경우에는 각 토큰에 대한 정보를 출력하여야 합니다. 
'''

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder with a per-token linear classification head.

    The encoder is built without its pooling layer; its first output is passed
    through dropout and a linear layer that maps ``config.hidden_size`` to
    ``config.num_labels`` logits for every token position.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Create the encoder, dropout and classifier from ``config``."""
        super().__init__(config)

        self.num_labels = config.num_labels
        # Encoder body only: the pooling layer is not needed for token-level output.
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Classification head applied to every token position.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the encoder and classify every token.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the encoder.
            labels: optional target ids; when given, a cross-entropy loss over
                the flattened logits and labels is computed.
            **kwargs: forwarded unchanged to the encoder.

        Returns:
            TokenClassifierOutput carrying ``loss`` (``None`` without labels),
            ``logits``, and the encoder's ``hidden_states`` and ``attentions``.
        """
        encoder_out = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs
        )
        token_states = self.dropout(encoder_out[0])
        logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            # Flatten to (tokens, num_labels) vs (tokens,) for the loss.
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )

In [17]:
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

In [18]:
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

In [19]:
from transformers import AutoConfig

xlmr_config = AutoConfig.from_pretrained(xlmr_name_or_path, num_labels=tags.num_classes, id2label=index2tag, label2id=tag2index)

In [20]:
import torch 

device = 'cuda' if torch.cuda.is_available() else 'cpu'

xlmr_model = (XLMRobertaForTokenClassification.from_pretrained(xlmr_name_or_path, config=xlmr_config).to(device))

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifie

In [21]:
input_ids = xlmr_tokenizer.encode(text, return_tensors='pt')

pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=['Tokens', 'Input IDs'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2


In [22]:
outputs = xlmr_model(input_ids.to(device)).logits
outputs = torch.argmax(outputs, dim=-1)

In [23]:
pred_y = [tags.names[p] for p in outputs[0].detach().cpu().numpy()]
pd.DataFrame([xlmr_tokens, pred_y], index=['Tokens', 'Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Tags,I-LOC,I-LOC,I-LOC,O,B-ORG,B-ORG,I-LOC,B-PER,I-LOC,I-LOC


In [40]:
def tag_text(text, tags, model, tokenizer):
    """Predict a tag for every token of ``text`` and show tokens next to tags.

    The text is tokenized once with the ``tokenizer`` passed in, so the tokens
    displayed and the ids fed to the model always come from the same
    tokenizer. The ids are moved to the device the model lives on.

    Args:
        text: raw input string.
        tags: object exposing ``names``, the list of tag names indexed by
            class id.
        model: token-classification model; the first element of its output
            must be the logits.
        tokenizer: tokenizer whose encoding exposes ``tokens()`` and
            ``input_ids``.

    Returns:
        pandas.DataFrame with two rows, ``'Tokens'`` and ``'Tags'``, and one
        column per token.
    """
    encoding = tokenizer(text, return_tensors='pt')
    tokens = encoding.tokens()
    input_ids = encoding.input_ids.to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_ids)[0]

    # Highest-scoring class per token of the single example in the batch.
    pred_ids = torch.argmax(logits, dim=-1)[0].tolist()
    pred_y = [tags.names[p] for p in pred_ids]
    return pd.DataFrame([tokens, pred_y], index=['Tokens', 'Tags'])

In [25]:
words, labels = de_example['tokens'], de_example['ner_tags']

In [26]:
tokenized_input = xlmr_tokenizer(words, is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
pd.DataFrame([tokens], index=['Tokens'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [27]:
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=['Tokens', 'Word IDs'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


In [28]:
previous_word_idx = None
label_ids = []

# Tokens that must not contribute to the loss get the label -100, because
# -100 is the default `ignore_index` of torch.nn.CrossEntropyLoss.
for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        # Special token, or a continuation piece of the previous word.
        label_ids.append(-100)
    else:
        # First sub-word of a new word carries that word's label.
        label_ids.append(labels[word_idx])

    previous_word_idx = word_idx

# Store the readable names under a new name: overwriting `labels` (the integer
# word-level labels read above) would make this cell fail when re-run.
label_names = [index2tag[l] if l != -100 else 'IGN' for l in label_ids]
index = ['Tokens', 'Word IDs', 'Label IDs', 'Labels']

pd.DataFrame([tokens, word_ids, label_ids, label_names], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label IDs,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [29]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level NER tags to the tokens.

    ``examples['tokens']`` is tokenized with the module-level
    ``xlmr_tokenizer`` (truncation enabled, input treated as already split
    into words). For every example, the first token of each word receives that
    word's tag from ``examples['ner_tags']``; tokens without a word id and
    subsequent tokens of the same word receive -100.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry holding one list
        of aligned label ids per example.
    """
    tokenized_inputs = xlmr_tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_tags in enumerate(examples['ner_tags']):
        token_word_ids = tokenized_inputs.word_ids(batch_index=batch_idx)
        row = []
        prev = None
        for current in token_word_ids:
            # Only the first token of a real word gets a label.
            starts_new_word = current is not None and current != prev
            row.append(word_tags[current] if starts_new_word else -100)
            prev = current
        aligned.append(row)

    tokenized_inputs['labels'] = aligned
    return tokenized_inputs

In [30]:
def encode_panx_dataset(corpus):
    """Apply ``tokenize_and_align_labels`` to ``corpus`` in batches.

    The original ``langs``, ``ner_tags`` and ``tokens`` columns are removed
    from the result.
    """
    raw_columns = ['langs', 'ner_tags', 'tokens']
    encoded = corpus.map(tokenize_and_align_labels, batched=True, remove_columns=raw_columns)
    return encoded

In [31]:
panx_de_encoded = encode_panx_dataset(panx_ch['de'])

Loading cached processed dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-dc2e1a689c46e1e7.arrow
100%|██████████| 7/7 [00:00<00:00, 28.10ba/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-147b5e23b5b64a2e.arrow


In [32]:
from seqeval.metrics import classification_report

# Toy example for the seqeval report: the MISC entity is predicted with a
# wrong start position, the PER entity is predicted exactly.
# The last tag of the first sequence is the letter 'O' (outside), not the digit '0'.
y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
          ['B-PER', 'I-PER', 'O']]

y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
          ['B-PER', 'I-PER', 'O']]

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2





In [33]:
import numpy as np 

def align_predictions(predictions, label_ids):
    """Convert raw predictions and label ids into lists of tag names.

    The predicted class is the argmax over axis 2 of ``predictions``.
    Positions whose label id is -100 are dropped from both outputs; the
    remaining ids are mapped to names through the module-level ``index2tag``.

    Returns:
        ``(preds_list, labels_list)``: two lists with one list of tag names
        per example.
    """
    pred_ids = np.argmax(predictions, axis=2)
    n_examples, n_tokens = pred_ids.shape
    labels_list, preds_list = [], []

    for row in range(n_examples):
        # Positions that carry a real label (everything except -100).
        kept = [col for col in range(n_tokens) if label_ids[row, col] != -100]
        labels_list.append([index2tag[label_ids[row][col]] for col in kept])
        preds_list.append([index2tag[pred_ids[row][col]] for col in kept])

    return preds_list, labels_list

In [34]:
from transformers import TrainingArguments

num_epochs = 3
batch_size = 16
# Number of training examples divided by the batch size (whole steps).
logging_steps = len(panx_de_encoded['train']) // batch_size
model_name = f'{xlmr_name_or_path}-finetuned-panx-de'

training_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    # NOTE(review): presumably chosen large so no intermediate checkpoint is written — confirm.
    save_steps=1e6,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
)

In [35]:
from seqeval.metrics import f1_score 

def compute_metrics(eval_pred):
    """Compute the seqeval F1 score for one evaluation pass.

    ``eval_pred.predictions`` and ``eval_pred.label_ids`` are turned into
    lists of tag names by ``align_predictions`` before scoring.

    Returns:
        dict with the single key ``'f1'``.
    """
    raw_predictions = eval_pred.predictions
    gold_ids = eval_pred.label_ids
    y_pred, y_true = align_predictions(raw_predictions, gold_ids)
    score = f1_score(y_true, y_pred)
    return {'f1': score}

In [36]:
from transformers import DataCollatorForTokenClassification

'''
배치 내에서 가장 긴 sequence의 길이에 맞춰 입력 sequence와 label을 함께 padding하기 위해 DataCollator를 사용합니다.
'''
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [37]:
def model_init():
    """Load a fresh XLMRobertaForTokenClassification and move it to ``device``.

    The weights come from ``xlmr_name_or_path`` and the configuration from
    ``xlmr_config``, both defined at module level.
    """
    model = XLMRobertaForTokenClassification.from_pretrained(xlmr_name_or_path, config=xlmr_config)
    return model.to(device)

from transformers import Trainer 

trainer = Trainer(model_init=model_init, args=training_args, 
                  data_collator=data_collator, compute_metrics=compute_metrics, 
                  train_dataset=panx_de_encoded['train'], 
                  eval_dataset=panx_de_encoded['validation'], 
                  tokenizer=xlmr_tokenizer)

In [38]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,0.2648,0.174794,0.817735
2,0.132,0.15006,0.848273
3,0.0828,0.1403,0.859773


TrainOutput(global_step=2361, training_loss=0.1597850874337742, metrics={'train_runtime': 194.2781, 'train_samples_per_second': 194.258, 'train_steps_per_second': 12.153, 'total_flos': 794808799314552.0, 'train_loss': 0.1597850874337742, 'epoch': 3.0})

In [41]:
text_de = 'Jeff Dean ist ein Informatiker bei Google in Kalifornien'
tag_text(text_de, tags, trainer.model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Tokens,<s>,▁Jeff,▁De,an,▁ist,▁ein,▁Informati,ker,▁bei,▁Google,▁in,▁Kaliforni,en,</s>
Tags,O,B-PER,I-PER,I-PER,O,O,O,O,O,B-ORG,O,B-LOC,I-LOC,O
