In [1]:
!pip install transformers==4.5.0 fugashi==1.1.0 ipadic==1.0.0 pytorch-lightning==1.2.7



In [2]:
import itertools
import random
import json
from tqdm import tqdm
import numpy as np
import unicodedata

import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForTokenClassification
import pytorch_lightning as pl

MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

In [3]:
normalize = lambda s: unicodedata.normalize('NFKC', s)
print(f'ＡＢＣ -> {normalize("ＡＢＣ")}' )  # 全角アルファベット
print(f'ABC -> {normalize("ABC")}' )        # 半角アルファベット
print(f'１２３ -> {normalize("１２３")}' )  # 全角数字
print(f'123 -> {normalize("123")}' )        # 半角数字
print(f'アイウ -> {normalize("アイウ")}' )  # 全角カタカナ
print(f'ｱｲｳ -> {normalize("ｱｲｳ")}' )        # 半角カタカナ

ＡＢＣ -> ABC
ABC -> ABC
１２３ -> 123
123 -> 123
アイウ -> アイウ
ｱｲｳ -> アイウ


In [4]:
class NER_tokenizer(BertJapaneseTokenizer):

  def encode_plus_tagged(self, text, entities, max_length):
    '''
    文章とそれに含まれる固有表現が与えられた時に符号化とラベル列の作成を行う。
    '''

    # 固有表現の前後で分割し、ラベル付け
    entities = sorted(entities, key=lambda x: x['span'][0])
    splitted = []
    position = 0
    for entity in entities:
      start = entity['span'][0]
      end = entity['span'][1]
      label = entity['type_id']
      # not 固有表現
      splitted.append({'text':text[position:start], 'label':0})
      # 固有表現
      splitted.append({'text':text[start:end], 'label':label})
      position = end
    splitted.append({'text':text[position:], 'label':0})
    #長さ0の文字列は除く
    splitted = [ s for s in splitted if s['text'] ]

    # トークン化し、ラベルをつける
    tokens = []
    labels = []
    for text_splitted in splitted:
      text = text_splitted['text']
      label = text_splitted['label']
      tokens_splitted = tokenizer.tokenize(text)
      labels_splitted = [label] * len(tokens_splitted)
      tokens.extend(tokens_splitted)
      labels.extend(labels_splitted)

    # 符号化を行う
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    encoding = tokenizer.prepare_for_model(
        input_ids,
        max_length = max_length,
        padding='max_length' ,
        truncation=True 
    )
    #[CLS],[SEP] = 0
    labels = [0] + labels[:max_length-2] + [0]
    #[PAD] = 0
    labels = labels + [0]*(max_length-len(labels))
    encoding['labels'] = labels

    return encoding

  def encode_plus_untagged( self, text, max_length=None, return_tensors=None):
    tokens = []
    tokens_original = []
    words = self.word_tokenizer.tokenize(text) #単語に分割
    for word in words:
      # 単語をサブワードに分割
      tokens_word = self.subword_tokenizer.tokenize(word)
      tokens.extend(tokens_word)
      if tokens_word[0] == '[UNK]':
        tokens_original.append(word)
      else:
        tokens_original.extend([
          token.replace('##', '') for token in tokens_word
        ])

    #各トークンの文章中での位置を調べる
    position = 0
    spans = []
    for token in tokens_original:
      l = len(token)
      while 1:
        if token != text[position:position+l]:
          position += 1
        else:
          spans.append([position, position+l])
          position += l
          break

    #符号化
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    encoding = tokenizer.prepare_for_model(
        input_ids,
        max_length=max_length,
        padding='max_length' if max_length else False,
        truncation=True if max_length else False
    )
    sequence_length = len(encoding['input_ids'])
    # [CLS]
    spans = [[-1, -1]] + spans[:sequence_length-2] 
    # [SEP], [PAD]
    spans = spans + [[-1, -1]] * ( sequence_length - len(spans) )

    if return_tensors == 'pt':
      encoding = { k: torch.tensor([v]) for k, v in encoding.items() }

    return encoding, spans

  def convert_bert_output_to_entities(self, text, labels, spans):
    '''
    文章・ラベル列の予測値・各トークンの位置から固有表現を得る
    '''

    # labels, spansから特殊トークンに対応する部分を取り除く
    labels = [label for label, span in zip(labels, spans) if span[0] != -1]
    spans = [span for span in spans if span[0] != -1]

    # 同じラベルが連続するトークンをまとめて、固有表現を抽出する
    entities = []
    for label, group in itertools.groupby(enumerate(labels), key=lambda x: x[1]):
      
      group = list(group)
      start = spans[group[0][0]][0]
      end = spans[group[-1][0]][1]

      if label != 0: #ラベルが0以外ならば、新たな固有表現として追加
        entity = {
            'name':text[start:end],
            'span':[start, end],
            'type_id':label
        }
        entities.append(entity)

    return entities


In [5]:

tokenizer = NER_tokenizer.from_pretrained(MODEL_NAME)

In [6]:
text = '昨日のみらい事務所との打ち合わせは順調だった。'
entities = [
    {'name': 'みらい事務所', 'span': [3,9], 'type_id': 1}
]

encoding = tokenizer.encode_plus_tagged(
    text, entities, max_length=20
)
print(encoding)

{'input_ids': [2, 10271, 28486, 5, 546, 10780, 2464, 13, 5, 1878, 2682, 9, 10750, 308, 10, 8, 3, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], 'labels': [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [7]:
text = '騰訊の英語名はTencent Holdings Ltdである。'
encoding, spans = tokenizer.encode_plus_untagged(
    text, return_tensors='pt'
)
print('# encoding')
print(encoding)
print('# spans')
print(spans)

# encoding
{'input_ids': tensor([[    2,     1, 26280,     5,  1543,   125,     9,  6749, 28550,  2953,
         28550, 28566, 21202, 28683, 14050, 12475,    12,    31,     8,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
# spans
[[-1, -1], [0, 1], [1, 2], [2, 3], [3, 5], [5, 6], [6, 7], [7, 9], [9, 10], [10, 12], [12, 13], [13, 14], [15, 18], [18, 19], [19, 23], [24, 27], [27, 28], [28, 30], [30, 31], [-1, -1]]


In [8]:
labels_predicted = [0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0]
entities = tokenizer.convert_bert_output_to_entities(
    text, labels_predicted, spans
)
print(entities)

[{'name': '騰訊', 'span': [0, 2], 'type_id': 1}, {'name': 'Tencent Holdings Ltd', 'span': [7, 27], 'type_id': 1}]


In [9]:
tokenizer = NER_tokenizer.from_pretrained(MODEL_NAME)
bert_tc = BertForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=4
)
bert_tc = bert_tc.cuda()

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the m

In [10]:
text = 'AさんはB大学に入学した。'

encoding, spans = tokenizer.encode_plus_untagged(
    text, return_tensors='pt'
)
encoding = { k: v.cuda() for k, v in encoding.items() } 

with torch.no_grad():
  output = bert_tc(**encoding)
  scores = output.logits
  labels_predicted = scores[0].argmax(-1).cpu().numpy().tolist()

entities = tokenizer.convert_bert_output_to_entities(
    text, labels_predicted, spans
)
print(entities)

[{'name': 'A', 'span': [0, 1], 'type_id': 1}, {'name': 'さんはB大学に入学した', 'span': [1, 12], 'type_id': 3}]


In [11]:
data = [
    {
        'text': 'AさんはB大学に入学した。',
        'entities': [
            {'name': 'A', 'span': [0, 1], 'type_id': 2},
            {'name': 'B大学', 'span': [4, 7], 'type_id': 1}
        ]
    },
    {
        'text': 'CDE株式会社は新製品「E」を販売する。',
        'entities': [
            {'name': 'CDE株式会社', 'span': [0, 7], 'type_id': 1},
            {'name': 'E', 'span': [12, 13], 'type_id': 3}
        ]
    }
]

max_length=32
dataset_for_loader = []
for sample in data:
  text = sample['text']
  entities = sample['entities']
  encoding = tokenizer.encode_plus_tagged(
      text, entities, max_length=max_length
  )
  encoding = { k : torch.tensor(v) for k, v in encoding.items() }
  dataset_for_loader.append(encoding)
dataloader = DataLoader(dataset_for_loader, batch_size=len(data))

for batch in dataloader:
  batch = { k: v.cuda() for k, v in batch.items()}
  output = bert_tc(**batch)
  loss = output.loss

In [12]:
!git clone --branch v2.0 https://github.com/stockmarkteam/ner-wikipedia-dataset 

fatal: destination path 'ner-wikipedia-dataset' already exists and is not an empty directory.


In [13]:
dataset = json.load(open('ner-wikipedia-dataset/ner.json','r'))

type_id_dict = {
    "人名": 1,
    "法人名": 2,
    "政治的組織名": 3,
    "その他の組織名": 4,
    "地名": 5,
    "施設名": 6,
    "製品名": 7,
    "イベント名": 8
}

for sample in dataset:
  sample['text'] = unicodedata.normalize('NFKC', sample['text'])
  for e in sample['entities']:
    e['type_id'] = type_id_dict[e['type']]
    del e['type']

random.shuffle(dataset)
n = len(dataset)
n_train = int(n*0.6)
n_val = int(n*0.2)
dataset_train = dataset[:n_train]
dataset_val = dataset[n_train:n_train+n_val]
dataset_test = dataset[n_train+n_val:]

In [14]:
def create_dataset(tokenizer, dataset, max_length):
  '''
  データセットをデータローダに入力できるように
  '''
  dataset_for_loader = []
  for sample in dataset:
    text = sample['text']
    entities = sample['entities']
    encoding = tokenizer.encode_plus_tagged(
        text, entities, max_length=max_length
    )
    encoding = { k: torch.tensor(v) for k, v in encoding.items() }
    dataset_for_loader.append(encoding)
  return dataset_for_loader

tokenizer = NER_tokenizer.from_pretrained(MODEL_NAME)

max_length = 128

dataset_train_for_loader = create_dataset(
    tokenizer, dataset_train, max_length
)
dataset_val_for_loader = create_dataset(
    tokenizer, dataset_val, max_length
)

dataloader_train = DataLoader(
    dataset_train_for_loader, batch_size=32, shuffle=True
)

dataloader_val = DataLoader(
    dataset_val_for_loader, batch_size=256
)

In [15]:
class BertForTokenClassification_pl(pl.LightningModule):

  def __init__(self, model_name, num_labels, lr):
    super().__init__()
    self.save_hyperparameters()
    self.bert_tc = BertForTokenClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

  def training_step(self, batch, batch_idx):
    output = self.bert_tc(**batch)
    loss = output.loss
    self.log('train_loss', loss)
    return loss

  def validation_step(self, batch, batch_idx):
    output = self.bert_tc(**batch)
    val_loss = output.loss
    self.log('val_loss', val_loss)

  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/'
)

#学習方法指定
trainer = pl.Trainer(
    gpus=1,
    max_epochs=5,
    callbacks=[checkpoint]
)

model = BertForTokenClassification_pl(
    MODEL_NAME, num_labels=9, lr=1e-5
)
trainer.fit(model, dataloader_train, dataloader_val)
best_model_path = checkpoint.best_model_path

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
S

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [16]:
def predict(text, tokenizer, bert_tc):
  '''
  BERTで固有表現抽出を行うための関数
  '''

  encoding, spans = tokenizer.encode_plus_untagged(
      text, return_tensors='pt'
  )
  encoding = { k: v.cuda() for k, v in encoding.items() }

  with torch.no_grad():
    output = bert_tc(**encoding)
    scores = output.logits
    labels_predicted = scores[0].argmax(-1).cpu().numpy().tolist()

  entities = tokenizer.convert_bert_output_to_entities(
      text, labels_predicted, spans
  )

  return entities

tokenizer = NER_tokenizer.from_pretrained(MODEL_NAME)

model = BertForTokenClassification_pl.load_from_checkpoint(
    best_model_path
)
bert_tc = model.bert_tc.cuda()

entities_list = []
entities_predicted_list = []
for sample in tqdm(dataset_test):
  text = sample['text']
  entities_predicted = predict(text, tokenizer, bert_tc)
  entities_list.append(sample['entities'])
  entities_predicted_list.append( entities_predicted )

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the m

In [17]:
print("# 正解")
print(entities_list[0])
print("# 抽出")
print(entities_predicted_list[0])

# 正解
[{'name': '神戸市立広陵小学校', 'span': [0, 9], 'type_id': 6}, {'name': '広陵少年野球部', 'span': [14, 21], 'type_id': 4}]
# 抽出
[{'name': '神戸', 'span': [0, 2], 'type_id': 5}, {'name': '市立広陵', 'span': [2, 6], 'type_id': 6}, {'name': '広陵少年野球部', 'span': [14, 21], 'type_id': 4}]


In [18]:
def evaluate_model(entities_list, entities_predicted_list, type_id=None):
    """
    正解と予測を比較し、モデルの固有表現抽出の性能を評価する。
    type_idがNoneのときは、全ての固有表現のタイプに対して評価する。
    type_idが整数を指定すると、その固有表現のタイプのIDに対して評価を行う。
    """
    num_entities = 0 # 固有表現(正解)の個数
    num_predictions = 0 # BERTにより予測された固有表現の個数
    num_correct = 0 # BERTにより予測のうち正解であった固有表現の数

    for entities, entities_predicted in zip(entities_list, entities_predicted_list):
      if type_id:
        entities = [ e for e in entities if e['type_id'] == type_id ]
        entities_predicted = [ e for e in entities_predicted if e['type_id'] == type_id]
      
      get_span_type = lambda w: (e['span'][0], e['span'][1], e['type_id'])
      set_entities = set( get_span_type(e) for e in entities)
      set_entities_predicted = set(get_span_type(e) for e in entities_predicted)

      num_entities += len(entities)
      num_predictions += len(entities_predicted)
      num_correct += len( set_entities & set_entities_predicted)

    precision = num_correct / num_predictions
    recall = num_correct / num_entities
    f_value = 2*precision*recall / (precision+recall)

    result = {
        'num_entities': num_entities,
        'num_predictions': num_predictions,
        'num_correct': num_correct,
        'precision': precision,
        'recall': recall,
        'f_value': f_value
    }

    return result

In [19]:
print( evaluate_model(entities_list, entities_predicted_list) )


{'num_entities': 2675, 'num_predictions': 2737, 'num_correct': 984, 'precision': 0.3595177201315309, 'recall': 0.36785046728971965, 'f_value': 0.36363636363636365}


In [19]:
class NER_tokenizer_BIO(BertJapaneseTokenizer):

  def __init__(self, *args, **kwargs):
    self.num_entity_type = kwargs.pop('num_entity_type') #固有表現のカテゴリーの数
    super().__init__(*args, **kwargs)

  def encode_plus_tagged(self, text, entitles, max_length):
    '''
    文章と固有表現から
    文章の符号化, タグ列の作成
    '''

    # 固有表現の前後でtextを分割し、それぞれのラベルをつけておく
    splitted = []
    position = 0
    for entity in entities:
      start = entity['span'][0]
      end = entity['span'][1]
      label = entity['type_id']
      # 固有表現ではないもの(前)
      splitted.append({'text':text[position:start], 'label':0})
      # 固有表現
      splitted.append({'text':text[start:end], 'label':label})
      position = end
    # 固有表現ではないもの（後）
    splitted.append({'text':text[position:start], 'label':0})
    splitted = [ s for s in splitted if s['text']]

    #分割されたそれぞれの文章をトーク化し、ラベルをつける
    tokens = []
    labels = []
    for s in splitted:
      token_splitted = tokenizer.tokenize(s['text'])
      label = s['label']
      if label > 0: #固有表現
        # トークン全てにl-タグ
        labels_splitted = \
          [ label + self.num_entity_type ] * len(tokens_splitted)
        # 先頭トークンB-タグ
        labels_splitted[0] = label
      else: #固有表現ではないもの
        labels_splitted = [0] * len(tokens_splitted)

    # 符号化
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    encoding = tokenizer.prepare_for_model(
        inputs_ids,
        max_length=max_length,
        padding='max_length',
        truncation=True
    )

    #ラベルに特殊トークン[CLS], [SEP], [PAD]を追加
    labels = [0] + labels[:max_length - 2] + [0]
    labels = labels + [0] * (max_length - len(labels))
    encoding['labels'] = labels

    return encoding

  def encode_plus_untagged( self, text, max_length=None, return_tensors=None):
    '''
    文章をトークン化し、それぞれのトークンの文章中の位置を特定
    '''

    tokens = [] #トークン
    tokens_original = [] #トークンに対応する文章中の文字列
    words = self.word_tokenizer.tokenize(text)
    for word in words:
      # 単語をサブワードに分割
      tokens_word = self.subword_tokenizer.tokenize(word)
      tokens.extend(tokens_word)
      if tokens_word[0] == '[UNK]': #未知語
        tokens_original.append(word)
      else: 
        tokens_orignal.extend([
            token.replace('##', '') for token in tokens_word
        ])

    #各トークンの文章中での位置
    position = 0
    spans = []
    for token in tokens_original:
      l = len(token)
      while 1:
        if token != text[position:position+l]:
          position += 1
        else:
          spans.append([position, position+l])
          position += l
          break

    #符号化
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    encoding = tokenizer.prepare_for_model(
        input_ids,
        max_length=max_length,
        padding='max_length' if max_length else False,
        truncation=True if max_length else False
    )
    sequence_length = len(encoding['input_ids'])
    # [CLS]
    spans = [[-1, -1]] + spans[ : sequence_length - 2]
    # [SEP] [PAD]
    spans = spans + [[-1, -1]] * (sequence_length - len(spans))

    if return_tensors == 'pt':
      encoding = { k: torch.tensor([v]) for k, v in encoding.items() }

    return encoding, spans

  @staticmethod
  def Viterbi(scores_bert, num_entity_type, penalty=10000):
    '''
    ビタビアルゴリズム
    '''

    m = 2*num_entity_type + 1
    penalty_matrix = np.zeros([m, m]) 
    for i in range(m):
      for j in range(1+num_entity_type, m):
        if not ( (i==j) or (i+num_entity_type == j)):
          penalty_matrix[i, j] == penalty

    path = [ [i] for i in range (m)]
    scores_path = scores_bert[0] - penalty_matrix[0, :]　
    scores_bert = scores_bert[1:] # 一番上の行を取り除く

    for scores in scores_bert:
      assert len(scores) == 2*num_entity_type + 1
      score_matrix = np.array(scores_path).reshape(-1, 1) \
                  + np.array(scores).reshape(1, -1) \
                  - penalty_matrix
      scores_path = score_matrix.max(axis=0)
      argmax = score_matrix.argmax(axis=0)
      path_new = []
      for i, idx in enumerate(argmax):
        path_new.append(path[idx] + [i])
      path = path_new

    labels_optimal = path[np.argmax(scores_path)]
    return labels_optimal

  def convert_bert_output_to_entities(self, text, scores, spans):
    '''
    文章・分類スコア(系列長・ラベル数)・各トークンの位置 ----> 固有表現
    '''

    assert len(spans) == len(scores)
    num_entity_type = self.num_entity_type

    #　特殊トークンに対応する部分を取り除く
    scores = [ score for score,span in zip(scores, spans) if span[0] != -1 ]
    spans = [ span for span in spans if span[0] != -1]

    #　Viterbiアルゴリズム
    labels = self.Viterbi(scores, num_entity_type)

    # 同じラベルが連続するトークンをまとめて、固有表現を抽出する
    entities = []
    for label, group in itertools.groupby(enumerate(labels), key=lambda x: x[1]):

      group = list(group)
      start = spans[group[0][0]][0]
      end = spans[group[-1][0]][1]

      if label != 0: #固有表現
        if 1 <= labels <= num_entity_type: # B- ----> 追加
          entity = {
              'name':text[start:end],
              'span':[start, end],
              'type':label
          }
          entities.append(entity)
        else: # l- ----> 更新
          entity['span'][1] = end
          entity['name'] = text[entity['span'][0]:entity['span'][1]]

    return entities




In [11]:
tokenizer = NER_tokenizer_BIO.from_pretrained(
    MODEL_NAME,
    num_entity_type=8
)

max_length = 128
dataset_train_for_loader = create_dataset(
    tokenizer, dataset_train, max_length
)
dataset_val_for_loader = create_dataset(
    tokenizer, dataset_val, max_length
)

dataloader_train = DataLoader(
    dataset_train_for_loader, batch_size=32, shuffle=True
)
dataset_val_for_loader = DataLoader(
    dataset_val_for_loader, batch_size=256
)

checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model_BIO/'
)

trainer = pl.Trainer(
    gpus=1,
    max_epochs=5,
    callbacks=[checkpoint]
)

num_entity_type = 8
num_labels = 2*num_entity_type + 1
model = BertForTokenClassification_pl(
    MODEL_NAME, num_labels=num_labels, lr=1e-5
)

trainer.fit(model, dataloader_train, dataloader_val)
best_model_path = checkpoint.best_model_path

model = BertForTokenClassification_pl.load_from_checkpoint(
    best_model_path
)
bert_tc = model.bert_tc.cuda()


In [1]:
entities_list = []
entities_predicted_list = []
for sample in tqdm(dataset_test):
  text = sample['text']
  encoding, spans = tokenizer.encode_plus_untagged(
      text, return_tensors='pt'
  )

  with torch.no_grad():
    output = bert_tc(**encoding)
    scores = output.logits
    scores = scores[0].cpu().numpy().tolist()

  entities_predicted = tokenizer.convert_bert_output_to_entities(
        text, scores, spans
    )

  entities_list.append(sample['entities'])
  entities_predicted_list.append(entities_predicted)

print(evaluate_model(entities_list, entities_predicted_list))
