In [26]:
from __future__ import absolute_import, division, print_function

import logging
import math
import os
import random
import tempfile
import warnings
from dataclasses import asdict
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from seqeval.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from simpletransformers.config.model_args import NERArgs
from simpletransformers.config.utils import sweep_config_to_sweep_values
from simpletransformers.ner.ner_utils import (
    InputExample,
    LazyNERDataset,
    convert_examples_to_features,
    get_examples_from_df,
    load_hf_dataset,
    read_examples_from_file,
)

from simpletransformers.config.model_args import NERArgs
from simpletransformers.config.utils import sweep_config_to_sweep_values
from simpletransformers.ner.ner_utils import (
    InputExample,
    LazyNERDataset,
    convert_examples_to_features,
    get_examples_from_df,
    load_hf_dataset,
    read_examples_from_file,
)
from tensorboardX import SummaryWriter
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm.auto import tqdm, trange
from transformers import (
    AlbertConfig,
    AlbertForTokenClassification,
    AlbertTokenizer,
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertConfig,
    BertForTokenClassification,
    BertTokenizer,
    BertweetTokenizer,
    BigBirdConfig,
    BigBirdForTokenClassification,
    BigBirdTokenizer,
    CamembertConfig,
    CamembertForTokenClassification,
    CamembertTokenizer,
    DebertaConfig,
    DebertaForTokenClassification,
    DebertaTokenizer,
    DebertaV2Config,
    DebertaV2ForTokenClassification,
    DebertaV2Tokenizer,
    DistilBertConfig,
    DistilBertForTokenClassification,
    DistilBertTokenizer,
    ElectraConfig,
    ElectraForTokenClassification,
    ElectraTokenizer,
    LayoutLMConfig,
    LayoutLMForTokenClassification,
    LayoutLMTokenizer,
    LongformerConfig,
    LongformerForTokenClassification,
    LongformerTokenizer,
    MPNetConfig,
    MPNetForTokenClassification,
    MPNetTokenizer,
    MobileBertConfig,
    MobileBertForTokenClassification,
    MobileBertTokenizer,
    RobertaConfig,
    RobertaForTokenClassification,
    RobertaTokenizerFast,
    SqueezeBertConfig,
    SqueezeBertForTokenClassification,
    SqueezeBertTokenizer,
    XLMConfig,
    XLMForTokenClassification,
    XLMTokenizer,
    XLMRobertaConfig,
    XLMRobertaForTokenClassification,
    XLMRobertaTokenizer,
    XLNetConfig,
    XLNetForTokenClassification,
    XLNetTokenizerFast,
)
from transformers.convert_graph_to_onnx import convert, quantize
from transformers.optimization import AdamW, Adafactor
from transformers.optimization import (
    get_constant_schedule,
    get_constant_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_polynomial_decay_schedule_with_warmup,
)


In [8]:
import time
time1 = time.time()

from simpletransformers.ner import NERModel
# Create a NERModel
#model = NERModel('bert', 'bert-base-cased')
model = NERModel('bert', 'dslim/bert-base-NER', args={
    'learning_rate': 2e-5,
    'overwrite_output_dir': True,
    'reprocess_input_data': True,
    'num_train_epochs': 1,
    "train_batch_size": 150})

print("It takes %2.2f seconds"%(time.time() - time1))
origin_model = model

It takes 6.55 seconds


In [19]:
model = origin_model

In [21]:
to_predict = ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.',]

In [55]:


split_on_space=True
device = origin_model.device
args = origin_model.args
pad_token_label_id = origin_model.pad_token_label_id
model = origin_model.model


preds = None

if split_on_space:
    if origin_model.args.model_type == "layoutlm":
        predict_examples = [
            InputExample(
                i,
                sentence.split(),
                [origin_model.args.labels_list[0] for word in sentence.split()],
                x0,
                y0,
                x1,
                y1,
            )
            for i, (sentence, x0, y0, x1, y1) in enumerate(to_predict)
        ]
        to_predict = [sentence for sentence, *_ in to_predict]
    else:
        predict_examples = [
            InputExample(
                i,
                sentence.split(),
                [origin_model.args.labels_list[0] for word in sentence.split()],
            )
            for i, sentence in enumerate(to_predict)
        ]
else:
    if origin_model.args.model_type == "layoutlm":
        predict_examples = [
            InputExample(
                i,
                sentence,
                [origin_model.args.labels_list[0] for word in sentence],
                x0,
                y0,
                x1,
                y1,
            )
            for i, (sentence, x0, y0, x1, y1) in enumerate(to_predict)
        ]
        to_predict = [sentence for sentence, *_ in to_predict]
    else:
        predict_examples = [
            InputExample(
                i, sentence, [origin_model.args.labels_list[0] for word in sentence]
            )
            for i, sentence in enumerate(to_predict)
        ]

if origin_model.args.onnx:

    # Encode
    model_inputs = origin_model.tokenizer.batch_encode_plus(
        to_predict, return_tensors="pt", padding=True, truncation=True
    )

    # Change shape for batching
    encoded_model_inputs = []
    if origin_model.args.model_type in ["bert", "xlnet", "albert", "layoutlm"]:
        for (input_ids, attention_mask, token_type_ids) in tqdm(
            zip(
                model_inputs["input_ids"],
                model_inputs["attention_mask"],
                model_inputs["token_type_ids"],
            )
        ):
            encoded_model_inputs.append(
                (input_ids, attention_mask, token_type_ids)
            )
    else:
        for (input_ids, attention_mask) in tqdm(
            zip(model_inputs["input_ids"], model_inputs["attention_mask"])
        ):
            encoded_model_inputs.append((input_ids, attention_mask))

    # Setup batches
    eval_sampler = SequentialSampler(encoded_model_inputs)
    eval_dataloader = DataLoader(
        encoded_model_inputs,
        sampler=eval_sampler,
        batch_size=args.eval_batch_size,
    )
    for batch in tqdm(
        eval_dataloader, disable=args.silent, desc="Running Prediction"
    ):
        if origin_model.args.model_type in ["bert", "xlnet", "albert", "layoutlm"]:
            inputs_onnx = {
                "input_ids": batch[0].detach().cpu().numpy(),
                "attention_mask": batch[1].detach().cpu().numpy(),
                "token_type_ids": batch[2].detach().cpu().numpy(),
            }
        else:
            inputs_onnx = {
                "input_ids": batch[0].detach().cpu().numpy(),
                "attention_mask": batch[1].detach().cpu().numpy(),
            }

        # Run the model (None = get all the outputs)
        output = origin_model.model.run(None, inputs_onnx)

        if preds is None:
            preds = output[0]
            out_input_ids = inputs_onnx["input_ids"]
            out_attention_mask = inputs_onnx["attention_mask"]
        else:
            preds = np.append(preds, output[0], axis=0)
            out_input_ids = np.append(
                out_input_ids, inputs_onnx["input_ids"], axis=0
            )
            out_attention_mask = np.append(
                out_attention_mask, inputs_onnx["attention_mask"], axis=0
            )
    out_label_ids = np.zeros_like(out_input_ids)
    for index in range(len(out_label_ids)):
        out_label_ids[index][0] = -100
        out_label_ids[index][-1] = -100
else:

    eval_dataset = origin_model.load_and_cache_examples(
        None, to_predict=predict_examples
    )
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
    )

    origin_model._move_model_to_device()

    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if origin_model.args.fp16:
        from torch.cuda import amp

    for batch in tqdm(
        eval_dataloader, disable=args.silent, desc="Running Prediction"
    ):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = origin_model._get_inputs_dict(batch)

            if origin_model.args.fp16:
                with amp.autocast():
                    outputs = model(**inputs)
                    tmp_eval_loss, logits = outputs[:2]
            else:
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

            if origin_model.args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean()
            eval_loss += tmp_eval_loss.item()

        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
            out_input_ids = inputs["input_ids"].detach().cpu().numpy()
            out_attention_mask = inputs["attention_mask"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(
                out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0
            )
            out_input_ids = np.append(
                out_input_ids,
                inputs["input_ids"].detach().cpu().numpy(),
                axis=0,
            )
            out_attention_mask = np.append(
                out_attention_mask,
                inputs["attention_mask"].detach().cpu().numpy(),
                axis=0,
            )

    eval_loss = eval_loss / nb_eval_steps
token_logits = preds
preds = np.argmax(preds, axis=2)
return 0
label_map = {i: label for i, label in enumerate(origin_model.args.labels_list)}

out_label_list = [[] for _ in range(out_label_ids.shape[0])]
preds_list = [[] for _ in range(out_label_ids.shape[0])]

for i in range(out_label_ids.shape[0]):
    for j in range(out_label_ids.shape[1]):
        if out_label_ids[i, j] != pad_token_label_id:
            out_label_list[i].append(label_map[out_label_ids[i][j]])
            preds_list[i].append(label_map[preds[i][j]])

if split_on_space:
    preds = [
        [
            {word: preds_list[i][j]}
            for j, word in enumerate(sentence.split()[: len(preds_list[i])])
        ]
        for i, sentence in enumerate(to_predict)
    ]
else:
    preds = [
        [
            {word: preds_list[i][j]}
            for j, word in enumerate(sentence[: len(preds_list[i])])
        ]
        for i, sentence in enumerate(to_predict)
    ]

word_tokens = []
for n, sentence in enumerate(to_predict):
    w_log = origin_model._convert_tokens_to_word_logits(
        out_input_ids[n],
        out_label_ids[n],
        out_attention_mask[n],
        token_logits[n],
    )
    word_tokens.append(w_log)

if split_on_space:
    model_outputs = [
        [
            {word: word_tokens[i][j]}
            for j, word in enumerate(sentence.split()[: len(preds_list[i])])
        ]
        for i, sentence in enumerate(to_predict)
    ]
else:
    model_outputs = [
        [
            {word: word_tokens[i][j]}
            for j, word in enumerate(sentence[: len(preds_list[i])])
        ]
        for i, sentence in enumerate(to_predict)
    ]

out = preds, model_outputs

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=2.0, style=ProgressStyle(descrip…




SyntaxError: 'return' outside function (<ipython-input-55-8bb2329bb073>, line 194)

In [72]:
token_logits.shape

(9, 128, 9)

In [59]:
token_logits.shape

(9, 128, 9)

In [68]:
token_logits[0]

array([[ 6.49  , -0.4907, -0.1362, ..., -0.931 , -1.937 , -0.774 ],
       [-0.602 ,  1.201 , -1.62  , ..., -1.763 ,  1.403 , -2.658 ],
       [ 1.371 ,  0.6255,  0.287 , ...,  0.3516, -0.6704, -0.8145],
       ...,
       [ 2.459 ,  0.2203, -1.231 , ..., -1.017 ,  0.2279, -2.223 ],
       [ 2.377 ,  0.3005, -1.313 , ..., -1.168 ,  0.373 , -2.236 ],
       [ 2.316 ,  0.3474, -1.354 , ..., -1.301 ,  0.4443, -2.197 ]],
      dtype=float16)

In [63]:
preds[0]

array([0, 5, 0, 5, 5, 0, 0, 5, 5, 5, 0, 5, 5, 0, 0, 5, 5, 0, 0, 5, 5, 0,
       0, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 0, 0, 0, 5, 0, 0, 5, 5, 5, 5, 5,
       5, 0, 0, 0, 5, 0, 0, 5, 5, 0, 0, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 0,
       0, 0, 5, 0, 0, 5, 5, 5, 0, 5, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0, 0,
       5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 0, 0, 0, 5, 0, 0, 5, 5, 5, 5, 5, 5,
       5, 5, 0, 5, 5, 0, 0, 0, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5])

In [50]:
preds_list = [[] for _ in range(out_label_ids.shape[0])]

In [69]:
for i in range(out_label_ids.shape[0]):
    for j in range(out_label_ids.shape[1]):
        if out_label_ids[i, j] != pad_token_label_id:
            out_label_list[i].append(label_map[out_label_ids[i][j]])
            preds_list[i].append(label_map[preds[i][j]])

In [70]:
preds_list

[['B-ORG'], ['O'], ['B-MISC'], ['O'], ['O'], ['O'], ['B-MISC'], ['O'], ['O']]

In [43]:
out_label_ids

array([[-100,    0, -100, ..., -100, -100, -100],
       [-100,    0, -100, ..., -100, -100, -100],
       [-100,    0, -100, ..., -100, -100, -100],
       ...,
       [-100,    0, -100, ..., -100, -100, -100],
       [-100,    0, -100, ..., -100, -100, -100],
       [-100,    0, -100, ..., -100, -100, -100]])

In [40]:
out_label_ids.shape[1]

128

[[], [], [], [], [], [], [], [], []]

In [36]:
label_map

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [35]:
preds_list

[['B-ORG'], ['O'], ['B-MISC'], ['O'], ['O'], ['O'], ['B-MISC'], ['O'], ['O']]

In [30]:
preds

[[{'EU': 'B-ORG'}],
 [{'rejects': 'O'}],
 [{'German': 'B-MISC'}],
 [{'call': 'O'}],
 [{'to': 'O'}],
 [{'boycott': 'O'}],
 [{'British': 'B-MISC'}],
 [{'lamb': 'O'}],
 [{'.': 'O'}]]

In [29]:
out

([[{'EU': 'B-ORG'}],
  [{'rejects': 'O'}],
  [{'German': 'B-MISC'}],
  [{'call': 'O'}],
  [{'to': 'O'}],
  [{'boycott': 'O'}],
  [{'British': 'B-MISC'}],
  [{'lamb': 'O'}],
  [{'.': 'O'}]],
 [[{'EU': [[-0.602,
      1.201,
      -1.62,
      -1.256,
      -2.738,
      7.86,
      -1.763,
      1.403,
      -2.658]]}],
  [{'rejects': [[10.41,
      -1.088,
      -1.182,
      -1.353,
      -2.336,
      -0.596,
      -1.522,
      -1.652,
      -1.679]]}],
  [{'German': [[-0.4705,
      9.375,
      1.095,
      -1.97,
      -1.378,
      -1.261,
      -2.059,
      -1.479,
      -1.738]]}],
  [{'call': [[8.25,
      -0.9697,
      -2.928,
      -0.985,
      -2.43,
      -0.2115,
      -1.676,
      0.2286,
      -2.107]]}],
  [{'to': [[9.4,
      -1.037,
      -1.727,
      -1.475,
      -2.574,
      -0.5625,
      -1.587,
      -1.129,
      -1.731]]}],
  [{'boycott': [[9.74,
      -0.221,
      -1.785,
      -1.018,
      -3.016,
      -0.1761,
      -1.995,
      -0.8857,
      -