In [None]:
!pip install datasets evaluate seqeval optimum onnx onnxruntime transformers -qU

# Load dataset

In [2]:
import torch
import os
from datasets import load_dataset

In [None]:
# Load the CoNLL-2003 dataset by name. trust_remote_code=True opts in to
# running the dataset's own loading script on this machine.
raw_datasets = load_dataset("conll2003", trust_remote_code=True)

In [4]:
# Show the available splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

# Prepare data

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier reused below for the tokenizer, the PyTorch model and
# the ONNX export.
model_checkpoint = "dslim/bert-large-NER"
# Tokenizer matching the checkpoint; tokenize_and_align_labels calls its
# word_ids() to map sub-word tokens back to dataset words.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per token.

    Args:
        labels: Tag ids, one per word of the sentence.
        word_ids: For each token, the index of the word it belongs to, or
            ``None`` for special tokens.

    Returns:
        A list with one entry per token: ``-100`` for special tokens, the
        word's tag for the first token of a word, and for every further token
        of the same word the word's tag with odd ids bumped by one (in this
        tag scheme odd ids are ``B-XXX`` and the next id is the matching
        ``I-XXX``).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: ignored by the loss / metric.
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[word_id]
        # Continuation of the previous word: turn B-XXX into I-XXX.
        if word_id == previous_word and tag % 2 == 1:
            tag += 1
        previous_word = word_id
        aligned.append(tag)

    return aligned

In [7]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: A batch with a ``"tokens"`` column (lists of words) and a
            ``"ner_tags"`` column (lists of tag ids, one per word).

    Returns:
        The tokenizer output for the batch with an extra ``"labels"`` entry
        holding, per sentence, the tags aligned to the produced tokens.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(idx) maps each token of sentence `idx` back to its source word.
    encoded["labels"] = [
        align_labels_with_tokens(word_tags, encoded.word_ids(idx))
        for idx, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Tokenize every split in batches. The original columns are dropped so that
# only what tokenize_and_align_labels returns (tokenizer outputs + "labels")
# remains in the mapped datasets.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [9]:
# Feature descriptor of the "ner_tags" column; displayed to inspect the
# dataset's tag names.
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [10]:
# Dataset-side tag names; postprocess uses this list to decode reference
# label ids.
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [11]:
from transformers import DataCollatorForTokenClassification

# Collate function used by the evaluation DataLoader to assemble batches of
# tokenized examples (presumably padding inputs and labels to a common
# length -- see the transformers documentation).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [12]:
from torch.utils.data import DataLoader

# Batches of 8 examples from the tokenized validation split, assembled by
# data_collator.
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [13]:
# Mapping used by postprocess to decode *predicted* class ids. Its order
# differs from the dataset's tag order (label_names), so predictions and
# references must each be decoded with their own table.
# NOTE(review): presumably mirrors the checkpoint's own id2label -- confirm.
id2label = dict(
    enumerate(
        [
            "O",
            "B-MISC",
            "I-MISC",
            "B-PER",
            "I-PER",
            "B-ORG",
            "I-ORG",
            "B-LOC",
            "I-LOC",
        ]
    )
)

In [14]:
def postprocess(predictions, labels):
    """Turn batched id tensors into lists of tag strings, dropping ignored positions.

    Args:
        predictions: Tensor of predicted class ids, one row per sequence.
        labels: Tensor of reference ids with the same layout; positions equal
            to ``-100`` are ignored in both outputs.

    Returns:
        A tuple ``(true_labels, true_predictions)`` -- references first,
        predictions second. References are decoded with ``label_names`` and
        predictions with ``id2label``.
    """
    pred_ids = predictions.detach().cpu().clone().numpy()
    gold_ids = labels.detach().cpu().clone().numpy()

    # Reference tags, skipping the ignored index (special tokens).
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([label_names[g] for g in gold_row if g != -100])

    # Predicted tags at the same kept positions.
    true_predictions = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [id2label[p] for (p, g) in zip(pred_row, gold_row) if g != -100]
        true_predictions.append(kept)

    return true_labels, true_predictions

# Load model

In [None]:
from transformers import AutoModelForTokenClassification

# Load the token-classification model for the checkpoint and place it on the
# CPU.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    device_map='cpu'
)

In [16]:
def print_size_of_model(model):
    """Print the on-disk size, in megabytes, of ``model``'s serialized state dict.

    The state dict is saved into a private temporary directory that is removed
    on exit, even if saving or measuring fails. This avoids overwriting and
    then deleting an unrelated ``temp.p`` in the current working directory.

    Args:
        model: Any object exposing ``state_dict()`` whose result can be
            serialized with ``torch.save``.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Keep the original base name so the serialized archive is unchanged.
        tmp_path = os.path.join(tmp_dir, "temp.p")
        torch.save(model.state_dict(), tmp_path)
        print('Size (MB):', os.path.getsize(tmp_path)/1e6)

In [17]:
# Serialized size of the PyTorch model's weights.
print_size_of_model(model)

Size (MB): 1330.283203


## ONNX

In [18]:
from optimum.onnxruntime import ORTModelForTokenClassification

# Directory that receives the exported ONNX model and the tokenizer files; it
# is read back in the inference-timing section.
onnx_model_path = './model_onnx'

# export=True asks optimum to convert the checkpoint to ONNX while loading it.
ort_model = ORTModelForTokenClassification.from_pretrained(
    model_checkpoint,
    export=True,
)

# Store model and tokenizer side by side so the directory can be reloaded on
# its own.
ort_model.save_pretrained(onnx_model_path)
tokenizer.save_pretrained(onnx_model_path)

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


('./model_onnx/tokenizer_config.json',
 './model_onnx/special_tokens_map.json',
 './model_onnx/vocab.txt',
 './model_onnx/added_tokens.json',
 './model_onnx/tokenizer.json')

# Evaluate

In [None]:
import evaluate

# seqeval metric; fed batch by batch in the evaluation loop and finalized
# with compute().
metric = evaluate.load("seqeval")

In [None]:
import torch
from tqdm import tqdm

# Run every batch on the device the model actually lives on. Hard-coding
# 'cuda' fails with a device mismatch because the model is loaded on the CPU.
device = model.device

for batch in tqdm(eval_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Highest-scoring class id for every token.
    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    # postprocess returns (references, predictions) in that order; unpacking
    # them the other way round feeds the metric swapped inputs, which swaps
    # the reported precision and recall.
    true_labels, true_predictions = postprocess(predictions, labels)
    metric.add_batch(predictions=true_predictions, references=true_labels)

results = metric.compute()

In [None]:
# Overall F1 across all entity types on the validation split.
results['overall_f1']

np.float64(0.8324359077392911)

# Measure inference time

In [21]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
# NOTE: from here on `tokenizer` and `model` are rebound to the artifacts
# saved in onnx_model_path; re-running earlier cells afterwards will use the
# ONNX model instead of the PyTorch one.
tokenizer = AutoTokenizer.from_pretrained(onnx_model_path)
# To time the PyTorch baseline instead, load the checkpoint with
# AutoModelForTokenClassification (device_map='cpu') in place of the ORT model.
model = ORTModelForTokenClassification.from_pretrained(onnx_model_path, file_name='model.onnx', provider='CPUExecutionProvider')

# NER pipeline; aggregation_strategy="simple" yields grouped entities
# ('entity_group' entries) rather than one result per token.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cpu


In [38]:
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock and can jump when the
# system time is adjusted. Differences are still expressed in seconds.
# NOTE: this times a single call, so one-off warm-up costs of the first
# inference are included in the measurement.
start = time.perf_counter()
ner_results = nlp(' '.join(raw_datasets["test"][3]["tokens"]))
end = time.perf_counter()
print(ner_results)

[{'entity_group': 'LOC', 'score': np.float32(0.99978346), 'word': 'Japan', 'start': 0, 'end': 5}, {'entity_group': 'MISC', 'score': np.float32(0.99625134), 'word': 'Asian Cup', 'start': 33, 'end': 42}, {'entity_group': 'LOC', 'score': np.float32(0.99968433), 'word': 'Syria', 'start': 78, 'end': 83}, {'entity_group': 'MISC', 'score': np.float32(0.8476639), 'word': 'Group C', 'start': 89, 'end': 96}]


In [39]:
# `end - start` is in seconds; scale to milliseconds for display.
print(f'Inference time: {1000*(end - start)} ms')

Inference time: 347.9924201965332 ms
