In [2]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AutoTokenizer, AutoModel
from transformers import AutoModelForTokenClassification

from onnxruntime.quantization import quantize_dynamic, QuantType
import torch.nn.functional as F
from pathlib import Path
import onnxruntime
import numpy as np
import torch
import onnx
import time
import os
import shutil

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Identifier handed to AutoTokenizer.from_pretrained in the tokenizer cell.
model_type = "main_ner"
# Path of the ONNX model file opened by onnxruntime.InferenceSession.
model_quant = "optimized_main_ner/model.onnx"

In [7]:
# Load the tokenizer identified by `model_type`; later cells reference this
# module-level `tokenizer` for encoding text and decoding token ids.
tokenizer = AutoTokenizer.from_pretrained(model_type) 

In [15]:
# Write the tokenizer files into ./tokenizer_onnx; the returned tuple of
# written file paths is what the cell displays.
tokenizer.save_pretrained('tokenizer_onnx')

('tokenizer_onnx/tokenizer_config.json',
 'tokenizer_onnx/special_tokens_map.json',
 'tokenizer_onnx/sentencepiece.bpe.model',
 'tokenizer_onnx/added_tokens.json',
 'tokenizer_onnx/tokenizer.json')

In [16]:
from transformers import pipeline
import json
from tqdm import tqdm
import pandas as pd
# Read the model config inside a context manager so the file handle is closed
# as soon as parsing finishes (a bare `json.load(open(...))` never closes it
# explicitly).
with open("main_ner_wlang/config.json", encoding="utf-8") as f:
    config = json.load(f)

# Keep a copy of the config next to the notebook. `ensure_ascii=False` writes
# non-ASCII characters verbatim, so the file encoding is pinned to UTF-8
# instead of depending on the platform default.
with open('config.json', 'w', encoding="utf-8") as f:
    json.dump(config, f, ensure_ascii=False)

# Class-id -> label-name mapping; later cells index it with `str(class_id)`.
id2label = config['id2label']

In [38]:
import numpy as np
import torch
import torch.nn.functional as F

def run_ner_inference(input_text):
    """Tokenize ``input_text``, run the ONNX session and return per-token probabilities.

    Relies on the module-level ``tokenizer`` and ``ort_session`` objects.

    Args:
        input_text: Text handed to the tokenizer.

    Returns:
        A ``(probabilities, input_ids)`` tuple of NumPy arrays: the softmax of
        the session's ``logits`` output taken over the last axis, and the
        token ids that were fed to the model.
    """
    encoding = tokenizer(
        input_text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        add_special_tokens=True,
    )
    # The session is fed NumPy arrays, so the PyTorch tensors are converted first.
    feeds = {name: encoding[name].numpy() for name in ("input_ids", "attention_mask")}

    raw_logits = ort_session.run(['logits'], feeds)[0]
    token_probs = F.softmax(torch.from_numpy(raw_logits), dim=-1)
    return token_probs.numpy(), feeds["input_ids"]

# Open the ONNX model on the CPU provider; `run_ner_inference` reads this
# module-level session.
ort_session = onnxruntime.InferenceSession(model_quant, providers=["CPUExecutionProvider"])
# Short sample used to smoke-test the pipeline.
input_text = "100 тг" 
probabilities, input_ids = run_ner_inference(input_text)

# Index of the highest-probability class along the last axis.
predicted_classes = np.argmax(probabilities, axis=-1)

# `config` is the dict loaded from main_ner_wlang/config.json in an earlier cell.
id2label = config['id2label']

# id2label is indexed with string keys, hence str(); only row 0 is decoded.
predicted_labels = [id2label[str(class_id)] for class_id in predicted_classes[0]]

# Token strings for row 0, positionally paired with `predicted_labels`.
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])


In [39]:
def clean_and_combine_tokens(tokens, labels):
    """Merge SentencePiece pieces back into words and pick one label per word.

    A piece starting with "▁" opens a new word; any other piece is appended to
    the word currently being built. A word takes the label of its first piece,
    unless that label is "O", in which case the first non-"O" label among its
    following pieces is used.

    Args:
        tokens: Piece strings, e.g. from ``tokenizer.convert_ids_to_tokens``.
        labels: One label per piece, aligned with ``tokens``.

    Returns:
        ``(cleaned_tokens, cleaned_labels)``: two lists of equal length holding
        the rebuilt words and their labels.
    """
    # Special pieces carry no text. "<pad>" is listed so padded positions, if
    # present, are not glued onto the last word (assumes the padding token is
    # spelled "<pad>" like the other specials handled here - TODO confirm).
    special_tokens = {"<s>", "</s>", "<unk>", "<pad>"}

    cleaned_tokens = []
    cleaned_labels = []
    current_word = ""
    current_label = None

    for token, label in zip(tokens, labels):
        if token in special_tokens:
            continue

        if token.startswith("▁"):
            # A new word begins: flush the one collected so far.
            if current_word:
                cleaned_tokens.append(current_word)
                cleaned_labels.append(current_label)
            current_word = token[1:]
            current_label = label
        else:
            current_word += token
            # `None` means the sequence began with a continuation piece, so no
            # label has been recorded yet; without this check the word would
            # be emitted with a `None` label.
            if current_label is None or current_label == "O":
                current_label = label

    if current_word:
        cleaned_tokens.append(current_word)
        cleaned_labels.append(current_label)

    return cleaned_tokens, cleaned_labels
# Merge the pieces of the sample sentence into words and print one
# "token / label" line per rebuilt word.
clean_tokens, clean_labels = clean_and_combine_tokens(tokens, predicted_labels)
for token, label in zip(clean_tokens, clean_labels):
    print(f"token: {token} \t label: {label}")

token: 100 	 label: B-MONEY
token: тг 	 label: I-MONEY


In [34]:
import evaluate
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset


# Load the dataset through the local `kaznerd.py` loading script.
# NOTE(review): this relies on script-based loading; confirm the installed
# `datasets` version still supports it.
datasets = load_dataset("kaznerd.py")
# Tag names, indexed by the integer ids stored in each example's "ner_tags".
label_list = datasets["train"].features["ner_tags"].feature.names


# Sanity check on the first training example: tokenize its pre-split words and
# look at the resulting pieces and their word alignment.
example_text = datasets['train'][0]
tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids = tokenized_input.word_ids()

# Names this evaluation depends on, all defined in other code of this notebook:
#   datasets, label_list - result of load_dataset("kaznerd.py") and its tag names
#   id2label             - the 'id2label' entry of the model config
#   tokenizer            - loaded with AutoTokenizer.from_pretrained
#   ort_session          - the onnxruntime.InferenceSession for the ONNX model

# seqeval metric object; its compute() is called on the collected predictions.
metric = evaluate.load("seqeval")

def run_ner_inference(input_text, ort_session, tokenizer):
    """Tokenize ``input_text``, run the given ONNX session and return per-token probabilities.

    Args:
        input_text: Text handed to ``tokenizer``.
        ort_session: Object whose ``run(output_names, feeds)`` returns a list
            with the ``logits`` array first.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` PyTorch tensors.

    Returns:
        A ``(probabilities, input_ids)`` tuple of NumPy arrays: the softmax of
        the logits over the last axis, and the token ids fed to the model.
    """
    encoding = tokenizer(
        input_text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        add_special_tokens=True,
    )
    # The session is fed NumPy arrays, so the PyTorch tensors are converted first.
    feeds = {name: encoding[name].numpy() for name in ("input_ids", "attention_mask")}

    raw_logits = ort_session.run(['logits'], feeds)[0]
    token_probs = F.softmax(torch.from_numpy(raw_logits), dim=-1)
    return token_probs.numpy(), feeds["input_ids"]

def clean_and_combine_tokens(tokens, labels):
    """Rebuild whole words from SentencePiece pieces, with one label per word.

    Pieces beginning with "▁" start a new word; every other piece extends the
    word in progress. The word's label is the label of its first piece; if
    that label is "O", the first non-"O" label of a later piece of the same
    word replaces it.

    Args:
        tokens: Piece strings aligned with ``labels``.
        labels: Predicted label for each piece.

    Returns:
        A tuple ``(cleaned_tokens, cleaned_labels)`` of two equally long lists.
    """
    # Pieces that never contribute text. "<pad>" is skipped too so padding, if
    # any, is not appended to the final word (assumes the padding token is
    # spelled "<pad>" - TODO confirm against the tokenizer).
    skipped_pieces = {"<s>", "</s>", "<unk>", "<pad>"}

    cleaned_tokens = []
    cleaned_labels = []
    current_word = ""
    current_label = None

    for token, label in zip(tokens, labels):
        if token in skipped_pieces:
            continue

        if token.startswith("▁"):  # SentencePiece word-boundary marker
            if current_word:
                cleaned_tokens.append(current_word)
                cleaned_labels.append(current_label)
            current_word = token[1:]  # drop the marker character
            current_label = label
            continue

        current_word += token
        # Adopt this piece's label when the word has none yet (the sequence
        # started with a continuation piece) or is still "O". Without the
        # `None` check such a word would be emitted with a `None` label.
        if current_label is None or current_label == "O":
            current_label = label

    if current_word:
        cleaned_tokens.append(current_word)
        cleaned_labels.append(current_label)

    return cleaned_tokens, cleaned_labels

# The evaluation loop iterates over the "test" split of the loaded dataset.
test_dataset = datasets["test"]

In [23]:
from tqdm import tqdm

In [35]:
all_predictions = []
all_references = []
# Examples whose rebuilt word count differs from the reference cannot be
# scored and are left out; count them so the metrics below can be read in
# context instead of silently describing a subset.
skipped_examples = 0

for example in tqdm(test_dataset, total=len(test_dataset)):
    original_tokens = example["tokens"]
    original_label_ids = example["ner_tags"]
    true_labels = [label_list[i] for i in original_label_ids]

    # The model receives the plain sentence; words are rebuilt from the
    # pieces afterwards by clean_and_combine_tokens.
    input_text = " ".join(original_tokens)

    probabilities, input_ids = run_ner_inference(input_text, ort_session, tokenizer)
    predicted_classes = np.argmax(probabilities, axis=-1)  # shape: (batch_size=1, seq_len)
    predicted_labels = [id2label[str(class_id)] for class_id in predicted_classes[0]]

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    clean_tokens, clean_preds = clean_and_combine_tokens(tokens, predicted_labels)

    # seqeval needs prediction and reference sequences of equal length.
    if len(clean_preds) != len(original_tokens):
        skipped_examples += 1
        continue

    all_predictions.append(clean_preds)
    all_references.append(true_labels)

print(
    f"Scored {len(all_predictions)} of {len(test_dataset)} examples "
    f"({skipped_examples} skipped: word count mismatch)"
)

# Compute seqeval metrics
results = metric.compute(predictions=all_predictions, references=all_references)
print("Evaluation on test dataset:")
for key, value in results.items():
    if key.startswith("overall_"):
        print(f"{key}: {value:.4f}")

100%|██████████| 11307/11307 [04:09<00:00, 45.31it/s]


Evaluation on test dataset:
overall_precision: 0.9600
overall_recall: 0.9696
overall_f1: 0.9648
overall_accuracy: 0.9918
