In [1]:
import os
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset, load_metric
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, pipeline

In [2]:
# Load the "plus" configuration of the clinc_oos dataset; the result is indexed by split name below.
clinc = load_dataset("clinc_oos", "plus")

Found cached dataset clinc_oos (/root/.cache/huggingface/datasets/clinc_oos/plus/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Take the first training example and show its raw fields.
sample = clinc["train"][0]
print(sample)

# Each dataset item contains a text and the intent that corresponds to it.

{'text': 'what expression would i use to say i love you if i were an italian', 'intent': 61}


Намерения представлены в виде числовых идентификаторов, но по идентификатору легко получить название намерения, вызвав метод int2str (обратное преобразование выполняет метод str2int):

In [4]:
# The "intent" feature of the train split maps integer ids to intent names.
intents = clinc["train"].features["intent"]
# Convert the sample's integer intent id to its string name.
intent = intents.int2str(sample["intent"])
print(intent)

translate


In [5]:
# Accuracy metric object; used by compute_metrics and show_accuracy in this notebook.
accuracy_score = load_metric("accuracy")


def compute_metrics(pred):
  """Compute accuracy for a batch of model outputs.

  Args:
    pred: A pair ``(predictions, labels)``. ``predictions`` is reduced with
      ``np.argmax`` over axis 1 to obtain one predicted class id per row;
      ``labels`` holds the reference class ids.

  Returns:
    Whatever ``accuracy_score.compute`` returns for the predicted ids and
    the reference labels.
  """
  predictions, labels = pred
  # Collapse per-class scores to the id of the highest-scoring class.
  predictions = np.argmax(predictions, axis=1)
  return accuracy_score.compute(predictions=predictions, references=labels)


In [6]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Number of intent classes, taken from the dataset's label feature.
num_labels = intents.num_classes

# Load the fine-tuned checkpoint, then move it to the selected device.
checkpoint = "transformersbook/bert-base-uncased-finetuned-clinc"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
model = model.to(device)

In [8]:
# Report the parameter count of the baseline model for later size comparisons.
print("Base Model: ", model.num_parameters())

Base Model:  109598359


In [9]:
# Fetch a single training row by index. Row-first access reads only that
# example, whereas clinc['train']['text'][101] builds the whole column as a
# Python list just to pick one element (and the original did so three times).
sample_row = clinc['train'][101]
sample_input = sample_row['text']

print(sample_input)
print(sample_row['intent'])

complete a transaction from savings to checking of $20000
133


In [10]:
# Derive the pipeline device from `device` (selected when the model was
# loaded) instead of hard-coding GPU 0, so this cell also runs on CPU-only
# hosts. In transformers pipelines an integer -1 selects the CPU.
pipeline_device = 0 if device.type == "cuda" else -1
pipe = pipeline("text-classification", model=model, tokenizer='bert-base-uncased', device=pipeline_device)

# Label <-> id mappings from the model config; label2id is used by show_accuracy.
id2label = pipe.model.config.id2label
label2id = pipe.model.config.label2id

In [12]:
# Warm-up: 10 untimed requests before the measured loop.
for _ in range(10):
  _ = pipe(sample_input)

# Timed section: 100 sequential single-sample requests.
# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time and can be adjusted while the loop runs.
start = time.perf_counter()
for _ in range(100):
  _ = pipe(sample_input)
total_time_model = time.perf_counter() - start
print("Общее время обработки 100 запросов базовой моделью:", total_time_model)

Общее время обработки 100 запросов базовой моделью: 0.6840972900390625


In [52]:
from typing import Callable
from tqdm.notebook import tqdm


# Evaluation subset: every 50th example of the test split (texts and their
# intent ids) — presumably subsampled to keep the accuracy check quick.
data_test_X = clinc['test']['text'][::50]
data_test_y = clinc['test']['intent'][::50]

def show_accuracy(model: Callable):
    """Print the accuracy of a text-classification callable on the test subset.

    Args:
        model: Callable invoked with one text at a time. The first element of
            its result must expose the predicted label under the ``'label'``
            key; that label is converted to an id through ``label2id``.

    The predictions are scored against ``data_test_y`` with
    ``accuracy_score.compute`` and the resulting metrics are printed.
    """
    predicted_ids = [
        label2id[model(text)[0]['label']]
        for text in tqdm(data_test_X)
    ]

    print(accuracy_score.compute(predictions=predicted_ids, references=data_test_y))

In [53]:
# Baseline accuracy of the PyTorch pipeline on the test subset.
show_accuracy(pipe)

  0%|          | 0/110 [00:00<?, ?it/s]

{'accuracy': 0.8363636363636363}


In [14]:
# Save the model to a local directory.
# NOTE(review): this local path is the same string as the Hub id stored in
# `checkpoint`. Once the directory exists, later from_pretrained(checkpoint)
# calls may resolve to it instead of the Hub — confirm this is intended, and
# that the tokenizer files needed later are available there.
model.save_pretrained("transformersbook/bert-base-uncased-finetuned-clinc")

In [30]:
from optimum.pipelines import pipeline
from optimum.onnxruntime import ORTOptimizer, ORTModelForSequenceClassification
from optimum.onnxruntime.configuration import OptimizationConfig

In [82]:
import onnxruntime

# Session options are built first: they do not depend on any model file and
# are passed to ORTModelForSequenceClassification.from_pretrained later on.
# Severity level 0 is the most verbose ONNX Runtime log level.
session_options = onnxruntime.SessionOptions()
session_options.log_severity_level = 0

# The optimized ONNX file is only produced by the ORTOptimizer cell further
# down. Opening it unconditionally makes a fresh "Restart & Run All" fail
# here, so the raw session is created only when the file already exists.
optimized_onnx_path = "optimized_model/model_optimized.onnx"
if os.path.exists(optimized_onnx_path):
    sess = onnxruntime.InferenceSession(optimized_onnx_path, providers=['CUDAExecutionProvider'])
else:
    sess = None
    print(f"{optimized_onnx_path} not found yet; run the optimization cell below, then re-run this cell.")

### Using optimum pipeline

In [43]:
# Build a text-classification pipeline on the ONNX Runtime accelerator directly
# from the checkpoint id; the captured log shows the model being exported to ONNX.
optimum_model1 = pipeline(
    task="text-classification",
    accelerator="ort",
    device=0,
    model=checkpoint,
    tokenizer='bert-base-uncased',
)

Framework not specified. Using pt to export to ONNX.
Using the export variant default. Available variants are:
	- default: The default ONNX variant.
Using framework PyTorch: 2.0.0+cu117
Overriding 1 configuration item(s)
	- use_cache -> False


verbose: False, log level: Level.ERROR



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
use_io_binding was set to False, setting it to True because it can provide a huge speedup on GPUs. It is possible to disable this feature manually by setting the use_io_binding attribute back to False.


In [45]:
# Warm-up: 10 untimed requests before the measured loop.
for _ in range(10):
  _ = optimum_model1(sample_input)

# Timed section: 100 sequential single-sample requests.
# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time and can be adjusted while the loop runs.
start = time.perf_counter()
for _ in range(100):
  _ = optimum_model1(sample_input)
total_time_model = time.perf_counter() - start
print("Общее время обработки 100 запросов базовой моделью из optimum:", total_time_model)

Общее время обработки 100 запросов базовой моделью из optimum: 0.29105353355407715


### Using ORTModelForSequenceClassification

In [35]:
# Source checkpoint and the directory the optimized model is written to later.
model_id = checkpoint
save_dir = "optimized_model"

# Load the tokenizer, then export the checkpoint to ONNX and load it on the
# CUDA execution provider using the session options defined earlier.
tokenizer = AutoTokenizer.from_pretrained(model_id)
optimum_model2 = ORTModelForSequenceClassification.from_pretrained(
    model_id,
    session_options=session_options,
    provider="CUDAExecutionProvider",
    export=True,
)

Framework not specified. Using pt to export to ONNX.
Using the export variant default. Available variants are:
	- default: The default ONNX variant.
Using framework PyTorch: 2.0.0+cu117
Overriding 1 configuration item(s)
	- use_cache -> False


verbose: False, log level: Level.ERROR



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [84]:
# Build the optimization configuration describing the graph optimizations to apply.
# Alternative left by the author (preset config, currently disabled):
# optimization_config = AutoOptimizationConfig.O3()
optimization_config = OptimizationConfig(
    # GPU-specific optimization is currently disabled by the author:
    # optimize_for_gpu=True,
    optimization_level=99
)
# Create the optimizer from the ONNX model exported in the previous step.
optimizer = ORTOptimizer.from_pretrained(optimum_model2)

# Run the optimization and write the result (model + ort_config.json) to save_dir.
optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config)
# Load the optimized model back from the local save_dir.
# NOTE(review): no provider is passed here, unlike when optimum_model2 was
# loaded with provider="CUDAExecutionProvider" — confirm the default is intended.
optimized_model1 = ORTModelForSequenceClassification.from_pretrained(save_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Optimizing model...
symbolic shape infer failed. it's safe to ignore this message if there is no issue with optimized model
symbolic shape infer failed. it's safe to ignore this message if there is no issue with optimized model
symbolic shape infer failed. it's safe to ignore this message if there is no issue with optimized model
symbolic shape infer failed. it's safe to ignore this message if there is no issue with optimized model
symbolic shape infer failed. it's safe to ignore this message if there is no issue with optimized model
symbolic shape infer failed. it's safe to ignore this message if there is no issue with optimized model
symbolic shape infer failed. it's safe to ignore this message if there is no issue with optimized model
symbolic shape i

failed in shape inference <class 'AssertionError'>
failed in shape inference <class 'AssertionError'>


Configuration saved in optimized_model/ort_config.json
Optimized model saved at: optimized_model (external data format: False; saved all tensor to one file: True)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [85]:
# Create a text-classification pipeline around the optimized ONNX model,
# using the ONNX Runtime accelerator (accelerator="ort") on device 0.
onnx_clx = pipeline("text-classification", model=optimized_model1, accelerator="ort", device=0)

use_io_binding was set to False, setting it to True because it can provide a huge speedup on GPUs. It is possible to disable this feature manually by setting the use_io_binding attribute back to False.


In [86]:
# Warm-up: 10 untimed requests before the measured loop.
for _ in range(10):
  _ = onnx_clx(sample_input)

# Timed section: 100 sequential single-sample requests.
# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time and can be adjusted while the loop runs.
start = time.perf_counter()
for _ in range(100):
  _ = onnx_clx(sample_input)
total_time_model = time.perf_counter() - start
# The label previously said "base model" although this loop times the
# optimized ONNX pipeline, which made the printed results indistinguishable.
print("Общее время обработки 100 запросов оптимизированной ONNX-моделью:", total_time_model)



Общее время обработки 100 запросов базовой моделью: 0.2588021755218506


In [65]:
# Accuracy of the optimized ONNX pipeline on the same test subset, for comparison with the baseline.
show_accuracy(onnx_clx)

  0%|          | 0/110 [00:00<?, ?it/s]

{'accuracy': 0.8363636363636363}
