# Benchmarking BERT with ONNX Optimizations

In [4]:
pip install --upgrade --upgrade-strategy eager "optimum[onnxruntime]"

Collecting onnx
  Downloading onnx-1.16.2-cp310-cp310-macosx_11_0_universal2.whl (16.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.5/16.5 MB 10.4 MB/s eta 0:00:00
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting onnxruntime>=1.11.0
  Downloading onnxruntime-1.19.2-cp310-cp310-macosx_11_0_universal2.whl (16.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting fsspec>=2023.5.0
  Using cached fsspec-2024.9.0-py3-none-any.whl (179 kB)
Collecting flatbuffers
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl (26 kB)
Installing collected packages: flatbuffers, onnx, onnxruntime, evalua

## Export to ONNX and apply ONNX Runtime optimizations

In [2]:
from optimum.onnxruntime import (
    AutoOptimizationConfig,
    ORTModelForSequenceClassification,
    ORTOptimizer,
)

# Checkpoint to benchmark, and the local directory that receives the optimized model
model_id = "nlptown/bert-base-multilingual-uncased-sentiment"
save_dir = "onnx"

# `export=True` converts the PyTorch checkpoint to the ONNX format while loading it
model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)

# Optimization strategy: the O2 preset
optimization_config = AutoOptimizationConfig.O2()

# Optimizer bound to the exported model
optimizer = ORTOptimizer.from_pretrained(model)

# Run the optimization; kept as the cell's last expression so its return value is displayed
optimizer.optimize(optimization_config=optimization_config, save_dir=save_dir)

Optimizing model...
Configuration saved in onnx/ort_config.json
Optimized model saved at: onnx (external data format: False; saved all tensor to one file: True)


PosixPath('onnx')

## Compare ONNX Optimized Model to PyTorch Model

### Load models & pipelines

In [48]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# One tokenizer is shared by both pipelines, so only the model backend differs.
# Neither pipeline is given a `device` argument.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Baseline: the PyTorch checkpoint wrapped in a text-classification pipeline
pytorch_model = AutoModelForSequenceClassification.from_pretrained(model_id)
pytorch_pipeline = pipeline(
    task="text-classification",
    model=pytorch_model,
    tokenizer=tokenizer,
)

# Candidate: the optimized ONNX model file loaded from `save_dir`
onnx_o2_model = ORTModelForSequenceClassification.from_pretrained(
    save_dir,
    file_name="model_optimized.onnx",
)
onnx_o2_pipeline = pipeline(
    task="text-classification",
    model=onnx_o2_model,
    tokenizer=tokenizer,
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


### Load a sample dataset for benchmarking

In [52]:
from datasets import load_dataset

# Load the "all" configuration of the multilingual sentiment dataset
ds = load_dataset(path="tyqiangz/multilingual-sentiments", name="all")

# Benchmark on the first 2000 rows of the training split only
sample = ds["train"].select(range(0, 2000))

In [53]:
# Display the subset's summary to confirm its columns and row count
sample

Dataset({
    features: ['text', 'source', 'language', 'label'],
    num_rows: 2000
})

In [56]:
from time import perf_counter
import numpy as np


def measure_latency(pipe, sample, warmup_runs=10):
    """Benchmark the per-example latency of a pipeline.

    The ``"text"`` field of every row in ``sample`` is passed to ``pipe`` one
    row at a time, and the wall-clock duration of each call is measured with
    ``perf_counter``. The number of timed calls is printed before returning.

    Args:
        pipe: Callable invoked as ``pipe(text)``; its return value is discarded.
        sample: Sized, integer-indexable collection whose items expose a
            ``"text"`` key.
        warmup_runs: Number of untimed calls made on the first row before
            measuring starts. Defaults to 10.

    Returns:
        A string of the form ``"Average latency (ms) - <mean> +/- <std>"``,
        where both values are in milliseconds with two decimals.

    Raises:
        ValueError: If ``sample`` is empty.
    """
    num_rows = len(sample)
    if num_rows == 0:
        raise ValueError("`sample` must contain at least one row to benchmark.")

    # warm up on the first row; these calls are not timed
    warmup_text = sample[0]["text"]
    for _ in range(warmup_runs):
        pipe(warmup_text)

    # benchmark
    latencies = []
    for i in range(num_rows):
        # Fetch the row before starting the clock so that only the pipeline
        # call itself is timed, not the lookup in `sample`.
        text = sample[i]["text"]

        start_time = perf_counter()
        pipe(text)
        latencies.append(perf_counter() - start_time)

    print("Number of samples run:", len(latencies))

    # Compute run statistics (seconds -> milliseconds)
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)

    # "+/-" replaces the former "+\-", whose backslash was an invalid escape sequence
    return f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}"

In [57]:
# (banner printed before the run, label for the result line, pipeline to benchmark)
benchmarks = [
    ("Running PyTorch model...", "Vanilla model", pytorch_pipeline),
    ("\nRunning ONNX model...", "O2 Optimized", onnx_o2_pipeline),
]

for banner, label, pipe in benchmarks:
    print(banner)
    print(f"{label}: {measure_latency(pipe, sample)}")

Running PyTorch model...
Number of samples run: 2000
Vanilla model: Average latency (ms) - 29.87 +\- 5.30

Running ONNX model...
Number of samples run: 2000
O2 Optimized: Average latency (ms) - 14.81 +\- 6.55
