In [1]:
!pip install eager optimum[onnxruntime] timm onnxruntime-gpu

Collecting eager
  Downloading eager-0.0.1.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting onnxruntime-gpu
  Downloading onnxruntime_gpu-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting optimum[onnxruntime]
  Downloading optimum-1.25.2-py3-none-any.whl.metadata (16 kB)
Collecting onnxruntime>=1.11.0 (from optimum[onnxruntime])
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime-gpu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=1.2.1->optimum[onnxruntime])
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11->optimum[onnxruntime])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.

In [2]:
!pip install gputil

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=3826ee4637902942dd9e78087febf70d03c342e3a201ff5684214cdb17e61634
  Stored in directory: /root/.cache/pip/wheels/2b/4d/8f/55fb4f7b9b591891e8d3f72977c4ec6c7763b39c19f0861595
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0


In [3]:
import os
import time

import psutil
import GPUtil
import kagglehub
import numpy as np
from tqdm import tqdm

import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

import torch
from torch import nn, optim
from torch.amp import autocast
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader
from torch.nn.utils import prune

from sklearn.metrics import precision_score, recall_score, f1_score

In [4]:
def print_memory_usage(label=""):
    """Выводит использование памяти в мегабайтах"""
    if label:
        print(f"\n--- Memory Usage ({label}) ---")
    else:
        print("\n--- Memory Usage ---")
        
    # CPU RAM в MB
    process = psutil.Process(os.getpid())
    ram_used = process.memory_info().rss / (1024 ** 2)
    print(f"CPU RAM used: {ram_used:.2f} MB")
    
    # GPU VRAM в MB
    gpus = GPUtil.getGPUs()
    for gpu in gpus:
        vram_used = gpu.memoryUsed
        vram_total = gpu.memoryTotal
        print(f"GPU {gpu.id} VRAM: {vram_used:.2f} MB / {vram_total:.2f} MB")


def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    return (param_size + buffer_size) / 1024**2


def calculate_metrics(model, device, data_loader):
    model.eval()
    all_preds = []
    all_targets = []

    model.to(device)
    with torch.no_grad():
        for images, targets in data_loader:
            images = images.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1).cpu().numpy()
            
            all_preds.extend(preds)
            all_targets.extend(targets.numpy())
    
    precision = precision_score(all_targets, all_preds, average='macro')
    recall = recall_score(all_targets, all_preds, average='macro')
    f1 = f1_score(all_targets, all_preds, average='macro')
    
    return precision, recall, f1


def load_imagenet_mini(dataset_path, model, split):
    config = resolve_data_config({}, model=model)
    transform = create_transform(**config)

    print(transform)

    if split == 'train':
        transform = transforms.Compose([
            transforms.ColorJitter(
                brightness=0.2,
                contrast=0.2,
                saturation=0.2,
                hue=0.2,
            ),
            transforms.GaussianBlur(
                kernel_size=3,
            ),
            transform,
        ])

    dataset = datasets.ImageFolder(
        root=os.path.join(dataset_path, split),
        transform=transform
    )
    
    return dataset


def benchmark_model(model, device, input_tensor, num_runs=10, warmup=3, use_amp=False):
    model = model.to(device)
    input_tensor = input_tensor.to(device)
    
    # Warmup
    print(f"\n🔥 Warming up ({warmup} runs) on {device}...")
    for _ in range(warmup):
        with torch.no_grad():
            if use_amp and device.type == 'cuda':
                with autocast(device_type='cuda', dtype=torch.float16):
                    _ = model(input_tensor)
            else:
                _ = model(input_tensor)
    
    # Benchmark
    print(f"🚀 Benchmarking ({num_runs} runs) on {device}...")
    start_time = time.time()
    
    for _ in range(num_runs):
        with torch.no_grad():
            if use_amp and device.type == 'cuda':
                with autocast(device_type='cuda', dtype=torch.float16):
                    _ = model(input_tensor)
            else:
                _ = model(input_tensor)
    
    total_time = (time.time() - start_time) * 1000
    avg_time = total_time / num_runs
    print(f"✅ Average inference: {avg_time:.2f} ms")
    print(f"📊 Total time: {total_time:.2f} ms | FPS: {1000/(avg_time + 1e-9):.1f}")
    
    return avg_time


def main(model, dataset_path=None):
    device_cpu = torch.device('cpu')
    device_gpu = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    print("\n🔍 Initial memory state:")
    print_memory_usage("Before loading model")
    model.eval()
    print(f"📏 Model size: {get_model_size(model):.2f} MB")

    print("\n🔍 Initial memory state:")
    print_memory_usage("Model loaded")
    
    # Бенчмарки
    input_tensor = torch.randn(1, 3, 224, 224)
    
    print("\n🧪 Benchmarking on CPU:")
    cpu_time = benchmark_model(model, device_cpu, input_tensor)
    print_memory_usage("After CPU benchmark")
    
    if torch.cuda.is_available():
        print("\n🎮 Benchmarking on GPU:")
        gpu_time = benchmark_model(model, device_gpu, input_tensor)
        print_memory_usage("After GPU test")
        
        print("\n⚡ Benchmarking with AMP:")
        gpu_amp_time = benchmark_model(model, device_gpu, input_tensor, use_amp=True)
        print_memory_usage("After AMP test")
        
        print("\n📈 Results Summary:")
        print(f"| Device | Inference Time (ms) | Speedup vs CPU |")
        print("|--------|---------------------|----------------|")
        print(f"| CPU    | {cpu_time:19.2f} | {'—':^15} |")
        print(f"| GPU    | {gpu_time:19.2f} | {cpu_time/gpu_time:^15.1f}x |")
        print(f"| AMP    | {gpu_amp_time:19.2f} | {cpu_time/gpu_amp_time:^15.1f}x |")
    else:
        print("\n❌ CUDA not available")
        print(f"⏱️ CPU inference time: {cpu_time:.2f} ms")
        if dataset_path:
            print("\n🎯 Quality Metrics (CPU):")
            print(f"Precision: {precision_cpu:.4f}")
            print(f"Recall:    {recall_cpu:.a4f}")
            print(f"F1-Score:  {f1_cpu:.4f}")

    # Загрузка и расчет метрик качества
    if dataset_path:
        print("\n📊 Loading ImageNetMini dataset...")
        val_dataset = load_imagenet_mini(dataset_path, model, 'val')
        data_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, pin_memory=True, drop_last=False)
        
        # print("\n🧮 Calculating metrics on CPU:")
        # precision_cpu, recall_cpu, f1_cpu = calculate_metrics(model, device_cpu, data_loader)
        
        if torch.cuda.is_available():
            print("\n🧮 Calculating metrics on GPU:")
            precision_gpu, recall_gpu, f1_gpu = calculate_metrics(model, device_gpu, data_loader)

            print("\n🎯 Quality Metrics Summary:")
            print("| Device | Precision | Recall  | F1-Score |")
            print("|--------|-----------|---------|----------|")
            # print(f"| CPU    | {precision_cpu:.4f}  | {recall_cpu:.4f} | {f1_cpu:.4f}  |")
            print(f"| GPU    | {precision_gpu:.4f}  | {recall_gpu:.4f} | {f1_gpu:.4f}  |")

In [5]:
path = kagglehub.dataset_download("ifigotin/imagenetmini-1000")
path += "/imagenet-mini"

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imagenetmini-1000/imagenet-mini


In [6]:
base_model = timm.create_model('efficientvit_b3.r256_in1k', pretrained=True)

model.safetensors:   0%|          | 0.00/195M [00:00<?, ?B/s]

In [7]:
main(base_model, path)


🔍 Initial memory state:

--- Memory Usage (Before loading model) ---
CPU RAM used: 908.25 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB
📏 Model size: 185.75 MB

🔍 Initial memory state:

--- Memory Usage (Model loaded) ---
CPU RAM used: 908.25 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB

🧪 Benchmarking on CPU:

🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 144.25 ms
📊 Total time: 1442.46 ms | FPS: 6.9

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 955.62 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB

🎮 Benchmarking on GPU:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 23.16 ms
📊 Total time: 231.59 ms | FPS: 43.2

--- Memory Usage (After GPU test) ---
CPU RAM used: 1076.43 MB
GPU 0 VRAM: 539.00 MB / 16384.00 MB

⚡ Benchmarking with AMP:

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 28.60 ms
📊 Total time: 286.03 ms | FPS: 35.0

--- Memory Usage (After AMP test) ---
CPU RA

  _warn_prf(average, modifier, msg_start, len(result))


## Optimum optimization

In [8]:
def print_memory_usage(label=""):
    """Выводит использование памяти в мегабайтах"""
    if label:
        print(f"\n--- Memory Usage ({label}) ---")
    else:
        print("\n--- Memory Usage ---")
        
    # CPU RAM в MB
    process = psutil.Process(os.getpid())
    ram_used = process.memory_info().rss / (1024 ** 2)
    print(f"CPU RAM used: {ram_used:.2f} MB")
    
    # GPU VRAM в MB
    gpus = GPUtil.getGPUs()
    for gpu in gpus:
        vram_used = gpu.memoryUsed
        vram_total = gpu.memoryTotal
        print(f"GPU {gpu.id} VRAM: {vram_used:.2f} MB / {vram_total:.2f} MB")


def calculate_metrics(model, device, data_loader):
    all_preds = []
    all_targets = []

    model.to(device)
    with torch.no_grad():
        for images, targets in data_loader:
            images = images.to(device)
            outputs = model(images).logits
            preds = outputs.argmax(dim=1).cpu().numpy()
            
            all_preds.extend(preds)
            all_targets.extend(targets.numpy())
    
    precision = precision_score(all_targets, all_preds, average='macro')
    recall = recall_score(all_targets, all_preds, average='macro')
    f1 = f1_score(all_targets, all_preds, average='macro')
    
    return precision, recall, f1


def load_imagenet_mini(dataset_path, model, split):
    config = resolve_data_config({}, model=base_model)
    transform = create_transform(**config)

    print(transform)

    dataset = datasets.ImageFolder(
        root=os.path.join(dataset_path, split),
        transform=transform
    )
    
    return dataset


def benchmark_model(model, device, input_tensor, num_runs=10, warmup=3, use_amp=False):
    model = model.to(device)
    input_tensor = input_tensor.to(device)
    
    # Warmup
    print(f"\n🔥 Warming up ({warmup} runs) on {device}...")
    for _ in range(warmup):
        with torch.no_grad():
            if use_amp and device.type == 'cuda':
                with autocast(device_type='cuda', dtype=torch.float16):
                    _ = model(input_tensor)
            else:
                _ = model(input_tensor)
    
    # Benchmark
    print(f"🚀 Benchmarking ({num_runs} runs) on {device}...")
    start_time = time.time()
    
    for _ in range(num_runs):
        with torch.no_grad():
            if use_amp and device.type == 'cuda':
                with autocast(device_type='cuda', dtype=torch.float16):
                    _ = model(input_tensor)
            else:
                _ = model(input_tensor)
    
    total_time = (time.time() - start_time) * 1000
    avg_time = total_time / num_runs
    print(f"✅ Average inference: {avg_time:.2f} ms")
    print(f"📊 Total time: {total_time:.2f} ms | FPS: {1000/(avg_time + 1e-9):.1f}")
    
    return avg_time


def main(model, dataset_path=None):
    device_cpu = torch.device('cpu')
    device_gpu = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    print("\n🔍 Initial memory state:")
    print_memory_usage("Before loading model")

    print("\n🔍 Initial memory state:")
    print_memory_usage("Model loaded")
    
    # Бенчмарки
    input_tensor = torch.randn(1, 3, 256, 256)
    
    print("\n🧪 Benchmarking on CPU:")
    cpu_time = benchmark_model(model, device_cpu, input_tensor)
    print_memory_usage("After CPU benchmark")
    
    if torch.cuda.is_available():
        print("\n🎮 Benchmarking on GPU:")
        gpu_time = benchmark_model(model, device_gpu, input_tensor)
        print_memory_usage("After GPU test")
        
        print("\n⚡ Benchmarking with AMP:")
        gpu_amp_time = benchmark_model(model, device_gpu, input_tensor, use_amp=True)
        print_memory_usage("After AMP test")
        
        print("\n📈 Results Summary:")
        print(f"| Device | Inference Time (ms) | Speedup vs CPU |")
        print("|--------|---------------------|----------------|")
        print(f"| CPU    | {cpu_time:19.2f} | {'—':^15} |")
        print(f"| GPU    | {gpu_time:19.2f} | {cpu_time/gpu_time:^15.1f}x |")
        print(f"| AMP    | {gpu_amp_time:19.2f} | {cpu_time/gpu_amp_time:^15.1f}x |")
    else:
        print("\n❌ CUDA not available")
        print(f"⏱️ CPU inference time: {cpu_time:.2f} ms")
        if dataset_path:
            print("\n🎯 Quality Metrics (CPU):")
            print(f"Precision: {precision_cpu:.4f}")
            print(f"Recall:    {recall_cpu:.a4f}")
            print(f"F1-Score:  {f1_cpu:.4f}")

    # Загрузка и расчет метрик качества
    if dataset_path:
        print("\n📊 Loading ImageNetMini dataset...")
        val_dataset = load_imagenet_mini(dataset_path, model, 'val')
        data_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, pin_memory=True, drop_last=False)
        
        # print("\n🧮 Calculating metrics on CPU:")
        # precision_cpu, recall_cpu, f1_cpu = calculate_metrics(model, device_cpu, data_loader)
        
        if torch.cuda.is_available():
            print("\n🧮 Calculating metrics on GPU:")
            precision_gpu, recall_gpu, f1_gpu = calculate_metrics(model, device_gpu, data_loader)

            print("\n🎯 Quality Metrics Summary:")
            print("| Device | Precision | Recall  | F1-Score |")
            print("|--------|-----------|---------|----------|")
            # print(f"| CPU    | {precision_cpu:.4f}  | {recall_cpu:.4f} | {f1_cpu:.4f}  |")
            print(f"| GPU    | {precision_gpu:.4f}  | {recall_gpu:.4f} | {f1_gpu:.4f}  |")

In [9]:
from optimum.onnxruntime import ORTModelForImageClassification
from transformers import AutoFeatureExtractor
from pathlib import Path

2025-05-16 06:22:40.000308: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747376560.218091      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747376560.288553      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
model_id="timm/efficientvit_b3.r256_in1k"
onnx_path = Path("onnx")

model = ORTModelForImageClassification.from_pretrained(model_id, export=True)
model.save_pretrained(onnx_path)

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

You are using a model of type timm_wrapper to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(
The `save_pretrained` method is disabled for TimmWrapperImageProcessor. The image processor configuration is saved directly in `config.json` when `save_pretrained` is called for saving the model.


In [11]:
!rm onnx/model.onnx

In [12]:
import torch

base_model.eval()

with torch.no_grad():
    torch.onnx.export(
        base_model,
        torch.rand(1, 3, 256, 256).to('cuda'),
        'onnx/model.onnx',
        opset_version=15,
        input_names=['pixel_values'],
        output_names=['logits'],
        dynamic_axes={'pixel_values': {0: 'batch_size'}, 'logits': {0: 'batch_size'}},
    )

In [13]:
model_id="timm/efficientvit_b3.r256_in1k"
onnx_path = Path("onnx")

model = ORTModelForImageClassification.from_pretrained(onnx_path, file_name="model.onnx")

You are using a model of type timm_wrapper to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


In [14]:
main(model, path)


🔍 Initial memory state:

--- Memory Usage (Before loading model) ---
CPU RAM used: 2967.36 MB
GPU 0 VRAM: 2199.00 MB / 16384.00 MB

🔍 Initial memory state:

--- Memory Usage (Model loaded) ---
CPU RAM used: 2967.36 MB
GPU 0 VRAM: 2199.00 MB / 16384.00 MB

🧪 Benchmarking on CPU:

🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 105.49 ms
📊 Total time: 1054.92 ms | FPS: 9.5

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 2822.61 MB
GPU 0 VRAM: 2199.00 MB / 16384.00 MB

🎮 Benchmarking on GPU:


[0;93m2025-05-16 06:23:21.358165503 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-16 06:23:21.358220597 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m



🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 20.43 ms
📊 Total time: 204.34 ms | FPS: 48.9

--- Memory Usage (After GPU test) ---
CPU RAM used: 2695.33 MB
GPU 0 VRAM: 2475.00 MB / 16384.00 MB

⚡ Benchmarking with AMP:


[0;93m2025-05-16 06:23:22.164885269 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-16 06:23:22.164914679 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m



🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 20.02 ms
📊 Total time: 200.24 ms | FPS: 49.9

--- Memory Usage (After AMP test) ---
CPU RAM used: 2695.45 MB
GPU 0 VRAM: 2475.00 MB / 16384.00 MB

📈 Results Summary:
| Device | Inference Time (ms) | Speedup vs CPU |
|--------|---------------------|----------------|
| CPU    |              105.49 |        —        |
| GPU    |               20.43 |       5.2      x |
| AMP    |               20.02 |       5.3      x |

📊 Loading ImageNetMini dataset...
Compose(
    Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(256, 256))
    MaybeToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)

🧮 Calculating metrics on GPU:


[0;93m2025-05-16 06:23:23.839902991 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-16 06:23:23.839934448 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m



🎯 Quality Metrics Summary:
| Device | Precision | Recall  | F1-Score |
|--------|-----------|---------|----------|
| GPU    | 0.8474  | 0.8342 | 0.8239  |


  _warn_prf(average, modifier, msg_start, len(result))


## Quantization with Optimum

In [15]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from onnxruntime.quantization import QuantFormat, QuantizationMode

# create ORTQuantizer and define quantization configuration
quantizer = ORTQuantizer.from_pretrained(model)
qconfig = AutoQuantizationConfig.avx512_vnni(
    is_static=True,
    per_channel=True,
)

In [16]:
train_dataset = load_imagenet_mini(path, base_model, 'train')

Compose(
    Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(256, 256))
    MaybeToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)


In [17]:
from torch.utils.data import random_split
from datasets import Dataset

calibration_size = 1000
calibration_dataset, _ = random_split(train_dataset, [calibration_size, len(train_dataset) - calibration_size])
images_list = [image for image, _ in calibration_dataset]
optimum_dataset = Dataset.from_dict({
    "pixel_values": images_list,
})

In [18]:
from optimum.onnxruntime.configuration import AutoCalibrationConfig

calibration_config = AutoCalibrationConfig.percentiles(optimum_dataset, percentile=99.99)

# Perform the calibration step: computes the activations quantization ranges
batch_size = 10
shards = len(optimum_dataset) // batch_size

for i in range(shards):
    print(f'Processing {i + 1} shard')
    shard = optimum_dataset.shard(shards, i)
    quantizer.partial_fit(
        dataset=shard,
        calibration_config=calibration_config,
        operators_to_quantize=qconfig.operators_to_quantize,
        batch_size=batch_size,
        use_external_data_format=False,
    )
ranges = quantizer.compute_ranges()

# remove temp augmented model again
os.remove("augmented_model.onnx")

Processing 1 shard
Collecting tensor data and making histogram ...
Processing 2 shard
Collecting tensor data and making histogram ...
Processing 3 shard
Collecting tensor data and making histogram ...
Processing 4 shard
Collecting tensor data and making histogram ...
Processing 5 shard
Collecting tensor data and making histogram ...
Processing 6 shard
Collecting tensor data and making histogram ...
Processing 7 shard
Collecting tensor data and making histogram ...
Processing 8 shard
Collecting tensor data and making histogram ...
Processing 9 shard
Collecting tensor data and making histogram ...
Processing 10 shard
Collecting tensor data and making histogram ...
Processing 11 shard
Collecting tensor data and making histogram ...
Processing 12 shard
Collecting tensor data and making histogram ...
Processing 13 shard
Collecting tensor data and making histogram ...
Processing 14 shard
Collecting tensor data and making histogram ...
Processing 15 shard
Collecting tensor data and making his

In [19]:
from optimum.onnxruntime.preprocessors import QuantizationPreprocessor
from optimum.onnxruntime.preprocessors.passes import (
    ExcludeGeLUNodes,
    ExcludeLayerNormNodes,
    ExcludeNodeAfter,
    ExcludeNodeFollowedBy,
)


def create_quantization_preprocessor():
    # Create a quantization preprocessor to determine the nodes to exclude
    quantization_preprocessor = QuantizationPreprocessor()

    # Exclude the nodes constituting LayerNorm
    quantization_preprocessor.register_pass(ExcludeLayerNormNodes())
    # Exclude the nodes constituting GELU
    quantization_preprocessor.register_pass(ExcludeGeLUNodes())
    # Exclude the residual connection Add nodes
    quantization_preprocessor.register_pass(ExcludeNodeAfter("Add", "Add"))
    # Exclude the Add nodes following the Gather operator
    quantization_preprocessor.register_pass(ExcludeNodeAfter("Gather", "Add"))
    # Exclude the Add nodes followed by the Softmax operator
    quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax"))

    return quantization_preprocessor


# create processor
quantization_preprocessor = create_quantization_preprocessor()

# Quantize the same way we did for dynamic quantization!
quantizer.quantize(
    save_dir=onnx_path,
    calibration_tensors_range=ranges,
    quantization_config=qconfig,
    preprocessor=quantization_preprocessor,
)

PosixPath('onnx')

In [20]:
import os

# get model file size
size = os.path.getsize(onnx_path / "model.onnx")/(1024*1024)
quantized_model = os.path.getsize(onnx_path / "model_quantized.onnx")/(1024*1024)

print(f"Model file size: {size:.2f} MB")
print(f"Quantized Model file size: {quantized_model:.2f} MB")

Model file size: 185.87 MB
Quantized Model file size: 48.90 MB


In [21]:
model = ORTModelForImageClassification.from_pretrained(onnx_path, file_name="model_quantized.onnx")

You are using a model of type timm_wrapper to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
Too many ONNX model files were found in onnx/model.onnx ,onnx/model_quantized.onnx. specify which one to load by using the `file_name` and/or the `subfolder` arguments. Loading the file model_quantized.onnx in the subfolder onnx.


In [22]:
main(model, path)


🔍 Initial memory state:

--- Memory Usage (Before loading model) ---
CPU RAM used: 8642.28 MB
GPU 0 VRAM: 2205.00 MB / 16384.00 MB

🔍 Initial memory state:

--- Memory Usage (Model loaded) ---
CPU RAM used: 8642.28 MB
GPU 0 VRAM: 2205.00 MB / 16384.00 MB

🧪 Benchmarking on CPU:

🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 88.46 ms
📊 Total time: 884.65 ms | FPS: 11.3

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 8668.28 MB
GPU 0 VRAM: 2205.00 MB / 16384.00 MB

🎮 Benchmarking on GPU:


[0;93m2025-05-16 07:08:09.935517113 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 116 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-16 07:08:09.960226282 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-16 07:08:09.960249430 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m



🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 29.34 ms
📊 Total time: 293.44 ms | FPS: 34.1

--- Memory Usage (After GPU test) ---
CPU RAM used: 8643.50 MB
GPU 0 VRAM: 2347.00 MB / 16384.00 MB

⚡ Benchmarking with AMP:


[0;93m2025-05-16 07:08:11.224793031 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 116 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-16 07:08:11.248286262 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-16 07:08:11.248307229 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m



🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 28.77 ms
📊 Total time: 287.74 ms | FPS: 34.8

--- Memory Usage (After AMP test) ---
CPU RAM used: 8643.50 MB
GPU 0 VRAM: 2347.00 MB / 16384.00 MB

📈 Results Summary:
| Device | Inference Time (ms) | Speedup vs CPU |
|--------|---------------------|----------------|
| CPU    |               88.46 |        —        |
| GPU    |               29.34 |       3.0      x |
| AMP    |               28.77 |       3.1      x |

📊 Loading ImageNetMini dataset...
Compose(
    Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(256, 256))
    MaybeToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)

🧮 Calculating metrics on GPU:


[0;93m2025-05-16 07:08:13.849031148 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 116 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-16 07:08:13.872271522 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-16 07:08:13.872292008 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m



🎯 Quality Metrics Summary:
| Device | Precision | Recall  | F1-Score |
|--------|-----------|---------|----------|
| GPU    | 0.6113  | 0.4248 | 0.4623  |


  _warn_prf(average, modifier, msg_start, len(result))
