In [1]:
!pip install gputil

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=9f5a3ca7b72a57d0f3db23f00606b24aa628e53a28150c9967f4817ac7d61aa7
  Stored in directory: /root/.cache/pip/wheels/2b/4d/8f/55fb4f7b9b591891e8d3f72977c4ec6c7763b39c19f0861595
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0


In [2]:
!pip uninstall onnxruntime -y

[0m

In [3]:
!pip uninstall onnxruntime-gpu -y

Found existing installation: onnxruntime-gpu 1.21.1
Uninstalling onnxruntime-gpu-1.21.1:
  Successfully uninstalled onnxruntime-gpu-1.21.1


In [None]:
!pip install optimum[onnxruntime-gpu]

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ifigotin/imagenetmini-1000")
path += "/imagenet-mini"

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imagenetmini-1000/imagenet-mini


In [4]:
import torch
import onnxruntime as ort
import onnx
import time
import timm
import psutil
import os
import GPUtil
import numpy as np
from torch.amp import autocast
from torchvision import datasets, transforms
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from sklearn.metrics import precision_score, recall_score, f1_score

def print_memory_usage(label=""):
    """Выводит использование памяти в мегабайтах"""
    if label:
        print(f"\n--- Memory Usage ({label}) ---")
    else:
        print("\n--- Memory Usage ---")
        
    # CPU RAM в MB
    process = psutil.Process(os.getpid())
    ram_used = process.memory_info().rss / (1024 ** 2)
    print(f"CPU RAM used: {ram_used:.2f} MB")
    
    # GPU VRAM в MB
    gpus = GPUtil.getGPUs()
    for gpu in gpus:
        vram_used = gpu.memoryUsed
        vram_total = gpu.memoryTotal
        print(f"GPU {gpu.id} VRAM: {vram_used:.2f} MB / {vram_total:.2f} MB")

def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    return (param_size + buffer_size) / 1024**2

def calculate_metrics(model, device, data_loader):
    model.eval()
    all_preds = []
    all_targets = []

    model.to(device)
    with torch.no_grad():
        for images, targets in data_loader:
            images = images.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1).cpu().numpy()
            
            all_preds.extend(preds)
            all_targets.extend(targets.numpy())
    
    precision = precision_score(all_targets, all_preds, average='macro')
    recall = recall_score(all_targets, all_preds, average='macro')
    f1 = f1_score(all_targets, all_preds, average='macro')
    
    return precision, recall, f1

def load_imagenet_mini(dataset_path, model):
    # Создаем трансформы на основе модели
    config = resolve_data_config({}, model=model)
    transform = create_transform(**config)
    
    # Загружаем датасет
    dataset = datasets.ImageFolder(
        root=os.path.join(dataset_path, 'val'),
        transform=transform
    )
    
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=64,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )
    
    return data_loader

def benchmark_model(model, device, input_tensor, num_runs=10, warmup=3, use_amp=False):
    model = model.to(device)
    input_tensor = input_tensor.to(device)
    
    # Warmup
    print(f"\n🔥 Warming up ({warmup} runs) on {device}...")
    for _ in range(warmup):
        with torch.no_grad():
            if use_amp and device.type == 'cuda':
                with autocast(device_type='cuda', dtype=torch.float16):
                    _ = model(input_tensor)
            else:
                _ = model(input_tensor)
    
    # Benchmark
    print(f"🚀 Benchmarking ({num_runs} runs) on {device}...")
    start_time = time.time()
    
    for _ in range(num_runs):
        with torch.no_grad():
            if use_amp and device.type == 'cuda':
                with autocast(device_type='cuda', dtype=torch.float16):
                    _ = model(input_tensor)
            else:
                _ = model(input_tensor)
    
    total_time = (time.time() - start_time) * 1000
    avg_time = total_time / num_runs
    print(f"✅ Average inference: {avg_time:.2f} ms")
    print(f"📊 Total time: {total_time:.2f} ms | FPS: {1000/(avg_time + 1e-9):.1f}")
    
    return avg_time

In [5]:
model = timm.create_model('efficientvit_b3.r256_in1k', pretrained=True).eval()

model.safetensors:   0%|          | 0.00/195M [00:00<?, ?B/s]

# Baseline

In [7]:
print(f"📏 Model size: {get_model_size(model):.2f} MB")

📏 Model size: 185.75 MB


In [8]:
print_memory_usage("Model loaded")


--- Memory Usage (Model loaded) ---
CPU RAM used: 932.78 MB
GPU 0 VRAM: 0.00 MB / 16384.00 MB


In [9]:
device_cpu = torch.device('cpu')
device_gpu = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
input_tensor = torch.randn(1, 3, 224, 224)

In [10]:
print("\nBenchmarking on CPU:")
cpu_time = benchmark_model(model, device_cpu, input_tensor)
print_memory_usage("After CPU benchmark")


Benchmarking on CPU:

🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 122.18 ms
📊 Total time: 1221.78 ms | FPS: 8.2

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 975.90 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB


In [11]:
gpu_time = benchmark_model(model, device_gpu, input_tensor)
print_memory_usage("After GPU test")


🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 22.45 ms
📊 Total time: 224.53 ms | FPS: 44.5

--- Memory Usage (After GPU test) ---
CPU RAM used: 1098.04 MB
GPU 0 VRAM: 539.00 MB / 16384.00 MB


In [12]:
gpu_amp_time = benchmark_model(model, device_gpu, input_tensor, use_amp=True)
print_memory_usage("After AMP test")


🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 27.16 ms
📊 Total time: 271.59 ms | FPS: 36.8

--- Memory Usage (After AMP test) ---
CPU RAM used: 1116.79 MB
GPU 0 VRAM: 615.00 MB / 16384.00 MB


In [6]:
data_loader = load_imagenet_mini(path, model)

In [14]:
precision_gpu, recall_gpu, f1_gpu = calculate_metrics(model, device_gpu, data_loader)
print(f"| {precision_gpu:.4f}  | {recall_gpu:.4f} | {f1_gpu:.4f}  |")

| 0.8474  | 0.8342 | 0.8239  |


  _warn_prf(average, modifier, msg_start, len(result))


# Torchscript

In [10]:
model = model.to(device_cpu)

In [13]:
traiced_model = torch.jit.trace(model, input_tensor)

In [14]:
torch.jit.save(traiced_model, "traiced_model.pt")

In [15]:
print_memory_usage("Before model loaded")
model_ts = torch.jit.load("traiced_model.pt")
print_memory_usage("After model loaded")


--- Memory Usage (Before model loaded) ---
CPU RAM used: 1291.32 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB

--- Memory Usage (After model loaded) ---
CPU RAM used: 1366.32 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB


In [16]:
print(f"📏 Model size: {get_model_size(traiced_model):.2f} MB")
cpu_time = benchmark_model(traiced_model, device_cpu, input_tensor)
print_memory_usage("After CPU benchmark")
gpu_time = benchmark_model(traiced_model, device_gpu, input_tensor)
print_memory_usage("After GPU test")
gpu_amp_time = benchmark_model(traiced_model, device_gpu, input_tensor, use_amp=True)
print_memory_usage("After AMP test")
precision_gpu, recall_gpu, f1_gpu = calculate_metrics(traiced_model, device_gpu, data_loader)
print(f"| {precision_gpu:.4f}  | {recall_gpu:.4f} | {f1_gpu:.4f}  |")

📏 Model size: 185.75 MB

🔥 Warming up (3 runs) on cpu...
🚀 Benchmarking (10 runs) on cpu...
✅ Average inference: 100.95 ms
📊 Total time: 1009.47 ms | FPS: 9.9

--- Memory Usage (After CPU benchmark) ---
CPU RAM used: 1390.82 MB
GPU 0 VRAM: 3.00 MB / 16384.00 MB

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 15.92 ms
📊 Total time: 159.17 ms | FPS: 62.8

--- Memory Usage (After GPU test) ---
CPU RAM used: 1481.03 MB
GPU 0 VRAM: 537.00 MB / 16384.00 MB

🔥 Warming up (3 runs) on cuda...
🚀 Benchmarking (10 runs) on cuda...
✅ Average inference: 19.36 ms
📊 Total time: 193.58 ms | FPS: 51.7

--- Memory Usage (After AMP test) ---
CPU RAM used: 1601.78 MB
GPU 0 VRAM: 537.00 MB / 16384.00 MB
| 0.8474  | 0.8342 | 0.8239  |


  _warn_prf(average, modifier, msg_start, len(result))


# ONNX

In [18]:
opset_version = max(int(k[14:]) for k in vars(torch.onnx) if "symbolic_opset" in k) - 1

In [19]:
torch.onnx.export(
    model.to(device_cpu), input_tensor, "model.onnx",
    export_params=True,
    opset_version=opset_version,
    do_constant_folding=True,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input":  {0: "batch_size", 2: "height", 3: "width"},
        "output": {0: "batch_size"},
    },
)

  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


In [20]:
model_onnx = onnx.load("model.onnx")

In [21]:
def get_onnx_model_size(path: str) -> float:
    size_bytes = os.path.getsize(path)
    return size_bytes / (1024 ** 2)
def load_onnx_session(path: str, use_gpu: bool = True) -> ort.InferenceSession:
    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] if use_gpu else ["CPUExecutionProvider"]
    return ort.InferenceSession(path, providers=providers)

In [22]:
def benchmark_onnx(session, input_array: np.ndarray, num_runs=10, warmup=3):
    input_name = session.get_inputs()[0].name
    # warmup
    for _ in range(warmup):
        session.run(None, {input_name: input_array})

    start = time.time()
    for _ in range(num_runs):
        session.run(None, {input_name: input_array})
    return (time.time() - start) * 1000 / num_runs  # ms

In [23]:
def calculate_metrics_onnx(session, data_loader):
    all_preds, all_targets = [], []
    in_name = session.get_inputs()[0].name
    for images, targets in data_loader:
        arr = images.cpu().numpy()
        out = session.run(None, {in_name: arr})[0]
        preds = out.argmax(axis=1)
        all_preds.extend(preds)
        all_targets.extend(targets.numpy())
    return (
        precision_score(all_targets, all_preds, average='macro'),
        recall_score(all_targets, all_preds, average='macro'),
        f1_score(all_targets, all_preds, average='macro'),
    )

In [25]:
onnx_path = "model.onnx"
print(f"📏 ONNX model size: {get_onnx_model_size(onnx_path):.2f} MB")
print_memory_usage("Before ONNX load")

sess_cpu = load_onnx_session(onnx_path, use_gpu=False)
print_memory_usage("ONNX session (CPU) loaded")
cpu_time = benchmark_onnx(sess_cpu, input_tensor.cpu().numpy())
print(f"✅ CPU inference avg: {cpu_time:.2f} ms")
print_memory_usage("After ONNX CPU benchmark")

📏 ONNX model size: 185.87 MB

--- Memory Usage (Before ONNX load) ---
CPU RAM used: 2590.98 MB
GPU 0 VRAM: 2201.00 MB / 16384.00 MB

--- Memory Usage (ONNX session (CPU) loaded) ---
CPU RAM used: 2776.54 MB
GPU 0 VRAM: 2201.00 MB / 16384.00 MB
✅ CPU inference avg: 74.61 ms

--- Memory Usage (After ONNX CPU benchmark) ---
CPU RAM used: 2776.54 MB
GPU 0 VRAM: 2201.00 MB / 16384.00 MB


In [26]:
sess_gpu = load_onnx_session(onnx_path, use_gpu=True)
print_memory_usage("ONNX session (GPU) loaded")
gpu_time = benchmark_onnx(sess_gpu, input_tensor.cpu().numpy())
print(f"✅ GPU inference avg: {gpu_time:.2f} ms")
print_memory_usage("After ONNX GPU benchmark")


--- Memory Usage (ONNX session (GPU) loaded) ---
CPU RAM used: 2791.92 MB
GPU 0 VRAM: 2469.00 MB / 16384.00 MB


[0;93m2025-04-24 11:25:29.863311055 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 30 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m


✅ GPU inference avg: 19.50 ms

--- Memory Usage (After ONNX GPU benchmark) ---
CPU RAM used: 2807.17 MB
GPU 0 VRAM: 2479.00 MB / 16384.00 MB


In [27]:
precision_onnx, recall_onnx, f1_onnx = calculate_metrics_onnx(sess_gpu, data_loader)
print(f"| {precision_onnx:.4f} | {recall_onnx:.4f} | {f1_onnx:.4f} |")

| 0.8474 | 0.8342 | 0.8239 |


  _warn_prf(average, modifier, msg_start, len(result))


# OpenVINO

In [None]:
!pip install openvino-dev[onnx]

In [81]:
from openvino.tools.mo import convert_model

In [82]:
from openvino.runtime import serialize
from openvino.runtime import Core

In [83]:
converted = convert_model(
    "model.onnx",
    output_dir="openvino_model",
    input_shape=[1, 3, 224, 224],
    compress_to_fp16=False
)

[ INFO ] MO command line tool is considered as the legacy conversion API as of OpenVINO 2023.2 release.
In 2025.0 MO command line tool and openvino.tools.mo.convert_model() will be removed. Please use OpenVINO Model Converter (OVC) or openvino.convert_model(). OVC represents a lightweight alternative of MO and provides simplified model conversion API. 
Find more information about transition from MO to OVC at https://docs.openvino.ai/2023.2/openvino_docs_OV_Converter_UG_prepare_model_convert_model_MO_OVC_transition.html


In [84]:
serialize(
    converted,
    "openvino_model/model.xml",
    "openvino_model/model.bin"
)

In [85]:
!ls ./openvino_model

model.bin  model.xml  openvino_model


In [86]:
def print_memory_usage(label=""):
    if label:
        print(f"\n--- Memory Usage ({label}) ---")
    else:
        print("\n--- Memory Usage ---")
    process = psutil.Process(os.getpid())
    ram_used = process.memory_info().rss / (1024 ** 2)
    print(f"CPU RAM used: {ram_used:.2f} MB")

def get_openvino_model_size(ir_dir: str) -> float:
    xml = os.path.join(ir_dir, "model.xml")
    binf = os.path.join(ir_dir, "model.bin")
    total = os.path.getsize(xml) + os.path.getsize(binf)
    return total / (1024 ** 2)

def load_openvino_model(ir_dir: str, device: str = "CPU"):
    """
    Создаёт и компилирует OpenVINO-модель.
    """
    ie = Core()
    model = ie.read_model(model=os.path.join(ir_dir, "model.xml"))
    compiled = ie.compile_model(model=model, device_name=device)
    input_name = compiled.input(0).get_any_name()
    output_name = compiled.output(0).get_any_name()
    return compiled, input_name, output_name

def benchmark_openvino(compiled, input_name: str, input_np: np.ndarray,
                      num_runs: int = 10, warmup: int = 3) -> float:
    """Среднее время инференса в мс."""
    # warmup
    for _ in range(warmup):
        compiled([input_np])

    start = time.time()
    for _ in range(num_runs):
        compiled([input_np])
    avg_ms = (time.time() - start) * 1000 / num_runs
    return avg_ms

def calculate_metrics_openvino(compiled, input_name: str, output_name: str,
                               data_loader) -> tuple:
    """Precision, recall, f1 на валидационной выборке."""
    all_preds, all_targets = [], []
    for images, targets in data_loader:
        # Приводим батч к numpy и делаем инференс
        inp = images.cpu().numpy()
        res = compiled([inp])[output_name]
        preds = np.argmax(res, axis=1)
        all_preds.extend(preds)
        all_targets.extend(targets.numpy())
    return (
        precision_score(all_targets, all_preds, average='macro'),
        recall_score(all_targets, all_preds, average='macro'),
        f1_score(all_targets, all_preds, average='macro'),
    )

In [87]:
ir_dir = "./openvino_model"
input_np = input_tensor.cpu().numpy()

In [88]:
size_mb = get_openvino_model_size(ir_dir)
print(f"📏 OpenVINO IR size: {size_mb:.2f} MB")

📏 OpenVINO IR size: 186.31 MB


In [89]:
print_memory_usage("before load")
compiled_cpu, in_name, out_name = load_openvino_model(ir_dir, device="CPU")
print_memory_usage("after load (CPU)")


--- Memory Usage (before load) ---
CPU RAM used: 4284.18 MB

--- Memory Usage (after load (CPU)) ---
CPU RAM used: 4675.43 MB


In [90]:
cpu_ms = benchmark_openvino(compiled_cpu, in_name, input_np)
print(f"✅ CPU inference avg: {cpu_ms:.2f} ms")
print_memory_usage("after CPU benchmark")

✅ CPU inference avg: 50.52 ms

--- Memory Usage (after CPU benchmark) ---
CPU RAM used: 4684.55 MB


In [91]:
compiled_gpu, in_name, out_name = load_openvino_model(ir_dir, device="GPU")
print_memory_usage("after load (GPU)")
gpu_ms = benchmark_openvino(compiled_gpu, in_name, input_np)
print(f"✅ GPU inference avg: {gpu_ms:.2f} ms")
print_memory_usage("after GPU benchmark")

RuntimeError: Exception from src/inference/src/cpp/core.cpp:104:
Check '!m_device_map.empty()' failed at src/plugins/intel_gpu/src/plugin/plugin.cpp:419:
[GPU] Can't get PERFORMANCE_HINT property as no supported devices found or an error happened during devices query.
[GPU] Please check OpenVINO documentation for GPU drivers setup guide.




In [95]:
precision, recall, f1 = calculate_metrics_openvino(compiled_cpu, in_name, out_name, data_loader)
print(f"| {precision:.4f} | {recall:.4f} | {f1:.4f} |")

| 0.8474 | 0.8342 | 0.8239 |
