In [1]:
import torch
import re
import time
import psutil
import os
import numpy as np
from datasets import load_dataset
from itertools import islice
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
from evaluate import load as load_metric
from transformers.modeling_outputs import BaseModelOutput
from openvino.frontend import FrontEndManager
from tqdm import tqdm
from openvino.runtime import serialize

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm







In [2]:
try:
    import pynvml
    pynvml.nvmlInit()
    nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    use_nvml = True
except Exception:
    nvml_handle = None
    use_nvml = False

proc = psutil.Process(os.getpid())

In [3]:
wer = load_metric("wer")
cer = load_metric("cer")

def normalize(text: str) -> str:
    t = text.lower().strip()
    t = re.sub(r"[^\w\s]", "", t)
    t = re.sub(r"\s+", " ", t)
    return t

In [4]:
NUM_EX = 5
stream = load_dataset(
    "librispeech_asr", 
    "clean", 
    split="test", 
    streaming=True, 
    trust_remote_code=True
)
dataset = list(islice(stream, NUM_EX))

# Метрики на модели без конвертации

In [5]:
MODEL_NAME = "openai/whisper-small"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")

Device: cuda


In [6]:
processor = WhisperProcessor.from_pretrained(MODEL_NAME)

def load_model(device=DEVICE):
    model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    return model

In [7]:
def predict_example(model, batch, device="cpu"):

    audio = batch["audio"]
    input_feats = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt"
    ).input_features

    # измеряем время инференса
    start = time.time()
    with torch.no_grad():
        ids = model.generate(input_feats.to(device))[0]
    elapsed_ms = (time.time() - start) * 1000

    # декодируем и нормализуем текст
    raw = processor.decode(ids)
    text = processor.tokenizer._normalize(raw)

    return text, elapsed_ms

In [8]:
# CPU
proc = psutil.Process(os.getpid())

baseline_rss = proc.memory_info().rss
model_orig = load_model('cpu')

rss_after_cpu = proc.memory_info().rss
total_ram_cpu_mb = rss_after_cpu / (1024**2)

print(f"CPU модель, RAM зянаято: {total_ram_cpu_mb:.1f} MB", end='')

orig_refs, orig_preds, orig_times = [], [], []

for ex in tqdm(dataset, desc="инференс оригинальной модели (CPU)"):
    pred, t_ms = predict_example(model_orig, ex, device='cpu')
    orig_refs.append(processor.tokenizer._normalize(ex['text']))
    orig_preds.append(pred)
    orig_times.append(t_ms)


avg_time = np.mean(orig_times)
orig_wer = wer.compute(references=orig_refs, predictions=orig_preds) * 100
orig_cer = cer.compute(references=orig_refs, predictions=orig_preds) * 100

print(f"CPU: avg time {avg_time:.2f} ms | WER {orig_wer:.2f}% | CER {orig_cer:.2f}%")

CPU модель, RAM зянаято: 1105.8 MB

инференс оригинальной модели (CPU):   0%|          | 0/5 [00:00<?, ?it/s]Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
инференс оригинальной модели (CPU): 100%|██████████| 5/5 [00:21<00:00,  4.24s/it]

CPU: avg time 4218.02 ms | WER 1.27% | CER 1.03%





In [9]:
# GPU
if DEVICE == 'cuda':
    ram_cpu_before = proc.memory_info().rss
    torch.cuda.reset_peak_memory_stats()
    model_orig_gpu = load_model('cuda').to('cuda')
    ram_cpu_after = proc.memory_info().rss
    gpu_peak_load = torch.cuda.max_memory_allocated()
    ram_cpu_load = (ram_cpu_after - ram_cpu_before) / (1024**2)
    print(f"GPU модель, VRAM занято: {gpu_peak_load/1024**2:.2f} MB")

    gpu_refs, gpu_preds, gpu_times = [], [], []

    for ex in tqdm(dataset, desc="инференс оригинальной модели (GPU)"):
        pred, t_ms = predict_example(model_orig_gpu, ex, device=DEVICE)
        gpu_refs.append(processor.tokenizer._normalize(ex['text']))
        gpu_preds.append(pred)
        gpu_times.append(t_ms)

    avg_time_gpu = np.mean(gpu_times)
    gpu_wer = wer.compute(references=gpu_refs, predictions=gpu_preds) * 100
    gpu_cer = cer.compute(references=gpu_refs, predictions=gpu_preds) * 100

    print(f"GPU: avg time {avg_time_gpu:.2f} ms | WER {gpu_wer:.2f}% | CER {gpu_cer:.2f}%")

GPU модель, VRAM занято: 922.88 MB


инференс оригинальной модели (GPU): 100%|██████████| 5/5 [00:03<00:00,  1.38it/s]

GPU: avg time 705.48 ms | WER 1.27% | CER 1.03%





# ONNX (cpu)

### экспорт одним файлом

In [10]:
# !optimum-cli export onnx \
#   --model openai/whisper-small \
#   --task automatic-speech-recognition \
#   --feature_size 80 \
#   --audio_sequence_length 3000 \
#   --monolith \
#   whisper_onnx

### экспорт несколькими файлами

In [11]:
# !optimum-cli export onnx \
#   --model openai/whisper-small \
#   --task automatic-speech-recognition \
#   --feature_size 80 \
#   --audio_sequence_length 3000 \
#   whisper_onnx

In [12]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
rss0 = proc.memory_info().rss
ort_whisper = ORTModelForSpeechSeq2Seq.from_pretrained(
    "whisper_onnx_parts",
    provider="CPUExecutionProvider",
    use_cache=False,
    use_past_format=False
)
rss1 = proc.memory_info().rss
print(f"Модель заняла {(rss1-rss0)/1024**2:.1f} MB в памяти")

Модель заняла 2084.8 MB в памяти


In [13]:
def infer_onnx(dataset, device):
    preds, refs = [], []
    times = []

    for sample in tqdm(dataset, desc=f"ONNX inference on {device}"):
        inputs = processor.feature_extractor(
            sample["audio"]["array"],
            sampling_rate=16_000,
            return_tensors="pt"
        ).to(device)

        t0 = time.time()
        gen_ids = ort_whisper.generate(
            input_features=inputs.input_features,
            max_length=448,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
        t1 = time.time()

        text = processor.batch_decode(gen_ids, skip_special_tokens=True)[0].lower().strip()

        preds.append(text)
        refs.append(sample["text"].lower().strip())
        times.append(t1 - t0)

    avg_time = sum(times) / len(times)
    print(f"Среднее время инференса {device}: {avg_time:.3f} s")

    return preds, refs

In [14]:
preds, refs = infer_onnx(dataset, "cpu")

refs_n  = [normalize(r) for r in refs]
preds_n = [normalize(p) for p in preds]

wer_score = wer.compute(predictions=preds_n, references=refs_n)
cer_score = cer.compute(predictions=preds_n, references=refs_n)

print(f"ONNX cpu: {wer_score:.2%}, CER: {cer_score:.2%}")

ONNX inference on cpu: 100%|██████████| 5/5 [00:46<00:00,  9.40s/it]

Среднее время инференса cpu: 9.368 s
ONNX cpu: 1.27%, CER: 1.02%





# ONNX (gpu)

In [15]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
ort_whisper = ORTModelForSpeechSeq2Seq.from_pretrained(
    "whisper_onnx_parts",
    provider="CUDAExecutionProvider",
    use_cache=False,
    use_past_format=False,
    use_io_binding=False
)

In [16]:
pynvml.nvmlInit()
gpu_index = 0
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)

processor = WhisperProcessor.from_pretrained("openai/whisper-small")

mem_before = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).used
ort_whisper = ORTModelForSpeechSeq2Seq.from_pretrained(
    "whisper_onnx_parts",
    provider="CUDAExecutionProvider",
    use_cache=False,
    use_past_format=False,
    use_io_binding=False
)
mem_after = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).used

print(f"VRAM занято моделью ONNX Runtime: {(mem_after - mem_before) / 1024**2:.2f} MB")

VRAM занято моделью ONNX Runtime: 1681.09 MB


In [17]:
preds, refs = infer_onnx(dataset, "cuda")

refs_n  = [normalize(r) for r in refs]
preds_n = [normalize(p) for p in preds]

wer_score = wer.compute(predictions=preds_n, references=refs_n)
cer_score = cer.compute(predictions=preds_n, references=refs_n)

print(f"ONNX gpu: {wer_score:.2%}, CER: {cer_score:.2%}")

ONNX inference on cuda: 100%|██████████| 5/5 [00:03<00:00,  1.39it/s]

Среднее время инференса cuda: 0.696 s
ONNX gpu: 1.27%, CER: 1.02%





# OpenVINO

В OpenVINO не получилось экспртировть всю модль Whisper, только encoder

Так же нет прямой поддержки CUDA, поэтому было принято решение провести тесты только на CPU

In [18]:
TARGET_SR = 16000

BASE_DIR = os.getcwd()
ONNX_DIR_TMP = os.path.join(BASE_DIR, "onnx_encoder") # экспорт в openvino через onnx
IR_DIR = os.path.join(BASE_DIR, "openvino_encoder")
os.makedirs(ONNX_DIR_TMP, exist_ok=True)
os.makedirs(IR_DIR, exist_ok=True)

model_name = "openai/whisper-small"
model = WhisperForConditionalGeneration.from_pretrained(model_name).cpu()
processor = WhisperProcessor.from_pretrained(model_name)
encoder = model.model.encoder
n_mels = processor.feature_extractor.feature_size
n_audio = model.config.max_source_positions
seq_len = 2 * n_audio  

In [19]:
def export_encoder_to_onnx(encoder, n_mels, seq_len=3000):
    encoder.eval()
    dummy_input = torch.zeros((1, n_mels, seq_len))
    onnx_path = os.path.join(ONNX_DIR_TMP, "whisper_encoder.onnx")

    torch.onnx.export(
        encoder,
        dummy_input,
        onnx_path,
        export_params=True,
        opset_version=17,
        input_names=["mel"],
        output_names=["output_features"],
        dynamic_axes={
            "mel": {2: "seq_len"},
            "output_features": {2: "seq_len"}
        }
    )
    return onnx_path

onnx_model_path = export_encoder_to_onnx(encoder, n_mels)

In [20]:
def convert_onnx_to_openvino(onnx_path):
    xml_path = os.path.join(IR_DIR, "whisper_encoder.xml")

    fem = FrontEndManager()
    onnx_fe = fem.load_by_framework("onnx")
    onnx_mod = onnx_fe.load(onnx_path)
    ov_model = onnx_fe.convert(onnx_mod)

    serialize(ov_model, xml_path=xml_path)

convert_onnx_to_openvino(onnx_model_path)

In [21]:
from openvino.runtime import Core

core = Core()
ov_model = core.read_model(model=os.path.join(IR_DIR, "whisper_encoder.xml"))
ram_ov_before = proc.memory_info().rss / (1024**2)
compiled_ov = core.compile_model(model=ov_model, device_name="CPU")
ram_ov_after = proc.memory_info().rss / (1024**2)
print(f"OpenVINO CPU, RAM занято: {ram_ov_after - ram_ov_before:.2f} MB")

OpenVINO CPU, RAM занято: 698.43 MB


In [22]:
def prepare_mel(feature_extractor, audio_array, seq_len):
    inputs = feature_extractor(
        audio_array,
        sampling_rate=TARGET_SR,
        return_tensors="np"
    )
    mel = inputs.input_features  # (1, n_mels, T)
    B, M, T = mel.shape

    if T > seq_len:
        mel = mel[:, :, :seq_len]
    elif T < seq_len:
        pad_width = seq_len - T
        mel = np.pad(mel, ((0,0),(0,0),(0,pad_width)), constant_values=0.0)
    return mel.astype(np.float32)  # (1, M, seq_len)

In [23]:
def infer_openvino(compiled_model, mel: np.ndarray) -> np.ndarray:
    result = compiled_model([mel])
    output_tensor = result[compiled_model.output(0)]
    return output_tensor

In [24]:
from onnxruntime import InferenceSession
from openvino.runtime import Core

ort_sess = InferenceSession(onnx_model_path)
core = Core()
ov_model = core.read_model(model=os.path.join(IR_DIR, "whisper_encoder.xml"))
compiled_ov = core.compile_model(model=ov_model, device_name="CPU")

In [25]:
model.eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [26]:
device = "cpu"

In [27]:
preds_ov = []
refs = [sample["text"].lower().strip() for sample in dataset]
times = []

for sample in tqdm(dataset):
    t0 = time.time()

    mel = prepare_mel(processor.feature_extractor, sample["audio"]["array"], seq_len)
    audio_feats = infer_openvino(compiled_ov, mel)
    hs = torch.from_numpy(audio_feats).permute(0, 1, 2).to(device)
    encoder_outputs = BaseModelOutput(last_hidden_state=hs)
    gen_ids = model.generate(
        encoder_outputs=encoder_outputs,
        max_length=448,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    text = processor.tokenizer.batch_decode(gen_ids, skip_special_tokens=True)[0].lower().strip()
    preds_ov.append(text)

    t1 = time.time()
    times.append(t1 - t0)

avg_time_ms = np.mean(times) * 1000
print(f"Среднее время инференса: {avg_time_ms:.2f} ms")

preds_norm = [normalize(t) for t in preds_ov]
refs_norm  = [normalize(t) for t in refs]
wer_ov = wer.compute(predictions=preds_norm, references=refs_norm)
cer_ov = cer.compute(predictions=preds_norm, references=refs_norm)
print(f"OpenVINO CPU метрики: {wer_ov:.2%}, CER: {cer_ov:.2%}")

100%|██████████| 5/5 [00:15<00:00,  3.18s/it]

Среднее время инференса: 3175.36 ms
OpenVINO CPU метрики: 1.27%, CER: 1.02%



