In [None]:
import torch
import time
import psutil
from tqdm import tqdm

from transformers import  WhisperForConditionalGeneration, WhisperProcessor
from datasets import load_dataset
from collections import defaultdict
from evaluate import load

In [3]:
MODEL_NAME = "openai/whisper-small"  # 'openai/whisper-large-v2'

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model

cuda


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

### model size

In [4]:
model_size = sum(p.numel() for p in model.parameters()) * 4 / (1024 ** 2) 
print(f"Model size: {model_size} MB")

Model size: 922.1455078125 MB


### inference time

In [5]:
s = 30
sample_audio = torch.randn(1, 16000 * s).squeeze().numpy()
input_features = processor(sample_audio, sampling_rate=16000, return_tensors="pt").input_features
input_features

tensor([[[1.1975, 1.2233, 1.2506,  ..., 1.0828, 1.1040, 1.1924],
         [0.9256, 1.0169, 1.1573,  ..., 1.2372, 1.0638, 1.1089],
         [1.0905, 1.0836, 1.0123,  ..., 1.2328, 1.1342, 1.1447],
         ...,
         [1.1068, 1.1432, 1.1703,  ..., 1.1324, 1.1899, 1.0854],
         [1.1390, 1.1184, 1.2220,  ..., 1.0569, 1.1890, 1.1088],
         [1.1679, 1.1218, 1.1333,  ..., 1.0861, 1.1049, 1.1165]]])

In [6]:
def measure_inference_time(model, input_tensor, device):
    model.to(device)
    input_tensor = input_tensor.to(device)
    
    # Замеряем время
    start_time = time.time()
    with torch.no_grad():
        output = model.generate(input_tensor, return_timestamps=True)[0]
    processor.decode(output)

    elapsed_time = (time.time() - start_time) * 1000  # Время в миллисекундах
    return round(elapsed_time, 2)

# Запуск теста на CPU и GPU
cpu_time = measure_inference_time(model, input_features.cpu(), "cpu")
gpu_time = measure_inference_time(model, input_features, "cuda") if torch.cuda.is_available() else "N/A"

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [7]:
cpu_time / s, gpu_time / s

(377.64500000000004, 156.31666666666666)

### Замер использования RAM

In [8]:
ram_usage = round(psutil.Process().memory_info().rss / (1024 ** 2), 2)  # В MB
ram_usage

2385.38

### Замер использования VRAM

In [9]:
vram_usage = round(torch.cuda.memory_allocated() / (1024 ** 2), 2) if torch.cuda.is_available() else "N/A"
vram_usage

931.01

### Оценка качества (CER, WER)

In [15]:
dataset = load_dataset("librispeech_asr", "clean", split="test")
# dataset = load_dataset("librispeech_asr", "test.clean")

Generating train.100 split: 100%|██████████| 28539/28539 [00:26<00:00, 1079.19 examples/s]
Generating train.360 split: 100%|██████████| 104014/104014 [01:33<00:00, 1116.93 examples/s]
Generating validation split: 100%|██████████| 2703/2703 [00:01<00:00, 1812.84 examples/s]
Generating test split: 100%|██████████| 2620/2620 [00:01<00:00, 1645.37 examples/s]


In [27]:
wer = load("wer")
cer = load("cer")

def predict(batch, model):
    audio = batch["audio"]
    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features

    with torch.no_grad():
        predicted_ids = model.generate(input_features.to("cuda"))[0]
    transcription = processor.decode(predicted_ids)
    return processor.tokenizer._normalize(transcription)

In [28]:
NUM_EXAMPLES = 64
res = defaultdict(list)

for el in tqdm(dataset.select(range(NUM_EXAMPLES))):
    res["reference"].append(processor.tokenizer._normalize(el['text']))
    res["prediction"].append(predict(el, model))

cer_res = 100 * cer.compute(references=res["reference"], predictions=res["prediction"])
wer_res = 100 * wer.compute(references=res["reference"], predictions=res["prediction"])

100%|██████████| 64/64 [00:23<00:00,  2.68it/s]


In [30]:
# Вывод результатов
results = {
    "Модель ": MODEL_NAME,
    "Метод ": "Whisper",
    "Размер весов (MB) ": round(model_size, 2),
    "Время инференса (CPU, ms) ": cpu_time / s,
    "Время инференса (GPU, ms) ": gpu_time / s,
    "Использование RAM (MB) ": ram_usage,
    "Использование VRAM (MB) ": vram_usage,
    "Качество ": {
        "CER% ": cer_res,
        "WER% ": wer_res,
    }
}

results

{'Модель ': 'openai/whisper-small',
 'Метод ': 'Whisper',
 'Размер весов (MB) ': 922.15,
 'Время инференса (CPU, ms) ': 377.64500000000004,
 'Время инференса (GPU, ms) ': 156.31666666666666,
 'Использование RAM (MB) ': 2385.38,
 'Использование VRAM (MB) ': 931.01,
 'Качество ': {'CER% ': 1.3379017784304128, 'WER% ': 3.829787234042553}}