In [None]:
import torch
import time
import psutil
import tqdm
import gc

from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
from datasets import load_metric, load_dataset
from collections import defaultdict
from evaluate import load

In [None]:
# Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# processor = AutoProcessor.from_pretrained("openai/whisper-small")
# model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
# model.generate

In [None]:
# Checkpoint under test ('openai/whisper-large-v2' is the alternative noted by the author).
MODEL_NAME = "openai/whisper-small"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Load the processor and the model from the same checkpoint, then place the model on `device`.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to(device)
model

### model size

In [None]:
# Weight footprint in MB. Each parameter is counted with its actual element size
# instead of an assumed 4 bytes, so half-precision or quantized weights are not
# over-reported.
model_size = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 ** 2)
print(f"Model size: {model_size} MB")

### inference time

In [None]:
s = 30  # length of the synthetic clip, in seconds
SAMPLE_RATE = 16000  # Hz; used both to size the clip and to tell the processor the rate

# Seeded Gaussian noise stands in for real audio, so every run benchmarks the same input.
noise_generator = torch.Generator().manual_seed(0)
sample_audio = torch.randn(SAMPLE_RATE * s, generator=noise_generator).numpy()
input_features = processor(sample_audio, sampling_rate=SAMPLE_RATE, return_tensors="pt").input_features

# Show only the shape; dumping the full feature tensor floods the cell output.
input_features.shape

In [None]:
def measure_inference_time(model, input_tensor, device, warmup_runs=0):
    """Time one `generate` + decode pass of `model` on `device`.

    Side effects: `model` is moved to `device` and stays there after the call.
    Decoding uses the module-level `processor`.

    Args:
        model: Object exposing `.to(device)` and `.generate(...)`.
        input_tensor: Input features forwarded to `model.generate`.
        device: Target device, e.g. "cpu" or "cuda".
        warmup_runs: Number of untimed `generate` calls executed before the
            measurement. The default of 0 times the very first call.

    Returns:
        Elapsed wall-clock time in milliseconds, rounded to two decimals.
    """
    model.to(device)
    input_tensor = input_tensor.to(device)
    on_cuda = torch.device(device).type == "cuda"

    # Optional untimed passes so one-off initialisation cost can be excluded.
    with torch.no_grad():
        for _ in range(warmup_runs):
            model.generate(input_tensor, return_timestamps=True)

    # Collect garbage first so the memory it frees can be returned by empty_cache().
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    if on_cuda:
        # CUDA work is asynchronous: drain pending transfers before starting the clock.
        torch.cuda.synchronize()

    # perf_counter is monotonic and meant for interval timing, unlike time.time().
    start_time = time.perf_counter()
    with torch.no_grad():
        output = model.generate(input_tensor, return_timestamps=True)
    # generate() returns a batch of sequences, so decode it as a batch.
    processor.batch_decode(output)
    if on_cuda:
        torch.cuda.synchronize()

    elapsed_time = (time.perf_counter() - start_time) * 1000  # milliseconds
    return round(elapsed_time, 2)

# Benchmark on the CPU first, then on the GPU when one is available.
cpu_time = measure_inference_time(model, input_features.cpu(), "cpu")
if torch.cuda.is_available():
    gpu_time = measure_inference_time(model, input_features, "cuda")
else:
    gpu_time = "N/A"

In [None]:
# Milliseconds of compute per second of audio. On CPU-only hosts gpu_time is the
# "N/A" placeholder string, which cannot be divided, so it is passed through as is.
cpu_time / s, (gpu_time / s if isinstance(gpu_time, (int, float)) else gpu_time)

### Замер использования RAM

In [None]:
# Resident set size of the current process, converted from bytes to MB.
process_rss_bytes = psutil.Process().memory_info().rss
ram_usage = round(process_rss_bytes / 2 ** 20, 2)
ram_usage

### Замер использования VRAM

In [None]:
# GPU memory currently allocated by torch, in MB; the "N/A" placeholder on CPU-only hosts.
if torch.cuda.is_available():
    vram_usage = round(torch.cuda.memory_allocated() / 2 ** 20, 2)
else:
    vram_usage = "N/A"
vram_usage

### Оценка качества (CER, WER)

In [None]:
# LibriSpeech "clean" configuration, test split; used below as the CER/WER reference set.
dataset = load_dataset(path="librispeech_asr", name="clean", split="test")

In [None]:
# Metric objects loaded by name: word error rate, then character error rate.
wer, cer = (load(metric_name) for metric_name in ("wer", "cer"))

def predict(batch, model):
    """Transcribe one dataset example and return the normalized text.

    Uses the module-level `processor` for feature extraction, decoding and
    text normalization.

    Args:
        batch: Mapping with an "audio" entry holding "array" and "sampling_rate".
        model: Model exposing `.device` and `.generate(...)`.

    Returns:
        The transcription after `processor.tokenizer._normalize`.
    """
    audio = batch["audio"]
    input_features = processor(
        audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
    ).input_features

    with torch.no_grad():
        # Follow the model's current device: the timing benchmark may have left
        # the shared model on the GPU, and a hard-coded "cpu" would then mismatch.
        predicted_ids = model.generate(input_features.to(model.device))[0]
    # Drop special tokens explicitly rather than relying on the normalizer to strip them.
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)
    return processor.tokenizer._normalize(transcription)

In [None]:
NUM_EXAMPLES = 64
res = defaultdict(list)

# Slicing a Dataset (dataset[:N]) returns a dict of columns, so iterating over it
# yields column names rather than examples. select() keeps row-wise iteration and
# the min() guards against asking for more rows than the split contains.
eval_subset = dataset.select(range(min(NUM_EXAMPLES, len(dataset))))

# `tqdm` is imported as a module in this notebook, so the progress bar is tqdm.tqdm.
for el in tqdm.tqdm(eval_subset, desc="transcribing"):
    # References and predictions go through the same normalizer before scoring.
    res["reference"].append(processor.tokenizer._normalize(el['text']))
    res["prediction"].append(predict(el, model))

# Error rates expressed as percentages.
cer_res = 100 * cer.compute(references=res["reference"], predictions=res["prediction"])
wer_res = 100 * wer.compute(references=res["reference"], predictions=res["prediction"])

In [None]:
# Gather every measurement into one report and print it.
quality_metrics = {"CER% ": cer_res, "WER% ": wer_res}

results = dict(
    [
        ("Модель ", MODEL_NAME),
        ("Метод ", "Whisper"),
        ("Размер весов (MB) ", round(model_size, 2)),
        ("Время инференса (CPU, ms) ", cpu_time),
        ("Время инференса (GPU, ms) ", gpu_time),
        ("Использование RAM (MB) ", ram_usage),
        ("Использование VRAM (MB) ", vram_usage),
        ("Качество ", quality_metrics),
    ]
)

print(results)