In [3]:
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor 
import time
import psutil
from datasets import load_dataset, Audio
import evaluate
import os
import gc
from tqdm.auto import tqdm
import copy

# --- Конфигурация (основные параметры остаются) ---
MODEL_ID = "openai/whisper-large-v3"
DATASET_ID = "mozilla-foundation/common_voice_16_1"
DATASET_NAME = "ru"
DATASET_SPLIT = "test"
CALIBRATION_SPLIT = "train" 
NUM_SAMPLES_FOR_QUALITY_TEST = 50
NUM_SAMPLES_FOR_TIME_TEST = 10
NUM_WARMUP_RUNS = 3
NUM_CALIBRATION_SAMPLES = 20 
TARGET_SAMPLE_RATE = 16000


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f"Используемое устройство: {device}, тип данных: {torch_dtype}")


print(f"Загрузка модели {MODEL_ID}...")
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)
model.eval()
print("Модель загружена.")


print(f"Загрузка процессора для {MODEL_ID}...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
print("Процессор загружен.")

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")
print("Метрики WER и CER загружены.")


forced_decoder_ids = processor.get_decoder_prompt_ids(language="russian", task="transcribe")
print("forced_decoder_ids подготовлены.")

Используемое устройство: cuda, тип данных: torch.float16
Загрузка модели openai/whisper-large-v3...
Модель загружена.
Загрузка процессора для openai/whisper-large-v3...
Процессор загружен.
Метрики WER и CER загружены.
forced_decoder_ids подготовлены.


In [4]:

current_model_dtype = model.dtype
current_model_device = model.device
print(f"--- Оценка для {MODEL_ID} на {current_model_device} с dtype {current_model_dtype} ---")


forced_decoder_ids = processor.get_decoder_prompt_ids(language="russian", task="transcribe")


def evaluate_quality_whisper(model_to_eval, processor_to_use, dataset_to_eval, num_samples, device_to_use, dtype_to_use, forced_decoder_ids_for_eval):
    model_to_eval.eval()
    _predictions = []
    _references = []
    
    for i in tqdm(range(min(num_samples, len(dataset_to_eval))), desc="Оценка качества"):
        sample = dataset_to_eval[i]
        reference_text = sample["sentence"]
        if not reference_text or reference_text.strip() == "":
            continue

        raw_audio = sample["audio"]["array"]
        sampling_rate = sample["audio"]["sampling_rate"]

        if len(raw_audio) == 0:
            _predictions.append("")
            _references.append(reference_text.lower())
            continue
        
        input_features = processor_to_use(raw_audio, sampling_rate=sampling_rate, return_tensors="pt").input_features
        input_features = input_features.to(device_to_use, dtype=dtype_to_use)

        with torch.no_grad():
            predicted_ids = model_to_eval.generate(input_features, forced_decoder_ids=forced_decoder_ids_for_eval)
        
        predicted_text = processor_to_use.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        _predictions.append(predicted_text.strip().lower())
        _references.append(reference_text.lower())
        del input_features, predicted_ids
        if device_to_use.type == 'cuda':
            torch.cuda.empty_cache()

    _wer, _cer = "N/A", "N/A"
    if _predictions and _references:
        _wer = wer_metric.compute(predictions=_predictions, references=_references)
        _cer = cer_metric.compute(predictions=_predictions, references=_references)
    return _wer, _cer, _predictions, _references


def evaluate_performance_whisper(model_to_eval, processor_to_use, audio_list_for_timing, num_warmup, num_test, device_to_use, dtype_to_use, forced_decoder_ids_for_eval):
    model_to_eval.eval()
    _times = []
    _vram_usage_mb, _ram_usage_mb = "N/A", "N/A"

    for i in range(min(num_warmup, len(audio_list_for_timing))):
        raw_audio = audio_list_for_timing[i]
        input_features = processor_to_use(raw_audio, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt").input_features.to(device_to_use, dtype=dtype_to_use)
        with torch.no_grad():
            _ = model_to_eval.generate(input_features, forced_decoder_ids=forced_decoder_ids_for_eval)
        del input_features
        if device_to_use.type == 'cuda':
            torch.cuda.synchronize()
            
    if device_to_use.type == 'cuda':
        torch.cuda.reset_peak_memory_stats(device_to_use)
        torch.cuda.synchronize()

    
    ps_process = psutil.Process(os.getpid())
    ram_before_mb = ps_process.memory_info().rss / (1024 * 1024)

    for i in range(min(num_test, len(audio_list_for_timing))):
        raw_audio = audio_list_for_timing[i]
        input_features = processor_to_use(raw_audio, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt").input_features.to(device_to_use, dtype=dtype_to_use)
        
        if device_to_use.type == 'cuda':
            torch.cuda.synchronize()
        start_time = time.perf_counter()
        
        with torch.no_grad():
            _ = model_to_eval.generate(input_features, forced_decoder_ids=forced_decoder_ids_for_eval)
        
        if device_to_use.type == 'cuda':
            torch.cuda.synchronize()
        end_time = time.perf_counter()
        _times.append((end_time - start_time) * 1000) # ms
        
        if i == 0 and device_to_use.type == 'cuda': 
            _vram_usage_mb = torch.cuda.max_memory_allocated(device_to_use) / (1024 * 1024)
        del input_features
        if device_to_use.type == 'cuda':
            torch.cuda.empty_cache()

    _avg_time_ms = sum(_times) / len(_times) if _times else "N/A"
    _ram_usage_mb = ps_process.memory_info().rss / (1024 * 1024)
    
    return _avg_time_ms, _vram_usage_mb if device_to_use.type == 'cuda' else "N/A", _ram_usage_mb

print(f"Загрузка датасета {DATASET_ID} для оценки качества ({DATASET_SPLIT} split)...")
quality_dataset = load_dataset(DATASET_ID, DATASET_NAME, split=f"{DATASET_SPLIT}[:{NUM_SAMPLES_FOR_QUALITY_TEST}]", trust_remote_code=True)
quality_dataset = quality_dataset.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLE_RATE))

print(f"Загрузка датасета {DATASET_ID} для замеров производительности ({DATASET_SPLIT} split)...")
timing_dataset_raw = load_dataset(DATASET_ID, DATASET_NAME, split=f"{DATASET_SPLIT}[:{NUM_SAMPLES_FOR_TIME_TEST + NUM_WARMUP_RUNS}]", trust_remote_code=True)
timing_dataset_raw = timing_dataset_raw.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLE_RATE))
raw_audio_list_for_timing = [sample["audio"]["array"] for sample in timing_dataset_raw]


print("\nПроведение оценки качества...")
fp16_wer, fp16_cer, _, _ = evaluate_quality_whisper(model, processor, quality_dataset, NUM_SAMPLES_FOR_QUALITY_TEST, current_model_device, current_model_dtype, forced_decoder_ids)
print(f"Качество (WER): {fp16_wer if isinstance(fp16_wer, str) else fp16_wer:.4f}")
print(f"Качество (CER): {fp16_cer if isinstance(fp16_cer, str) else fp16_cer:.4f}")

print("\nПроведение замеров производительности...")
fp16_time_ms, fp16_vram_mb, fp16_ram_mb = evaluate_performance_whisper(model, processor, raw_audio_list_for_timing, NUM_WARMUP_RUNS, NUM_SAMPLES_FOR_TIME_TEST, current_model_device, current_model_dtype, forced_decoder_ids)


num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
bytes_per_param = 2 if current_model_dtype == torch.float16 else 4 
model_size_mb = (num_params * bytes_per_param) / (1024 * 1024)

print("\n--- Результаты FP16 (или FP32 на CPU) ---")
print(f"Модель: {MODEL_ID}")
print(f"Dtype: {current_model_dtype}")
print(f"Устройство: {current_model_device}")
print(f"Размер модели (прибл. MB): {model_size_mb:.2f}")
print(f"Время инференса ({current_model_device.type.upper()}, ms): {fp16_time_ms if isinstance(fp16_time_ms, str) else f'{fp16_time_ms:.2f}'}")
if current_model_device.type == 'cuda':
    print(f"Использование VRAM (MB): {fp16_vram_mb if isinstance(fp16_vram_mb, str) else f'{fp16_vram_mb:.2f}'}")
print(f"Использование RAM (MB): {fp16_ram_mb if isinstance(fp16_ram_mb, str) else f'{fp16_ram_mb:.2f}'}") 
print(f"Качество (WER): {fp16_wer if isinstance(fp16_wer, str) else f'{fp16_wer:.4f}'}")
print(f"Качество (CER): {fp16_cer if isinstance(fp16_cer, str) else f'{fp16_cer:.4f}'}")


results_fp16 = {
    "method": "FP16" if current_model_device.type == 'cuda' else "FP32 (Baseline CPU)",
    "model_id": MODEL_ID,
    "dtype": str(current_model_dtype),
    "device": str(current_model_device),
    "model_size_mb": f"{model_size_mb:.2f}",
    "time_ms": f"{fp16_time_ms:.2f}" if isinstance(fp16_time_ms, (int, float)) else fp16_time_ms,
    "vram_mb": f"{fp16_vram_mb:.2f}" if isinstance(fp16_vram_mb, (int, float)) else fp16_vram_mb,
    "ram_mb": f"{fp16_ram_mb:.2f}" if isinstance(fp16_ram_mb, (int, float)) else fp16_ram_mb,
    "wer": f"{fp16_wer:.4f}" if isinstance(fp16_wer, (int, float)) else fp16_wer,
    "cer": f"{fp16_cer:.4f}" if isinstance(fp16_cer, (int, float)) else fp16_cer,
}

print("\nРезультаты для таблицы (FP16):")
print(results_fp16)

--- Оценка для openai/whisper-large-v3 на cuda:0 с dtype torch.float16 ---
Загрузка датасета mozilla-foundation/common_voice_16_1 для оценки качества (test split)...
Загрузка датасета mozilla-foundation/common_voice_16_1 для замеров производительности (test split)...

Проведение оценки качества...


Оценка качества:   0%|          | 0/50 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Оценка качества: 100%|██████████| 50/50 [00:23<00:00,  2.15it/s]


Качество (WER): 0.1468
Качество (CER): 0.0490

Проведение замеров производительности...

--- Результаты FP16 (или FP32 на CPU) ---
Модель: openai/whisper-large-v3
Dtype: torch.float16
Устройство: cuda:0
Размер модели (прибл. MB): 2940.31
Время инференса (CUDA, ms): 536.19
Использование VRAM (MB): 3204.42
Использование RAM (MB): 1707.11
Качество (WER): 0.1468
Качество (CER): 0.0490

Результаты для таблицы (FP16):
{'method': 'FP16', 'model_id': 'openai/whisper-large-v3', 'dtype': 'torch.float16', 'device': 'cuda:0', 'model_size_mb': '2940.31', 'time_ms': '536.19', 'vram_mb': '3204.42', 'ram_mb': '1707.11', 'wer': '0.1468', 'cer': '0.0490'}


In [19]:
# --- INT8 динамическая квантизация Whisper-large-v3 (CPU) ---
print(f"--- INT8 Dynamic quantization for {MODEL_ID} (CPU) ---")

device_cpu = torch.device("cpu")

model_int8 = torch.quantization.quantize_dynamic(
    AutoModelForSpeechSeq2Seq.from_pretrained(
        MODEL_ID,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        torch_dtype=torch.float32
    ).to(device_cpu),
    {torch.nn.Linear},
    dtype=torch.qint8
)
model_int8.eval()

processor_int8 = AutoProcessor.from_pretrained(MODEL_ID)
forced_decoder_ids_int8 = processor_int8.get_decoder_prompt_ids(
    language="russian",
    task="transcribe"
)

if "quality_dataset" not in locals() or len(quality_dataset) != NUM_SAMPLES_FOR_QUALITY_TEST:
    quality_dataset = load_dataset(
        DATASET_ID, DATASET_NAME,
        split=f"{DATASET_SPLIT}[:{NUM_SAMPLES_FOR_QUALITY_TEST}]",
        trust_remote_code=True
    ).cast_column("audio", Audio(sampling_rate=TARGET_SAMPLE_RATE))

wer_int8, cer_int8, _, _ = evaluate_quality_whisper(
    model_int8,
    processor_int8,
    quality_dataset,
    NUM_SAMPLES_FOR_QUALITY_TEST,
    device_cpu,
    torch.float32,
    forced_decoder_ids_int8
)

if "raw_audio_list_for_timing" not in locals():
    timing_dataset_raw = load_dataset(
        DATASET_ID, DATASET_NAME,
        split=f"{DATASET_SPLIT}[:{NUM_SAMPLES_FOR_TIME_TEST + NUM_WARMUP_RUNS}]",
        trust_remote_code=True
    ).cast_column("audio", Audio(sampling_rate=TARGET_SAMPLE_RATE))
    raw_audio_list_for_timing = [s["audio"]["array"] for s in timing_dataset_raw]

time_int8_ms, _, ram_int8_mb = evaluate_performance_whisper(
    model_int8,
    processor_int8,
    raw_audio_list_for_timing,
    NUM_WARMUP_RUNS,
    NUM_SAMPLES_FOR_TIME_TEST,
    device_cpu,
    torch.float32,
    forced_decoder_ids_int8
)

tmp_path = "temp_int8_dynamic.pth"
torch.save(model_int8.state_dict(), tmp_path)
size_int8_mb = os.path.getsize(tmp_path) / (1024 * 1024)
os.remove(tmp_path)

print("\n--- INT8 Dynamic results ---")
print(f"WER:   {wer_int8:.4f}")
print(f"CER:   {cer_int8:.4f}")
print(f"Time:  {time_int8_ms:.2f} ms (CPU)")
print(f"RAM:   {ram_int8_mb:.2f} MB")
print(f"Size:  {size_int8_mb:.2f} MB")

results_int8_dynamic = {
    "method": "INT8 Dynamic (CPU)",
    "model_id": MODEL_ID,
    "dtype": "torch.qint8 (dynamic)",
    "device": str(device_cpu),
    "model_size_mb": f"{size_int8_mb:.2f}",
    "time_ms": f"{time_int8_ms:.2f}",
    "vram_mb": "N/A",
    "ram_mb": f"{ram_int8_mb:.2f}",
    "wer": f"{wer_int8:.4f}",
    "cer": f"{cer_int8:.4f}",
}
print("\nDict for table:")
print(results_int8_dynamic)

del model_int8
gc.collect()

--- INT8 Dynamic quantization for openai/whisper-large-v3 (CPU) ---


Оценка качества: 100%|██████████| 50/50 [01:48<00:00,  2.17s/it]



--- INT8 Dynamic results ---
WER:   0.1519
CER:   0.0500
Time:  2394.85 ms (CPU)
RAM:   71362.42 MB
Size:  1752.11 MB

Dict for table:
{'method': 'INT8 Dynamic (CPU)', 'model_id': 'openai/whisper-large-v3', 'dtype': 'torch.qint8 (dynamic)', 'device': 'cpu', 'model_size_mb': '1752.11', 'time_ms': '2394.85', 'vram_mb': 'N/A', 'ram_mb': '71362.42', 'wer': '0.1519', 'cer': '0.0500'}


11702

In [20]:
import torch.nn.utils.prune as prune

print(f"--- Неструктурированный прунинг для {MODEL_ID} ---")


PRUNING_AMOUNT = 0.2 
LAYERS_TO_PRUNE = [torch.nn.Linear]



pruning_device = device 
pruning_dtype = torch_dtype 

print(f"Загрузка модели {MODEL_ID} для прунинга на {pruning_device} с dtype {pruning_dtype}...")
model_to_prune = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=pruning_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
).to(pruning_device)
model_to_prune.eval()
print("Модель для прунинга загружена.")

print(f"Применение неструктурированного прунинга (amount={PRUNING_AMOUNT})...")
for module_name, module in model_to_prune.named_modules():
    for layer_type in LAYERS_TO_PRUNE:
        if isinstance(module, layer_type):
            prune.l1_unstructured(module, name="weight", amount=PRUNING_AMOUNT)
           
            prune.remove(module, 'weight')
print("Прунинг применен и сделан постоянным.")

if 'quality_dataset' not in locals() or 'raw_audio_list_for_timing' not in locals():
    print("Перезагрузка датасетов, так как они не найдены в текущем контексте...")
    quality_dataset = load_dataset(DATASET_ID, DATASET_NAME, split=f"{DATASET_SPLIT}[:{NUM_SAMPLES_FOR_QUALITY_TEST}]", trust_remote_code=True)
    quality_dataset = quality_dataset.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLE_RATE))
    timing_dataset_raw = load_dataset(DATASET_ID, DATASET_NAME, split=f"{DATASET_SPLIT}[:{NUM_SAMPLES_FOR_TIME_TEST + NUM_WARMUP_RUNS}]", trust_remote_code=True)
    timing_dataset_raw = timing_dataset_raw.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLE_RATE))
    raw_audio_list_for_timing = [sample["audio"]["array"] for sample in timing_dataset_raw]
    print("Датасеты перезагружены.")


print("\nПроведение оценки качества (Прореженная модель)...")
pruned_wer, pruned_cer, _, _ = evaluate_quality_whisper(
    model_to_prune, processor, quality_dataset, NUM_SAMPLES_FOR_QUALITY_TEST,
    pruning_device, pruning_dtype, forced_decoder_ids
)
print(f"Качество (WER) Прореженная модель: {pruned_wer if isinstance(pruned_wer, str) else pruned_wer:.4f}")
print(f"Качество (CER) Прореженная модель: {pruned_cer if isinstance(pruned_cer, str) else pruned_cer:.4f}")

print("\nПроведение замеров производительности (Прореженная модель)...")
pruned_time_ms, pruned_vram_mb, pruned_ram_mb = evaluate_performance_whisper(
    model_to_prune, processor, raw_audio_list_for_timing, NUM_WARMUP_RUNS, NUM_SAMPLES_FOR_TIME_TEST,
    pruning_device, pruning_dtype, forced_decoder_ids
)

pruned_num_params = sum(p.numel() for p in model_to_prune.parameters() if p.requires_grad)

pruned_nnz_params = 0
for name, module in model_to_prune.named_modules():
    for layer_type in LAYERS_TO_PRUNE:
        if isinstance(module, layer_type):
            pruned_nnz_params += torch.count_nonzero(module.weight).item()

        elif not any(isinstance(module, lt) for lt in LAYERS_TO_PRUNE) and hasattr(module, 'weight') and module.weight is not None and module.weight.requires_grad:
             pruned_nnz_params += module.weight.numel() 
        if hasattr(module, 'bias') and module.bias is not None and module.bias.requires_grad:
            pruned_nnz_params += module.bias.numel()


bytes_per_param_pruning = 2 if pruning_dtype == torch.float16 else 4

pruned_model_size_mb_effective = (pruned_nnz_params * bytes_per_param_pruning) / (1024 * 1024)

pruned_model_size_mb_total = (pruned_num_params * bytes_per_param_pruning) / (1024 * 1024)


print("\n--- Результаты: Неструктурированный прунинг ---")
print(f"Модель: {MODEL_ID}")
print(f"Метод: Прунинг (l1_unstructured, amount={PRUNING_AMOUNT})")
print(f"Dtype: {pruning_dtype}")
print(f"Устройство: {pruning_device}")
print(f"Размер модели (общий, MB): {pruned_model_size_mb_total:.2f}")
print(f"Размер модели (эффективный, MB, на осн. ненулевых): {pruned_model_size_mb_effective:.2f}")
print(f"Время инференса ({pruning_device.type.upper()}, ms): {pruned_time_ms if isinstance(pruned_time_ms, str) else f'{pruned_time_ms:.2f}'}")
if pruning_device.type == 'cuda':
    print(f"Использование VRAM (MB): {pruned_vram_mb if isinstance(pruned_vram_mb, str) else f'{pruned_vram_mb:.2f}'}")
print(f"Использование RAM (MB): {pruned_ram_mb if isinstance(pruned_ram_mb, str) else f'{pruned_ram_mb:.2f}'}")
print(f"Качество (WER): {pruned_wer if isinstance(pruned_wer, str) else f'{pruned_wer:.4f}'}")
print(f"Качество (CER): {pruned_cer if isinstance(pruned_cer, str) else f'{pruned_cer:.4f}'}")

results_pruning = {
    "method": f"Pruning L1Unstr. ({PRUNING_AMOUNT*100}%)",
    "model_id": MODEL_ID,
    "dtype": str(pruning_dtype),
    "device": str(pruning_device),
    "model_size_mb": f"{pruned_model_size_mb_effective:.2f} (eff.) / {pruned_model_size_mb_total:.2f} (total)",
    "time_ms": f"{pruned_time_ms:.2f}" if isinstance(pruned_time_ms, (int, float)) else pruned_time_ms,
    "vram_mb": f"{pruned_vram_mb:.2f}" if isinstance(pruned_vram_mb, (int, float)) else pruned_vram_mb,
    "ram_mb": f"{pruned_ram_mb:.2f}" if isinstance(pruned_ram_mb, (int, float)) else pruned_ram_mb,
    "wer": f"{pruned_wer:.4f}" if isinstance(pruned_wer, (int, float)) else pruned_wer,
    "cer": f"{pruned_cer:.4f}" if isinstance(pruned_cer, (int, float)) else pruned_cer,
}
print("\nРезультаты для таблицы (Прунинг):")
print(results_pruning)

# Очистка
del model_to_prune
gc.collect()
if pruning_device.type == 'cuda':
    torch.cuda.empty_cache()

--- Неструктурированный прунинг для openai/whisper-large-v3 ---
Загрузка модели openai/whisper-large-v3 для прунинга на cuda с dtype torch.float16...
Модель для прунинга загружена.
Применение неструктурированного прунинга (amount=0.2)...
Прунинг применен и сделан постоянным.

Проведение оценки качества (Прореженная модель)...


Оценка качества: 100%|██████████| 50/50 [00:22<00:00,  2.21it/s]


Качество (WER) Прореженная модель: 0.1215
Качество (CER) Прореженная модель: 0.0497

Проведение замеров производительности (Прореженная модель)...

--- Результаты: Неструктурированный прунинг ---
Модель: openai/whisper-large-v3
Метод: Прунинг (l1_unstructured, amount=0.2)
Dtype: torch.float16
Устройство: cuda
Размер модели (общий, MB): 2940.31
Размер модели (эффективный, MB, на осн. ненулевых): 2481.61
Время инференса (CUDA, ms): 538.42
Использование VRAM (MB): 6151.46
Использование RAM (MB): 70791.42
Качество (WER): 0.1215
Качество (CER): 0.0497

Результаты для таблицы (Прунинг):
{'method': 'Pruning L1Unstr. (20.0%)', 'model_id': 'openai/whisper-large-v3', 'dtype': 'torch.float16', 'device': 'cuda', 'model_size_mb': '2481.61 (eff.) / 2940.31 (total)', 'time_ms': '538.42', 'vram_mb': '6151.46', 'ram_mb': '70791.42', 'wer': '0.1215', 'cer': '0.0497'}
