In [1]:
import torch
import time
import psutil
from tqdm import tqdm

from transformers import  WhisperForConditionalGeneration, WhisperProcessor
from datasets import load_dataset
from collections import defaultdict
from evaluate import load


  from .autonotebook import tqdm as notebook_tqdm
2025-05-02 02:34:13.159145: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746142453.170509  509452 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746142453.173560  509452 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746142453.181562  509452 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746142453.181653  509452 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746142453.181654  509452

In [2]:
# Checkpoint shared by every experiment below; swap in the commented name to try another one.
MODEL_NAME = "openai/whisper-small"  # 'openai/whisper-large-v2'

# The processor is used for feature extraction and token decoding in the helper functions.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
# Baseline model loaded with the library's default dtype.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)

In [3]:
# Word- and character-error-rate scorers from the `evaluate` package.
wer, cer = (load(metric_name) for metric_name in ("wer", "cer"))

# Test split of the "clean" LibriSpeech configuration.
dataset = load_dataset(
    "librispeech_asr",
    "clean",
    split="test",
)

In [9]:
def model_size(model):
    """Return the footprint of ``model`` in MiB, rounded to two decimals.

    The footprint is the number of bytes (``nelement() * element_size()``)
    summed over every tensor yielded by ``model.parameters()`` and
    ``model.buffers()``; nothing else is counted.
    """
    param_bytes = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_bytes = sum(b.nelement() * b.element_size() for b in model.buffers())

    bytes_per_mib = 1024**2
    return round((param_bytes + buffer_bytes) / bytes_per_mib, 2)


def inference_time(model, device="cuda", dtype=torch.float32, s=30):
    """Time one transcription of synthetic audio, normalized per second of audio.

    Builds ``s`` seconds of Gaussian noise at 16 kHz, converts it to input
    features with the module-level ``processor``, then times a single
    ``model.generate`` call together with decoding of the first sequence.

    Args:
        model: Model exposing ``generate``. It is moved to ``device`` in place.
        device: Device that receives the model and the input features.
        dtype: Dtype the input features are cast to right before generation.
        s: Length of the synthetic clip in seconds; also the divisor of the result.

    Returns:
        Elapsed milliseconds divided by ``s``, rounded to two decimals.
    """
    sample_rate = 16000
    sample_audio = torch.randn(1, sample_rate * s).squeeze().numpy()
    input_features = processor(sample_audio, sampling_rate=sample_rate, return_tensors="pt")

    model.to(device)
    input_tensor = input_features.input_features.to(device)

    # perf_counter is monotonic and high-resolution; time.time() follows the
    # system clock and can jump if the clock is adjusted mid-measurement.
    start_time = time.perf_counter()
    with torch.no_grad():
        output = model.generate(input_tensor.to(dtype), return_timestamps=True)[0]
    # Decoding is deliberately kept inside the timed region.
    processor.decode(output)
    elapsed_ms = (time.perf_counter() - start_time) * 1000

    # Single measurement with no warm-up run.
    return round(elapsed_ms / s, 2)


def predict(batch, model, device="cuda", dtype=torch.float32):
    """Transcribe one dataset example and return the normalized text.

    Args:
        batch: Example with an ``"audio"`` entry that holds ``"array"`` and
            ``"sampling_rate"``.
        model: Model exposing ``generate``. It is not moved by this function,
            so it must already live on ``device``.
        device: Device the input features are sent to.
        dtype: Dtype the input features are cast to.

    Returns:
        The first generated sequence, decoded with the module-level
        ``processor`` and passed through the tokenizer's text normalizer.
    """
    audio = batch["audio"]
    features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    ).input_features
    features = features.to(device=device, dtype=dtype)

    with torch.no_grad():
        predicted_ids = model.generate(features)[0]

    transcription = processor.decode(predicted_ids)
    # Use the public `normalize`; the private `_normalize` alias is deprecated.
    return processor.tokenizer.normalize(transcription)


def metrics(model, dataset, device="cuda", dtype=torch.float32, num_examples=64):
    """Compute WER and CER (in percent) on the first ``num_examples`` examples.

    Args:
        model: Model handed to ``predict``. It is moved to ``device`` in place.
        dataset: Dataset supporting ``select``; each example must provide
            ``"text"`` and whatever ``predict`` reads.
        device: Device used for the model and the inputs.
        dtype: Dtype the input features are cast to inside ``predict``.
        num_examples: Number of leading examples to evaluate.

    Returns:
        ``{"WER%": ..., "CER%": ...}`` with both values rounded to two decimals.
    """
    # Place the model explicitly (as inference_time does) so the result does not
    # depend on which device an earlier call happened to leave it on.
    model.to(device)

    references, predictions = [], []
    for example in tqdm(dataset.select(range(num_examples))):
        # Use the public `normalize`; the private `_normalize` alias is deprecated.
        references.append(processor.tokenizer.normalize(example["text"]))
        predictions.append(predict(example, model, device=device, dtype=dtype))

    cer_res = 100 * cer.compute(references=references, predictions=predictions)
    wer_res = 100 * wer.compute(references=references, predictions=predictions)

    return {"WER%": round(wer_res, 2), "CER%": round(cer_res, 2)}

In [7]:
# Baseline report: footprint, per-second latency on CPU and GPU, then error rates on GPU.
print(f"Model size:  {model_size(model)}")
print(f"CPU inference time :  {inference_time(model, device='cpu')}")
print(f"GPU inference time :  {inference_time(model, device='cuda')}")
print(f"Metrics:  {metrics(model, dataset, device='cuda')}")

Model size:  922.1455078125
CPU inference time :  389.37
GPU inference time :  151.4


100%|██████████| 64/64 [00:21<00:00,  2.97it/s]

Metrics:  {'WER%': 3.83, 'CER%': 1.34}





## Quantization

In [10]:
# FP16: reload the same checkpoint with half-precision weights and repeat the measurements.
model_fp16 = WhisperForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
)

# Inputs must be cast to the same dtype as the weights, hence dtype=torch.float16 throughout.
print(f"Model size:  {model_size(model_fp16)}")
print(f"CPU inference time :  {inference_time(model_fp16, device='cpu', dtype=torch.float16)}")
print(f"GPU inference time :  {inference_time(model_fp16, device='cuda', dtype=torch.float16)}")
print(f"Metrics:  {metrics(model_fp16, dataset, device='cuda', dtype=torch.float16)}")

Model size:  461.07
CPU inference time :  7239.61
GPU inference time :  110.85


100%|██████████| 64/64 [00:14<00:00,  4.36it/s]

Metrics:  {'WER%': 3.83, 'CER%': 1.34}





In [8]:
# INT8: post-training dynamic quantization, measured on CPU only.
# Only nn.Linear is requested. Dynamic quantization has no module mapping for
# nn.Conv1d, so listing it in the spec was silently ignored and only suggested
# that the convolutional front-end was being quantized.
# Note that `model.to('cpu')` also moves the FP32 `model` itself to the CPU.
model_int8 = torch.quantization.quantize_dynamic(
    model.to('cpu'),
    {torch.nn.Linear},
    dtype=torch.qint8
)

# NOTE(review): model_size only sums parameters() and buffers(); confirm that the
# quantized Linear weights are exposed through either before comparing this
# figure with the FP32/FP16 sizes.
print('Model size: ', model_size(model_int8))
print('CPU inference time : ', inference_time(model_int8, device="cpu"))
print('Metrics: ', metrics(model_int8, dataset, device="cpu"))

Model size:  165.4775390625
CPU inference time :  287.9


100%|██████████| 64/64 [02:04<00:00,  1.95s/it]

Metrics:  {'WER%': 4.26, 'CER%': 1.53}





## Pruning

In [11]:
import copy
import itertools
from torch.nn.utils import prune

# Work on an independent copy so that pruning does not modify `model`.
p_model = copy.deepcopy(model)

In [12]:
# Print the first encoder layer to see which submodules (attention projections,
# fc1/fc2) are available as pruning targets.
module = p_model.model.encoder.layers[0]
print(module)

WhisperEncoderLayer(
  (self_attn): WhisperSdpaAttention(
    (k_proj): Linear(in_features=768, out_features=768, bias=False)
    (v_proj): Linear(in_features=768, out_features=768, bias=True)
    (q_proj): Linear(in_features=768, out_features=768, bias=True)
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
  )
  (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (activation_fn): GELUActivation()
  (fc1): Linear(in_features=768, out_features=3072, bias=True)
  (fc2): Linear(in_features=3072, out_features=768, bias=True)
  (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)


In [13]:
# Collect the weight of every attention projection and feed-forward layer in
# each encoder block. Iterating over the layers themselves (instead of a
# hard-coded range(12)) keeps this correct for checkpoints with a different
# encoder depth. Each (module, "weight") pair appears exactly once: out_proj
# used to be listed twice, which double-counted it in the global threshold and
# applied pruning to it twice.
parameters_to_prune = tuple(
    (target, "weight")
    for layer in p_model.model.encoder.layers
    for target in (
        layer.self_attn.k_proj,
        layer.self_attn.v_proj,
        layer.self_attn.q_proj,
        layer.self_attn.out_proj,
        layer.fc1,
        layer.fc2,
    )
)

# Zero the 20% of the collected weights with the smallest magnitude, ranked
# across all listed tensors together rather than per tensor.
# NOTE(review): the pruning masks stay attached to the modules afterwards;
# consider calling prune.remove(module, "weight") on each target if the
# reparametrization should be made permanent before measuring model size.
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.2,
)

In [14]:
# Same report as for the baseline, now on the pruned copy.
print(f"Model size:  {model_size(p_model)}")
print(f"CPU inference time :  {inference_time(p_model, device='cpu')}")
print(f"GPU inference time :  {inference_time(p_model, device='cuda')}")
print(f"Metrics:  {metrics(p_model, dataset, device='cuda')}")

Model size:  1246.15
CPU inference time :  387.41
GPU inference time :  141.56


100%|██████████| 64/64 [00:22<00:00,  2.80it/s]

Metrics:  {'WER%': 3.83, 'CER%': 1.34}



