In [None]:
import copy
import gc
import gzip
import io
import os
import shutil
import time
from collections import defaultdict
from itertools import islice

import numpy as np
import psutil
import torch
from datasets import load_dataset
from evaluate import load
from sklearn.cluster import KMeans
from tqdm import tqdm
from transformers import WhisperForConditionalGeneration, WhisperProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Experiment configuration ---
# Pick the accelerator when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

MODEL_NAME = "openai/whisper-small"      # checkpoint id passed to from_pretrained
N_CLUSTERS = 128                         # default centroid count per clustered tensor
NUM_EX = 32                              # number of test examples taken from the stream
clustered_path = "whisper_clustered.pt"  # where the clustered state dict is written

print(f"Device: {DEVICE}")

Device: cuda


In [3]:
# Shared Whisper processor for MODEL_NAME. The inference helper and the
# evaluation cells use it for feature extraction, decoding and text normalization.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)

def load_model(device=DEVICE):
    """Load the ``MODEL_NAME`` checkpoint and place it on ``device``.

    Every call goes through ``from_pretrained`` again, so the caller receives
    a new model object rather than a shared one.

    Args:
        device: Target device forwarded to ``.to()``; defaults to ``DEVICE``.

    Returns:
        The ``WhisperForConditionalGeneration`` instance after ``.to(device)``.
    """
    return WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

In [4]:
# Word- and character-error-rate metric objects used by every evaluation cell.
wer, cer = (load(metric_name) for metric_name in ("wer", "cer"))

In [5]:
def predict_example(model, batch, device=DEVICE):
    """Transcribe one dataset example and measure latency and memory use.

    Args:
        model: Whisper model that already lives on ``device``.
        batch: Dataset record; ``batch["audio"]`` must provide ``"array"`` and
            ``"sampling_rate"``.
        device: Device the model is on, as a string or ``torch.device``.

    Returns:
        A tuple ``(text, elapsed_ms, mem_used_mb)``:
          * ``text`` - decoded transcription after tokenizer normalization;
          * ``elapsed_ms`` - wall-clock time of the host-to-device copy plus
            ``generate`` in milliseconds (feature extraction is not timed);
          * ``mem_used_mb`` - on CUDA, the peak allocated device memory during
            the call in MiB (absolute, so it includes the resident weights);
            on any other device, the change in process RSS across the call
            in MiB (may be negative).
    """
    audio = batch["audio"]
    input_feats = processor(
        audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
    ).input_features

    # Normalising through torch.device makes 'cuda', 'cuda:0' and
    # torch.device(...) arguments all select the same branch.
    dev = torch.device(device)
    on_cuda = dev.type == "cuda"

    # Memory baseline is taken BEFORE the timer starts so that the bookkeeping
    # itself is not counted as inference latency.
    if on_cuda:
        torch.cuda.reset_peak_memory_stats(dev)
        # Drain pending kernels so earlier work is not billed to this example.
        torch.cuda.synchronize(dev)
        rss_before = None
    else:
        rss_before = psutil.Process().memory_info().rss

    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    with torch.no_grad():
        ids = model.generate(input_feats.to(device))[0]
    if on_cuda:
        # Wait for queued GPU work to finish before stopping the clock.
        torch.cuda.synchronize(dev)
    elapsed_ms = (time.perf_counter() - start) * 1000

    if on_cuda:
        mem_used_mb = torch.cuda.max_memory_allocated(dev) / (1024**2)
    else:
        rss_after = psutil.Process().memory_info().rss
        mem_used_mb = (rss_after - rss_before) / (1024**2)

    txt = processor.decode(ids)
    # NOTE(review): `_normalize` is a private tokenizer method; it is kept so
    # predictions are normalized the same way as the references in the
    # evaluation cells. Consider a public equivalent for both.
    return processor.tokenizer._normalize(txt), elapsed_ms, mem_used_mb

In [6]:
# Open the LibriSpeech "clean" test split as a stream and materialize only the
# first NUM_EX examples into a list, which every evaluation loop reuses.
# NOTE(review): trust_remote_code=True presumably lets the dataset's loading
# script run locally - confirm this is required and that the source is trusted.
dataset_stream = load_dataset("librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
dataset = list(islice(dataset_stream, NUM_EX))

# Метрики на исходной модели

### cpu

In [10]:
# Baseline run: unmodified model on CPU, collecting per-example statistics.
model_orig = load_model('cpu')
orig_refs = []
orig_preds = []
orig_times = []
orig_rams = []
for ex in tqdm(dataset, desc="инференс оригинальной модели"):
    # The reference goes through the same tokenizer normalization that
    # predict_example applies to its output.
    orig_refs.append(processor.tokenizer._normalize(ex['text']))
    pred, t_ms, m_mb = predict_example(model_orig, ex, device='cpu')
    orig_preds.append(pred)
    orig_rams.append(m_mb)
    orig_times.append(t_ms)
# Scaled by 100 for the percentage printout below.
orig_wer = 100 * wer.compute(predictions=orig_preds, references=orig_refs)
orig_cer = 100 * cer.compute(predictions=orig_preds, references=orig_refs)
print(f"CPU: avg time {np.mean(orig_times):.2f} ms | avg RAM Δ {np.mean(orig_rams):.2f} MB | WER {orig_wer:.2f}% | CER {orig_cer:.2f}%")

инференс оригинальной модели: 100%|██████████| 32/32 [01:55<00:00,  3.62s/it]

CPU: avg time 3601.86 ms | avg RAM Δ 29.71 MB | WER 3.66% | CER 1.32%





### gpu

In [11]:
if DEVICE == 'cuda':
    # Same baseline measurement as the CPU cell, with a separate copy on the GPU.
    model_orig_gpu = load_model('cuda')
    gpu_refs = []
    gpu_preds = []
    gpu_times = []
    gpu_vrams = []
    for ex in tqdm(dataset, desc="Original GPU Inference"):
        # The reference goes through the same tokenizer normalization that
        # predict_example applies to its output.
        gpu_refs.append(processor.tokenizer._normalize(ex['text']))
        pred, t_ms, m_mb = predict_example(model_orig_gpu, ex, 'cuda')
        gpu_preds.append(pred)
        gpu_vrams.append(m_mb)
        gpu_times.append(t_ms)
    # Scaled by 100 for the percentage printout below.
    gpu_wer = 100 * wer.compute(predictions=gpu_preds, references=gpu_refs)
    gpu_cer = 100 * cer.compute(predictions=gpu_preds, references=gpu_refs)
    print(f"GPU - Time: {np.mean(gpu_times):.2f} ms, VRAM Peak: {np.max(gpu_vrams):.2f} MB, WER: {gpu_wer:.2f}%, CER: {gpu_cer:.2f}%")

Original GPU Inference: 100%|██████████| 32/32 [00:15<00:00,  2.04it/s]

GPU - Time: 477.27 ms, VRAM Peak: 1053.03 MB, WER: 3.66%, CER: 1.32%





# Сохраним оригинальную модель в gzip

In [10]:
# Serialize the baseline weights and gzip them; the compressed size is the
# reference point for the clustered model saved later.
orig_path = "whisper_original.pt"
torch.save(model_orig.state_dict(), orig_path)
with open(orig_path, 'rb') as f_in, gzip.open(orig_path + '.gz', 'wb') as f_out:
    # Copy in fixed-size chunks. Iterating a binary file "by lines" (as
    # writelines(f_in) does) splits on newline bytes, giving arbitrary pieces.
    shutil.copyfileobj(f_in, f_out)
orig_gz_size = os.path.getsize(orig_path + '.gz') / (1024**2)
print(f"Размер изначальной модели в gzip: {orig_gz_size:.2f} MB")

Размер изначальной модели в gzip: 535.87 MB


# Кластеризуем веса

In [11]:
def cluster_model_weights(model, n_clusters=N_CLUSTERS, exclude_prefixes=None):
    """Return a CPU copy of ``model`` whose weights are replaced by k-means centroids.

    Each parameter tensor that is not excluded is flattened, clustered
    independently with 1-D k-means, and overwritten with the centroid of the
    cluster each value fell into. The tensors keep their original dtype and
    shape; the input ``model`` is left untouched because a deep copy is edited.

    Args:
        model: Model whose ``named_parameters()`` are clustered.
        n_clusters: Maximum number of centroids per tensor; tensors with fewer
            elements use one centroid per element at most.
        exclude_prefixes: Parameter-name prefixes to leave unmodified. When
            ``None``, a default list is used (encoder convolutions, positional
            and token embeddings, and output-projection style names).

    Returns:
        The deep-copied, clustered model on CPU.

    Side effects:
        Prints the total size of labels plus centroids for all clustered
        tensors, i.e. what an index-plus-codebook encoding would occupy.
    """
    if exclude_prefixes is None:
        exclude_prefixes = [
            "model.encoder.conv1", "model.encoder.conv2", "model.encoder.embed_positions",
            "model.decoder.embed_tokens", "model.decoder.embed_positions",
            "model.logits_proj", "lm_head", "final_logits_bias"
        ]
    # str.startswith accepts a tuple, so the prefix test is a single call.
    skip_prefixes = tuple(exclude_prefixes)
    model_cpu = copy.deepcopy(model).to('cpu')
    params = list(model_cpu.named_parameters())
    total_bytes = 0
    with torch.no_grad():
        for name, param in tqdm(params, desc="Кластеризация", total=len(params)):
            if name.startswith(skip_prefixes):
                continue
            if param.numel() == 0:
                # Nothing to cluster; KMeans cannot be fitted on zero samples.
                continue
            tensor = param.data
            # reshape (not view) also handles non-contiguous parameters.
            flat = tensor.reshape(-1, 1).cpu().numpy()
            k = min(n_clusters, flat.shape[0])
            kmeans = KMeans(n_clusters=k, random_state=0).fit(flat)
            centers = kmeans.cluster_centers_.astype(flat.dtype)
            # Smallest unsigned dtype that can hold the largest label (k - 1).
            # The previous int16 fallback wrapped to negative values for
            # k > 32767 and would then have selected the wrong centroids.
            labels = kmeans.labels_.astype(np.min_scalar_type(k - 1))
            # Rebuild from the untouched integer labels so the compact storage
            # dtype can never affect which centroid is picked.
            clustered = centers[kmeans.labels_].reshape(tensor.shape)
            param.data.copy_(torch.from_numpy(clustered))
            total_bytes += labels.nbytes + centers.nbytes
    print(f"Clustered data bytes: {total_bytes/1e6:.2f} MB")
    return model_cpu

In [12]:
# A fresh CPU copy of the pretrained weights is the input to clustering;
# cluster_model_weights returns a separate clustered model.
model_to_cluster = load_model('cpu')
model_clustered_cpu = cluster_model_weights(model=model_to_cluster)

[WinError 2] Не удается найти указанный файл
  File "d:\vscode_projects\itmo_compression\hw_3\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\ivann\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ivann\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\ivann\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Кластеризация: 100%|██████████| 479/479 [21:50<00:00,  2.74s/it]

Clustered data bytes: 198.69 MB





# Сохраняем новую модель

In [None]:
# Serialize the clustered weights and gzip them so the size can be compared
# with the gzip of the original model.
torch.save(model_clustered_cpu.state_dict(), clustered_path)
with open(clustered_path, 'rb') as f_in, gzip.open(clustered_path + '.gz', 'wb') as f_out:
    # Copy in fixed-size chunks. Iterating a binary file "by lines" (as
    # writelines(f_in) does) splits on newline bytes, giving arbitrary pieces.
    shutil.copyfileobj(f_in, f_out)
clustered_gz_size = os.path.getsize(clustered_path + '.gz') / (1024**2)
print(f"Размер новой модели в gzip: {clustered_gz_size:.2f} MB")

Размер новой модели в gzip: 321.95 MB


In [15]:
# Release the baseline GPU model before the clustered model is moved to the GPU.
# `model_orig_gpu` is only created inside the `if DEVICE == 'cuda'` cell, so it
# is dropped via globals().pop: a bare `del` raises NameError on CPU-only runs
# and when this cell is executed twice.
globals().pop("model_orig_gpu", None)

gc.collect()
if torch.cuda.is_available():
    # Hand cached, now-unused blocks back to the driver.
    torch.cuda.empty_cache()

In [None]:
# Rebuild the architecture with the shared loader, then overwrite its weights
# with the clustered state dict read back from the gzip archive.
model_loaded = load_model(device='cpu')
# The archive is decompressed into memory and wrapped in BytesIO so torch.load
# gets an ordinary seekable in-memory file.
with gzip.open(clustered_path + '.gz', 'rb') as f_in:
    buffer = f_in.read()
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs, and avoids executing arbitrary pickled code.
state_dict = torch.load(io.BytesIO(buffer), map_location='cpu', weights_only=True)
model_loaded.load_state_dict(state_dict)
model_loaded = model_loaded.to(DEVICE)

# Метрики на сжатой модели

### cpu

In [None]:
# Clustered model on CPU: a separate copy is used so that `model_loaded` keeps
# its current device for the GPU cell.
model_loaded_cpu = copy.deepcopy(model_loaded).to('cpu')
cl_refs = []
cl_preds = []
cl_times = []
cl_rams = []
for ex in tqdm(dataset, desc="Clustered CPU Inference"):
    # The reference goes through the same tokenizer normalization that
    # predict_example applies to its output.
    cl_refs.append(processor.tokenizer._normalize(ex['text']))
    pred, t_ms, m_mb = predict_example(model_loaded_cpu, ex, 'cpu')
    cl_preds.append(pred)
    cl_rams.append(m_mb)
    cl_times.append(t_ms)
# Scaled by 100 for the percentage printout below.
cl_wer = 100 * wer.compute(predictions=cl_preds, references=cl_refs)
cl_cer = 100 * cer.compute(predictions=cl_preds, references=cl_refs)
print(f"Clustered CPU - Time: {np.mean(cl_times):.2f} ms, RAM Δ: {np.mean(cl_rams):.2f} MB, WER: {cl_wer:.2f}%, CER: {cl_cer:.2f}%")

Clustered CPU Inference:   0%|          | 0/32 [00:00<?, ?it/s]Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Clustered CPU Inference: 100%|██████████| 32/32 [02:00<00:00,  3.76s/it]

Clustered CPU - Time: 3741.66 ms, RAM Δ: 8.50 MB, WER: 3.82%, CER: 1.37%





### gpu

In [8]:
if DEVICE == 'cuda':
    # Clustered model on GPU (`model_loaded` was moved to DEVICE when it was loaded).
    # NOTE(review): this reuses the gpu_* names of the baseline GPU cell and so
    # overwrites the baseline results; names are kept in case other cells read them.
    gpu_refs, gpu_preds = [], []
    gpu_times, gpu_vrams = [], []
    # The progress label and summary line now say "Clustered": the copy-pasted
    # "Original GPU Inference" / "GPU -" text mislabelled this run as the baseline.
    for ex in tqdm(dataset, desc="Clustered GPU Inference"):
        pred, t_ms, m_mb = predict_example(model_loaded, ex, 'cuda')
        gpu_refs.append(processor.tokenizer._normalize(ex['text']))
        gpu_preds.append(pred)
        gpu_times.append(t_ms)
        gpu_vrams.append(m_mb)
    gpu_wer = wer.compute(references=gpu_refs, predictions=gpu_preds) * 100
    gpu_cer = cer.compute(references=gpu_refs, predictions=gpu_preds) * 100
    print(f"Clustered GPU - Time: {np.mean(gpu_times):.2f} ms, VRAM Peak: {np.max(gpu_vrams):.2f} MB, WER: {gpu_wer:.2f}%, CER: {gpu_cer:.2f}%")

Original GPU Inference:   0%|          | 0/32 [00:00<?, ?it/s]Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Original GPU Inference: 100%|██████████| 32/32 [00:15<00:00,  2.04it/s]

GPU - Time: 475.24 ms, VRAM Peak: 1053.03 MB, WER: 3.82%, CER: 1.37%



