In [1]:
%pip install -q "nncf>=2.13.0"
%pip install -q --pre -U "openvino" "openvino-tokenizers" "openvino-genai" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
%pip install -q "python-ffmpeg<=1.0.16" "ffmpeg" "moviepy" "transformers>=4.45" "git+https://github.com/huggingface/optimum-intel.git" "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu
%pip install -q -U "yt_dlp>=2024.8.6" soundfile librosa jiwer
%pip install -q  "gradio>=4.19"

Note: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
openvino-dev 2024.6.0 requires openvino==2024.6.0, but you have openvino 2025.0.0.dev20241226 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from pathlib import Path

# Helper modules used by later cells; each one is downloaded only when it is
# not already present in the working directory.
for helper_name in ("notebook_utils.py", "cmd_helper.py"):
    if Path(helper_name).exists():
        continue
    r = requests.get(
        url=f"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/{helper_name}",
        timeout=60,
    )
    # Fail loudly instead of saving an HTTP error page as a Python module.
    r.raise_for_status()
    # The context manager closes (and flushes) the file before it gets imported.
    with open(helper_name, "w", encoding="utf-8") as helper_file:
        helper_file.write(r.text)

In [4]:
# Instantiate model

import ipywidgets as widgets

# Whisper checkpoints offered in the dropdown, listed from the largest variant
# down to the smallest one.
MODELS = [
    f"openai/whisper-{variant}"
    for variant in (
        "large-v3-turbo",
        "large-v3",
        "large-v2",
        "large",
        "medium",
        "small",
        "base",
        "tiny",
    )
]

# The dropdown is pre-set to the "tiny" checkpoint (the last entry of MODELS).
model_id = widgets.Dropdown(
    description="Model:",
    options=list(MODELS),
    value=MODELS[-1],
    disabled=False,
)

model_id

Dropdown(description='Model:', index=7, options=('openai/whisper-large-v3-turbo', 'openai/whisper-large-v3', '…

In [5]:
# Convert model to OpenVINO format

# export command: optimum-cli export openvino --model openai/whisper-tiny whisper-tiny

from cmd_helper import optimum_cli

# The local folder is named after the last component of the selected model id,
# e.g. "openai/whisper-tiny" -> "whisper-tiny".
model_dir = model_id.value.rsplit("/", 1)[-1]

# Export only when the target folder is not there yet.
model_already_exported = Path(model_dir).exists()
if not model_already_exported:
    optimum_cli(model_id.value, model_dir)

**Export command:**

`optimum-cli export openvino --model openai/whisper-tiny whisper-tiny`

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
  if input_features.shape[-1] != expected_seq_length:
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  if sequence_length != 1:
  or len(self.key_cache[layer_idx]) == 0  # the layer has no cache
  elif len(self.key_cache[layer_idx]) == 0:  # fills previously skipped layers; checking for tensor causes errors


In [6]:
# Select device

from notebook_utils import device_widget

device = device_widget(default="CPU", exclude=["NPU"])

device

Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')

In [7]:
import openvino_genai

ov_pipe = openvino_genai.WhisperPipeline(str(model_dir), device=device.value)

In [8]:
# Run transcription pipeline

# Path to the video file of the song. The generated subtitles are written next
# to it, using the same name with an ".srt" suffix.
output_file = Path("song_lyrics.mp4") # path to the song's video file

# Task selector; its value is forwarded as `task=` to the Whisper pipeline's
# `generate` call in the cells below.
task = widgets.Select(
    options=["transcribe", "translate"],
    value="transcribe",
    description="Select task:",
    disabled=False,
)
task

Select(description='Select task:', options=('transcribe', 'translate'), value='transcribe')

In [9]:
from transformers.pipelines.audio_utils import ffmpeg_read
import wave
import contextlib

def get_audio(audio_file_path):
    """
    Decode an audio or video file into a signal resampled to 16000 Hz and
    report how long the recording is.

    Parameters:
        audio_file_path: path to the input media file (``str`` or ``pathlib.Path``)
    Returns:
      inputs: dict with the decoded signal under "raw" and the sample rate
              (16000) under "sampling_rate"
      duration: duration of the recording in seconds
    """
    sampling_rate = 16000
    # `wave.open` treats only `str` as a file name, so normalise Path objects.
    media_path = str(audio_file_path)

    with open(media_path, "rb") as f:
        audio = ffmpeg_read(f.read(), sampling_rate)

    try:
        # For WAV input the exact duration is taken from the container header.
        with contextlib.closing(wave.open(media_path, "rb")) as wav_file:
            duration = wav_file.getnframes() / float(wav_file.getframerate())
    except (wave.Error, EOFError):
        # Not a WAV container (e.g. a video file): derive the duration from the
        # decoded signal instead.
        # NOTE(review): assumes `ffmpeg_read` returns a 1-D sequence of samples
        # at `sampling_rate` - confirm against the transformers documentation.
        duration = len(audio) / float(sampling_rate)

    return {
        "raw": audio,
        "sampling_rate": sampling_rate,
    }, duration

In [10]:
# NOTE(review): the audio is read from the hard-coded "test.wav", while the
# subtitles are later saved next to `output_file` - confirm that both are meant
# to refer to the same recording.
inputs, duration = get_audio("test.wav")

# `.chunks` holds the timestamped segments that `prepare_srt` iterates over
# (it reads `start_ts`, `end_ts` and `text` from each of them).
transcription = ov_pipe.generate(inputs["raw"], task=task.value, return_timestamps=True).chunks

In [10]:
import math


def format_timestamp(seconds: float):
    """
    Format a time offset the way srt files expect it: ``HH:MM:SS,mmm``.

    Parameters:
        seconds: non-negative offset from the start of the recording, in seconds
    Returns:
        the formatted timestamp; every field is zero-padded, including hours
    Raises:
        AssertionError: if `seconds` is negative
    """
    assert seconds >= 0, "non-negative timestamp expected"
    # Work in whole milliseconds so rounding happens exactly once.
    total_milliseconds = round(seconds * 1000.0)

    hours, remainder = divmod(total_milliseconds, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, milliseconds = divmod(remainder, 1_000)

    # Hours are always two digits wide, so "1:01:01,500" can no longer be
    # produced for recordings longer than one hour.
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"


def prepare_srt(transcription, filter_duration=None):
    """
    Format transcription into srt file format

    Parameters:
        transcription: iterable of segments exposing `start_ts`, `end_ts` and `text`
        filter_duration: optional duration of the recording in seconds; segments
                         that start at/after it (or end well past it) stop the
                         conversion
    Returns:
        list of strings (index line, time-range line, text line per segment)
        ready to be passed to `writelines`
    """
    segment_lines = []
    for idx, segment in enumerate(transcription):
        start_ts, end_ts = segment.start_ts, segment.end_ts
        # for the case where the model could not predict an ending timestamp, which can happen if audio is cut off in the middle of a word.
        if end_ts == -1:
            # Plain variables are used here because the previous tuple could not
            # be assigned to (TypeError). Without a known duration the segment
            # falls back to a zero-length cue at its start time.
            end_ts = filter_duration if filter_duration is not None else start_ts

        if filter_duration is not None and (start_ts >= math.floor(filter_duration) or end_ts > math.ceil(filter_duration) + 1):
            break
        segment_lines.append(f"{idx + 1}\n")
        segment_lines.append(f"{format_timestamp(start_ts)} --> {format_timestamp(end_ts)}\n")
        segment_lines.append(segment.text + "\n\n")
    return segment_lines

In [11]:
srt_lines = prepare_srt(transcription, filter_duration=duration)
# save transcription
with output_file.with_suffix(".srt").open("w") as f:
    f.writelines(srt_lines)

In [12]:
print("".join(srt_lines))

1
00:00:00,000 --> 00:00:09,000
 Twinkle little star, how I wonder what you are.




In [13]:
# Post training quantization

In [14]:
to_quantize = widgets.Checkbox(
    value=True,
    description="Quantization",
    disabled=False,
)

to_quantize

Checkbox(value=True, description='Quantization')

In [15]:
# Fetch `skip_kernel_extension` module
import requests

r = requests.get(
    url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py",
    timeout=60,
)
# Fail loudly instead of saving an HTTP error page as the extension module.
r.raise_for_status()
# The context manager closes (and flushes) the file before `%load_ext` imports it.
with open("skip_kernel_extension.py", "w", encoding="utf-8") as extension_file:
    extension_file.write(r.text)

ov_quantized_model = None

%load_ext skip_kernel_extension

In [16]:
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForSpeechSeq2Seq

ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device=device.value)
processor = AutoProcessor.from_pretrained(model_dir)

In [17]:
%%skip not $to_quantize.value

from itertools import islice
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import pipeline
from optimum.intel.openvino.quantization import InferRequestWrapper


def collect_calibration_dataset(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):
    """
    Run the speech-recognition pipeline over calibration samples and record the
    inputs that reach the encoder and the decoder-with-past sub-models.

    Parameters:
        ov_model: model whose `encoder` and `decoder_with_past` requests are
                  temporarily wrapped to capture their inputs
        calibration_dataset_size: number of dataset samples to run
    Returns:
        (encoder_calibration_data, decoder_calibration_data): the two lists
        filled by the request wrappers

    Relies on the notebook-level `processor` and `task` objects.
    """
    encoder_calibration_data = []
    decoder_calibration_data = []

    # Keep the genuine infer requests so they can be put back no matter how
    # the collection ends.
    original_encoder_request = ov_model.encoder.request
    original_decoder_request = ov_model.decoder_with_past.request

    try:
        ov_model.encoder.request = InferRequestWrapper(original_encoder_request, encoder_calibration_data, apply_caching=True)
        ov_model.decoder_with_past.request = InferRequestWrapper(
            original_decoder_request,
            decoder_calibration_data,
            apply_caching=True,
        )

        # Built inside the `try` so a failure here also restores the requests.
        pipe = pipeline(
            "automatic-speech-recognition",
            model=ov_model,
            chunk_length_s=30,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
        )
        # NOTE(review): `trust_remote_code=True` lets the dataset repository
        # run its own loading script - keep only for trusted datasets.
        calibration_dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True)
        for sample in tqdm(
            islice(calibration_dataset, calibration_dataset_size),
            desc="Collecting calibration data",
            total=calibration_dataset_size,
        ):
            pipe(sample["audio"], generate_kwargs={"task": task.value}, return_timestamps=True)
    finally:
        ov_model.encoder.request = original_encoder_request
        ov_model.decoder_with_past.request = original_decoder_request

    return encoder_calibration_data, decoder_calibration_data

In [18]:
%%skip not $to_quantize.value

import gc
import shutil
import nncf
import openvino as ov


CALIBRATION_DATASET_SIZE = 30
quantized_model_path = Path(f"{model_dir}_quantized")


def _quantize_and_save(model, calibration_data, smooth_quant_alpha, output_xml):
    """Quantize one sub-model with NNCF and save the result as OpenVINO IR at `output_xml`."""
    quantized_model = nncf.quantize(
        model,
        nncf.Dataset(calibration_data),
        subset_size=len(calibration_data),
        model_type=nncf.ModelType.TRANSFORMER,
        # Smooth Quant algorithm reduces activation quantization error
        advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=smooth_quant_alpha),
    )
    ov.save_model(quantized_model, output_xml)


def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):
    """
    Produce (or reuse) the quantized model folder and load a Whisper pipeline from it.

    When `quantized_model_path` does not exist, the encoder and the
    decoder-with-past sub-models are quantized with NNCF, and the remaining
    model, tokenizer and config files are copied over from `model_dir`.

    Parameters:
        ov_model: model providing the `encoder` / `decoder_with_past` sub-models
        calibration_dataset_size: number of samples used to collect calibration data
    Returns:
        `openvino_genai.WhisperPipeline` created from `quantized_model_path` on
        the device selected in the notebook
    """
    if not quantized_model_path.exists():
        # Everything is first written to a staging folder which is renamed only
        # once complete. An interrupted run therefore never leaves a partial
        # `quantized_model_path` that a later run would mistake for a finished one.
        staging_path = quantized_model_path.with_name(quantized_model_path.name + "_incomplete")
        if staging_path.exists():
            shutil.rmtree(staging_path)

        encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(ov_model, calibration_dataset_size)

        print("Quantizing encoder")
        # optimal alpha value was obtained through grid search
        _quantize_and_save(ov_model.encoder.model, encoder_calibration_data, 0.80, staging_path / "openvino_encoder_model.xml")
        # Release the encoder samples before the decoder is processed.
        del encoder_calibration_data
        gc.collect()

        print("Quantizing decoder with past")
        # optimal alpha value was obtained through grid search
        _quantize_and_save(
            ov_model.decoder_with_past.model,
            decoder_calibration_data,
            0.96,
            staging_path / "openvino_decoder_with_past_model.xml",
        )
        del decoder_calibration_data
        gc.collect()

        # Copy the config file and the first-step-decoder manually
        model_path = Path(model_dir)
        files_to_copy = (
            "config.json",
            "generation_config.json",
            "openvino_decoder_model.xml",
            "openvino_decoder_model.bin",
            "openvino_tokenizer.xml",
            "openvino_tokenizer.bin",
            "openvino_detokenizer.xml",
            "openvino_detokenizer.bin",
            "tokenizer_config.json",
            "tokenizer.json",
            "vocab.json",
            "preprocessor_config.json",
            "special_tokens_map.json",
            "normalizer.json",
            "merges.txt",
            "added_tokens.json",
        )
        for file_name in files_to_copy:
            shutil.copy(model_path / file_name, staging_path / file_name)

        # Publish the finished folder under its final name.
        staging_path.rename(quantized_model_path)

    quantized_ov_pipe = openvino_genai.WhisperPipeline(str(quantized_model_path), device=device.value)
    return quantized_ov_pipe


quantized_ov_pipe = quantize(ov_model, CALIBRATION_DATASET_SIZE)

In [19]:
# Transcribe with the quantized pipeline and print the resulting subtitles.
# NOTE(review): `ov_quantized_model` is set to None earlier in this notebook
# and is never reassigned in the visible cells, while the quantized pipeline is
# stored in `quantized_ov_pipe` - so this branch looks unreachable. Confirm
# which guard is intended.
if ov_quantized_model is not None:
    # NOTE(review): `output_file` is a pathlib.Path pointing at an .mp4 file -
    # verify that `get_audio` accepts this kind of input.
    inputs, duration = get_audio(output_file)
    transcription = quantized_ov_pipe.generate(inputs["raw"], task=task.value, return_timestamps=True).chunks
    srt_lines = prepare_srt(transcription, filter_duration=duration)
    print("".join(srt_lines))
    # NOTE(review): with default notebook settings an expression nested in an
    # `if` body is not auto-displayed; the widget may need `display(...)`.
    widgets.Video.from_file(output_file, loop=False, width=800, height=800)

In [20]:
%%skip not $to_quantize.value

import time
from contextlib import contextmanager
from jiwer import wer, wer_standardize

TEST_DATASET_SIZE = 50

def calculate_transcription_time_and_accuracy(ov_model, test_samples):
    """
    Transcribe every test sample and measure both speed and word accuracy.

    Parameters:
        ov_model: object whose `generate` method performs the transcription
        test_samples: iterable of items providing `["audio"]["array"]` and `["text"]`
    Returns:
        word_accuracy: (1 - word error rate) * 100 over all samples
        total inference time: sum of the per-sample `generate` durations in seconds
    """
    elapsed_per_sample = []
    references, hypotheses = [], []

    for sample in tqdm(test_samples, desc="Measuring performance and accuracy"):
        started_at = time.perf_counter()
        result = ov_model.generate(sample["audio"]["array"], return_timestamps=True)
        finished_at = time.perf_counter()

        elapsed_per_sample.append(finished_at - started_at)
        references.append(sample["text"])
        hypotheses.append(result.texts[0])

    error_rate = wer(
        references,
        hypotheses,
        reference_transform=wer_standardize,
        hypothesis_transform=wer_standardize,
    )
    word_accuracy = (1 - error_rate) * 100
    # This is the summed time, not an average.
    total_infer_time = sum(elapsed_per_sample)
    return word_accuracy, total_infer_time

# Draw a fixed (seeded) selection of validation samples and materialise it so
# that both pipelines are measured on exactly the same inputs.
test_dataset = (
    load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True)
    .shuffle(seed=42)
    .take(TEST_DATASET_SIZE)
)
test_samples = list(test_dataset)

# Original pipeline first, then the quantized one.
accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_pipe, test_samples)
accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(quantized_ov_pipe, test_samples)

speedup = times_original / times_quantized
accuracy_drop = accuracy_original - accuracy_quantized
print(f"Whole pipeline performance speedup: {speedup:.3f}")
print(f"Whisper transcription word accuracy. Original model: {accuracy_original:.2f}%. Quantized model: {accuracy_quantized:.2f}%.")
print(f"Accuracy drop: {accuracy_drop:.2f}%.")

KeyboardInterrupt: 