In [None]:
# exporting models by optimum-cli
!optimum-cli export onnx --model openai/whisper-tiny whisper-tiny-with-past/ --task automatic-speech-recognition-with-past --opset 13


# Export and save the model to ONNX

In [8]:
import os
import time
import shutil
from evaluate import load
from pathlib import Path
from datasets import load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor, AutoConfig
from optimum.onnxruntime import ORTModelForSpeechSeq2Seq, ORTQuantizer, AutoQuantizationConfig
from transformers import PretrainedConfig
import librosa
import numpy as np

In [None]:
# Export model to ONNX
def export_onnx(model_id, save_dir):
    """Export a speech seq2seq checkpoint to ONNX and move the result to ``save_dir``.

    Args:
        model_id: Model identifier or path handed to
            ``ORTModelForSpeechSeq2Seq.from_pretrained(..., export=True)``.
        save_dir: Destination directory for the exported files. It must not
            exist yet.

    Raises:
        FileExistsError: If ``save_dir`` already exists. ``shutil.move`` would
            otherwise place the export directory *inside* it, leaving the
            ``.onnx`` files one level deeper than expected.
    """
    # Check before the (slow) export so a re-run of the cell fails immediately.
    if os.path.exists(save_dir):
        raise FileExistsError(
            f"{save_dir!r} already exists; remove it or choose another save_dir"
        )

    model = ORTModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)

    # str() so the call works whether model_save_dir is a str or a Path object.
    shutil.move(str(model.model_save_dir), save_dir)
    print("Model exported to onnx and saved at location ", save_dir)

In [None]:
# export whisper-small
export_onnx("openai/whisper-small","/home/carol/mp/quantize/small-whisper")

In [None]:
# Export whisper-tiny.
# NOTE(review): the model id and the output directory below contain "xxxxx"
# placeholders rather than real values — presumably "openai/whisper-tiny" and a
# "tiny" directory; confirm and fill in before running this cell.
export_onnx("openai/whisper-xxxxx","/home/carol/mp/quantize/xxxxxxxxx")

In [None]:
#export whisper-base
export_onnx("openai/whisper-base","/home/carol/mp/quantize/base")

# Dynamic Quantization - Intel AVX512-VNNI (symmetric weights and activations)

In [13]:
from pathlib import Path
def quantize_nor(model_dir, save_dir):
    """Dynamically quantize every ONNX file in ``model_dir`` with the AVX512-VNNI config.

    The two convolution nodes ``/conv1/Conv`` and ``/conv2/Conv`` are excluded
    from quantization; weights and activations are quantized symmetrically.

    Args:
        model_dir: Directory containing the exported ``*.onnx`` files.
        save_dir: Directory passed to ``ORTQuantizer.quantize`` as the output
            location.

    Raises:
        FileNotFoundError: If ``model_dir`` contains no ``*.onnx`` file, instead
            of silently reporting success without quantizing anything.
    """
    # Sorted so the files are always processed in the same order.
    onnx_files = sorted(Path(model_dir).glob("*.onnx"))
    if not onnx_files:
        raise FileNotFoundError(f"No .onnx files found in {model_dir!r}")

    qconfig = AutoQuantizationConfig.avx512_vnni(
        is_static=False,
        per_channel=False,
        nodes_to_exclude=['/conv1/Conv', '/conv2/Conv'],
        use_symmetric_activations=True,
        use_symmetric_weights=True,
        operators_to_quantize=None,
    )

    for onnx_file in onnx_files:
        # Pass only the bare file name: the globbed path already contains
        # model_dir, and handing the full path over as ``file_name`` only
        # resolves correctly when model_dir is absolute.
        quantizer = ORTQuantizer.from_pretrained(model_dir, file_name=onnx_file.name)
        quantizer.quantize(save_dir=save_dir, quantization_config=qconfig)
        print("quantized ", onnx_file.name)
    print("Quantization complete")
    

In [14]:
quantize_nor("/home/carol/mp/quantize/small","/home/carol/mp/quantize/q-small")

Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: /home/carol/mp/quantize/q-small (external data format: False)
Configuration saved in /home/carol/mp/quantize/q-small/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Spec

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7112c8faef20>


Quantizing model...
Saving quantized model at: /home/carol/mp/quantize/q-small (external data format: False)
Configuration saved in /home/carol/mp/quantize/q-small/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fi

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7111977374f0>


Quantizing model...
Saving quantized model at: /home/carol/mp/quantize/q-small (external data format: False)
Configuration saved in /home/carol/mp/quantize/q-small/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fi

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x711196f67a30>
Quantization complete


# Dynamic Quantization - arm64

In [6]:
from pathlib import Path
def quantize_arm(model_dir, save_dir):
    """Dynamically quantize every ONNX file in ``model_dir`` with the arm64 config.

    The two convolution nodes ``/conv1/Conv`` and ``/conv2/Conv`` are excluded
    from quantization; weights and activations are quantized symmetrically.

    Args:
        model_dir: Directory containing the exported ``*.onnx`` files.
        save_dir: Directory passed to ``ORTQuantizer.quantize`` as the output
            location.

    Raises:
        FileNotFoundError: If ``model_dir`` contains no ``*.onnx`` file, instead
            of silently reporting success without quantizing anything.
    """
    # Sorted so the files are always processed in the same order.
    onnx_files = sorted(Path(model_dir).glob("*.onnx"))
    if not onnx_files:
        raise FileNotFoundError(f"No .onnx files found in {model_dir!r}")

    qconfig = AutoQuantizationConfig.arm64(
        is_static=False,
        per_channel=False,
        nodes_to_exclude=['/conv1/Conv', '/conv2/Conv'],
        use_symmetric_activations=True,
        use_symmetric_weights=True,
        operators_to_quantize=None,
    )

    for onnx_file in onnx_files:
        # Pass only the bare file name: the globbed path already contains
        # model_dir, and handing the full path over as ``file_name`` only
        # resolves correctly when model_dir is absolute.
        quantizer = ORTQuantizer.from_pretrained(model_dir, file_name=onnx_file.name)
        quantizer.quantize(save_dir=save_dir, quantization_config=qconfig)
        print("quantized ", onnx_file.name)
    print("Quantization complete")

In [None]:
# NOTE(review): quantize_sym is not defined in any visible cell of this notebook,
# so on a fresh kernel this call presumably raises NameError — confirm which
# quantize_* function was intended.
quantize_sym("/home/carol/mp/quantize/tiny","/home/carol/mp/quantize/tiny-sym")

In [None]:
# NOTE(review): d_quantize is not defined in any visible cell of this notebook,
# so on a fresh kernel this call presumably raises NameError — confirm which
# quantize_* function was intended.
d_quantize("/home/carol/mp/quantize/small-whisper","/home/carol/mp/quantize/q-small")

In [None]:
# NOTE(review): quantize_sym is not defined in any visible cell of this notebook,
# so on a fresh kernel this call presumably raises NameError — confirm which
# quantize_* function was intended.
quantize_sym("/home/carol/mp/quantize/small","/home/carol/mp/quantize/small-sym")

In [9]:
quantize_arm("/home/carol/mp/quantize/small","/home/carol/mp/quantize/q-small-arm")

Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: /home/carol/mp/quantize/q-small-arm (external data format: False)
Configuration saved in /home/carol/mp/quantize/q-small-arm/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or train

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7111961a99c0>


Quantizing model...
Saving quantized model at: /home/carol/mp/quantize/q-small-arm (external data format: False)
Configuration saved in /home/carol/mp/quantize/q-small-arm/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x711196ca6b30>


Quantizing model...
Saving quantized model at: /home/carol/mp/quantize/q-small-arm (external data format: False)
Configuration saved in /home/carol/mp/quantize/q-small-arm/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7111971db7c0>
Quantization complete






# Inference on ONNX models



In [1]:
import os
import librosa
import time
from transformers import WhisperProcessor, AutoConfig
from optimum.onnxruntime import ORTModelForSpeechSeq2Seq

def run(model_name, audio):
    """Run one ONNX Whisper model on an audio file and print text, time and size.

    Args:
        model_name: Name of the sub-directory of ``/home/carol/mp/quantize``
            that holds ``encoder_model.onnx``, ``decoder_model.onnx`` and
            ``decoder_with_past_model.onnx``.
        audio: Path of the audio file; it is loaded at 16 kHz, mono.

    Prints the model name, the decoded text, the elapsed time of generation
    plus decoding (seconds) and the total size of the model directory (MB).
    """
    # Define model path
    model_path = os.path.join("/home/carol/mp/quantize", model_name)

    # Load processor and config from the same directory as the ONNX files, so
    # the lookup does not depend on the notebook's current working directory.
    processor = WhisperProcessor.from_pretrained(model_path)
    model_config = AutoConfig.from_pretrained(model_path)
    sessions = ORTModelForSpeechSeq2Seq.load_model(
        os.path.join(model_path, 'encoder_model.onnx'),
        os.path.join(model_path, 'decoder_model.onnx'),
        os.path.join(model_path, 'decoder_with_past_model.onnx'))
    model = ORTModelForSpeechSeq2Seq(sessions[0], sessions[1], model_config, model_path, sessions[2])

    # Load the audio file
    audio_data, sample_rate = librosa.load(audio, sr=16000, mono=True)

    # Preprocess the audio
    input_features = processor(audio_data, sampling_rate=sample_rate, return_tensors="pt").input_features
    forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="translate")

    # Time generation + decoding; perf_counter is the clock meant for intervals.
    start_time = time.perf_counter()
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)
    inference_time = time.perf_counter() - start_time

    # Model size: sum of every file under the model directory, in MB
    total_size = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(model_path)
        for filename in filenames
    )
    size = total_size / (1024 * 1024)

    print()
    print("Model name = ",model_name)
    print()
    print(transcription)
    print()
    print("Inference Time = ",inference_time)
    print("Model size = ",size)
    

  from .autonotebook import tqdm as notebook_tqdm
2024-03-11 00:21:10.902292: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-11 00:21:10.902398: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-11 00:21:10.981362: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-11 00:21:11.121377: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
run("q-small-arm","sample.wav")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Model name =  q-small-arm

 Harvard List number one. The birch canoe slid on the smooth planks. Glue the sheet to the dark blue background. It's easy to tell the depth of a well. These days a chicken leg is a rare dish.

Inference Time =  4.8225929737091064
Model size =  457.8302516937256


In [15]:
run("q-small","sample.wav")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Model name =  q-small

 Harvard List number one. The birch canoe slid on the smooth planks. Glue the sheet to the dark blue background. It's easy to tell the depth of a well. These days a chicken leg is a rare dish.

Inference Time =  4.68233060836792
Model size =  457.8302516937256


# Quantize-arm

In [None]:
quantize_arm("/home/carol/mp/quantize/tiny","/home/carol/mp/quantize/q-tiny-arm")

In [11]:
run("q-small-arm","sample.wav")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Model name =  q-small-arm

 Harvard List number one. The birch canoe slid on the smooth planks. Glue the sheet to the dark blue background. It's easy to tell the depth of a well. These days a chicken leg is a rare dish.

Inference Time =  4.75690221786499
Model size =  457.8302516937256


# EVALUATION

# runn() - with Real Time Factor (RTF)

In [None]:
import time

def runn(model_name, audio):
    """Transcribe one audio file with an ONNX Whisper model and print its RTF.

    The Real Time Factor is (preprocessing + inference + decoding time) divided
    by the audio duration, with each stage timed exactly once.

    Args:
        model_name: Name of the sub-directory of ``/home/carol/mp/quantize``
            that holds the three ONNX files of the model.
        audio: Path of the audio file; it is loaded at 16 kHz, mono.

    Raises:
        ValueError: If the loaded audio contains no samples (the RTF would
            require a division by zero).
    """
    # Define model path
    model_path = os.path.join("/home/carol/mp/quantize", model_name)

    # Load processor and config from the same directory as the ONNX files, so
    # the lookup does not depend on the notebook's current working directory.
    processor = WhisperProcessor.from_pretrained(model_path)
    model_config = AutoConfig.from_pretrained(model_path)
    sessions = ORTModelForSpeechSeq2Seq.load_model(
        os.path.join(model_path, 'encoder_model.onnx'),
        os.path.join(model_path, 'decoder_model.onnx'),
        os.path.join(model_path, 'decoder_with_past_model.onnx'))
    model = ORTModelForSpeechSeq2Seq(sessions[0], sessions[1], model_config, model_path, sessions[2])

    # Load the audio file
    audio_data, sample_rate = librosa.load(audio, sr=16000, mono=True)
    total_utterance_duration = len(audio_data) / sample_rate
    if total_utterance_duration == 0:
        raise ValueError(f"Audio file {audio!r} contains no samples")

    # Preprocess the audio and build the decoder prompt
    start_time = time.perf_counter()
    input_features = processor(audio_data, sampling_rate=sample_rate, return_tensors="pt").input_features
    forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="translate")
    # Stop this timer BEFORE inference: measuring it at the end would fold the
    # inference and decoding time into it and count both twice in the RTF.
    read_time = time.perf_counter() - start_time

    # Perform model inference
    inference_start_time = time.perf_counter()
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)[0]
    inference_time = time.perf_counter() - inference_start_time

    # Decode the predicted IDs
    decoding_start_time = time.perf_counter()
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)
    decoding_time = time.perf_counter() - decoding_start_time

    # Calculate RTF
    rtf = (read_time + inference_time + decoding_time) / total_utterance_duration

    # Print transcription and RTF
    print( transcription)
    print("RTF:", rtf)
    print("Read time = ", read_time)
    print("Inference time = ", inference_time)
    print("Decoding time = ", decoding_time)
    print("Total utterance duration = ", total_utterance_duration)



In [None]:
runn("tiny","sample.wav")

# runnn() - RTF with average (mean), 75th and 90th percentile statistics

In [None]:
import time
import numpy as np

def runnn(model_name, audio, num_runs=1):
    """Transcribe an audio file ``num_runs`` times and print RTF statistics.

    For each run the Real Time Factor is (preprocessing + inference + decoding
    time) divided by the audio duration, with each stage timed exactly once.
    After all runs the mean, 75th and 90th percentile of the RTFs are printed.

    Args:
        model_name: Name of the sub-directory of ``/home/carol/mp/quantize``
            that holds the three ONNX files of the model.
        audio: Path of the audio file; it is loaded at 16 kHz, mono.
        num_runs: Number of timed repetitions. The default of 1 reproduces a
            single measurement; use more runs for meaningful percentiles.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1, or if the loaded audio
            contains no samples.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    # Define model path
    model_path = os.path.join("/home/carol/mp/quantize", model_name)

    # Load processor and config from the same directory as the ONNX files, so
    # the lookup does not depend on the notebook's current working directory.
    processor = WhisperProcessor.from_pretrained(model_path)
    model_config = AutoConfig.from_pretrained(model_path)
    sessions = ORTModelForSpeechSeq2Seq.load_model(
        os.path.join(model_path, 'encoder_model.onnx'),
        os.path.join(model_path, 'decoder_model.onnx'),
        os.path.join(model_path, 'decoder_with_past_model.onnx'))
    model = ORTModelForSpeechSeq2Seq(sessions[0], sessions[1], model_config, model_path, sessions[2])

    # Load the audio file
    audio_data, sample_rate = librosa.load(audio, sr=16000, mono=True)
    total_utterance_duration = len(audio_data) / sample_rate
    if total_utterance_duration == 0:
        raise ValueError(f"Audio file {audio!r} contains no samples")

    rtfs = []
    for _ in range(num_runs):
        # Preprocess the audio and build the decoder prompt
        start_time = time.perf_counter()
        input_features = processor(audio_data, sampling_rate=sample_rate, return_tensors="pt").input_features
        forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="translate")
        # Stop this timer BEFORE inference: measuring it at the end would fold
        # the inference and decoding time into it and count both twice.
        read_time = time.perf_counter() - start_time

        # Perform model inference
        inference_start_time = time.perf_counter()
        predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)[0]
        inference_time = time.perf_counter() - inference_start_time

        # Decode the predicted IDs
        decoding_start_time = time.perf_counter()
        transcription = processor.decode(predicted_ids, skip_special_tokens=True)
        decoding_time = time.perf_counter() - decoding_start_time

        # Calculate RTF
        rtf = (read_time + inference_time + decoding_time) / total_utterance_duration

        # Print transcription and RTF
        print( transcription)
        print("RTF:", rtf)
        print("Read time = ", read_time)
        print("Inference time = ", inference_time)
        print("Decoding time = ", decoding_time)
        print("Total utterance duration = ", total_utterance_duration)

        rtfs.append(rtf)

    # Compute statistics over all runs
    rtfs = np.array(rtfs)
    mean_rtf = np.mean(rtfs)
    pctl_75_rtf = np.percentile(rtfs, 75)
    pctl_90_rtf = np.percentile(rtfs, 90)

    # Print statistics ("Average" and "Mean" are the same quantity; both
    # labels are kept so the printed report keeps its existing lines).
    print("Average RTF:", mean_rtf)
    print("Mean RTF:", mean_rtf)
    print("75th Percentile RTF:", pctl_75_rtf)
    print("90th Percentile RTF:", pctl_90_rtf)



In [None]:
runnn("tiny", "sample.wav")

In [None]:
fil
