In [None]:
!pip install -U optimum[exporters,onnxruntime] transformers torch

In [35]:
from pathlib import Path
from datetime import datetime
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoFeatureExtractor,
    WhisperTokenizerFast,
    WhisperFeatureExtractor
)
from optimum.onnxruntime import (
    AutoQuantizationConfig,
    ORTModelForSpeechSeq2Seq,
    ORTQuantizer
)

In [30]:

def quantize_model(model_id, save_dir):
    
    # Export model in ONNX
    model = ORTModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)
    model_dir = model.model_save_dir
    print("Exported the model in ONNX")
    
    # Run quantization for all ONNX files of exported model
    onnx_models = list(Path(model_dir).glob("*.onnx"))
    
    
    quantizers = [ORTQuantizer.from_pretrained(model_dir, file_name=onnx_model) for onnx_model in onnx_models]
    qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False,nodes_to_exclude=['/conv1/Conv', '/conv2/Conv'])
    
    for quantizer in quantizers:
        # Apply dynamic quantization and save the resulting model
        quantizer.quantize(save_dir=save_dir, quantization_config=qconfig)
        print("quantized ", quantizer)
    print("Quantization complete")

In [36]:
def compare_time():

    # Measure inference of quantized model
    start_quantized = datetime.now()
    # for i in range(num_inferences):
    quantized_pipe(inference_file)
    end_quantized = datetime.now()
    # print(end_quantized)
    
    #Measure inference of original model
    start_original = datetime.now()
    # for i in range(num_inferences):
    original_pipe(inference_file)
    end_original = datetime.now()
    # print(end_original)
    
    
    original_inference_time = (end_original - start_original).total_seconds() / num_inferences
    print(f"Original inference time: {original_inference_time}")
    
    quantized_inference_time = (end_quantized - start_quantized).total_seconds() / num_inferences
    print(f"Quantized inference time: {quantized_inference_time}")

# Quantizing Tiny Whisper

In [3]:
# Configure base model and save directory for compressed model
model_id = "openai/whisper-tiny"  #tiny whisper
save_dir = "tiny_quantized"

# quantize_model(model_id, save_dir)

# Quantizing Small Whisper

In [32]:
# Configure base model and save directory for compressed model
model_id = "openai/whisper-small"  #tiny whisper
save_dir = "small_quantized"

quantize_model(model_id, save_dir)

Framework not specified. Using pt to export the model.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Speci

Exported the model in ONNX


Quantizing model...
Saving quantized model at: small_quantized (external data format: False)
Configuration saved in small_quantized/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Creating dy

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7c2e8ff23010>


Quantizing model...
Saving quantized model at: small_quantized (external data format: False)
Configuration saved in small_quantized/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Creating dy

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7c2e8ff22a40>


Quantizing model...
Saving quantized model at: small_quantized (external data format: False)
Configuration saved in small_quantized/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7c2e8ff23fa0>
Quantization complete


# Quantizing tiny.en

In [33]:
# Configure base model and save directory for compressed model
model_id = "openai/whisper-tiny.en"  #tiny whisper
save_dir = "tinyen_quantized"

quantize_model(model_id, save_dir)

config.json:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

Framework not specified. Using pt to export the model.


model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361], 'begin_suppress_tokens': [220, 50256]}
Using framework PyTorch: 2.2.1+cu121
Overriding 1 configuration item(s)
	- use_cache -> False
  if input_features.shape[-1] != expected_seq_length:
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt

Exported the model in ONNX


Quantizing model...
Saving quantized model at: tinyen_quantized (external data format: False)
Configuration saved in tinyen_quantized/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361], 'begin_suppress_tokens': [220, 50256]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tu

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7c2e8c031300>


Quantizing model...
Saving quantized model at: tinyen_quantized (external data format: False)
Configuration saved in tinyen_quantized/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361], 'begin_suppress_tokens': [220, 50256]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tu

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7c2e8c032500>


Quantizing model...
Saving quantized model at: tinyen_quantized (external data format: False)
Configuration saved in tinyen_quantized/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361], 'begin_suppress_tokens': [220, 50256]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tu

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7c2e8c031360>
Quantization complete


# Quantizing Small.en

In [45]:
# Configure base model and save directory for compressed model
model_id = "openai/whisper-small.en"  #tiny whisper
save_dir = "smallen_quantized"

quantize_model(model_id, save_dir)

config.json:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

Framework not specified. Using pt to export the model.


model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361], 'begin_suppress_tokens': [220, 50256]}
Using framework PyTorch: 2.2.1+cu121
Overriding 1 configuration item(s)
	- use_cache -> False
  if input_features.shape[-1] != expected_seq_length:
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt

Exported the model in ONNX


Quantizing model...
Saving quantized model at: smallen_quantized (external data format: False)
Configuration saved in smallen_quantized/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361], 'begin_suppress_tokens': [220, 50256]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7c2e9602ba60>


Quantizing model...
Saving quantized model at: smallen_quantized (external data format: False)
Configuration saved in smallen_quantized/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361], 'begin_suppress_tokens': [220, 50256]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7c2e8ec7bfd0>


Quantizing model...
Saving quantized model at: smallen_quantized (external data format: False)
Configuration saved in smallen_quantized/ort_config.json
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361], 'begin_suppress_tokens': [220, 50256]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-

quantized  <optimum.onnxruntime.quantization.ORTQuantizer object at 0x7c2e8ee2c430>
Quantization complete


# Compare original and quantized model - tiny whisper

In [37]:
# Number of inferences for comparing timings
# num_inferences = 4
# save_dir = "whisper-tiny"
inference_file = "30.wav"


In [10]:
model_name = 'tiny_quantized' # folder name
model = ORTModelForSpeechSeq2Seq.from_pretrained(model_name, export=False)
tokenizer = WhisperTokenizerFast.from_pretrained(model_name)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)


quantized_pipe = pipeline('automatic-speech-recognition', 
                model=model, 
                tokenizer=tokenizer, 
                feature_extractor=feature_extractor,
                return_timestamps=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Generation config file not found, using a generation config created from the model config.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Create pipeline with original model as baseline
original_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [29]:
for i in range(5):
    print("Iteration ", i+1)
    compare_time()
    print()

Iteration  1
Original inference time: 0.31065325
Quantized inference time: 0.21691725

Iteration  2
Original inference time: 0.3087925
Quantized inference time: 0.175048

Iteration  3
Original inference time: 0.235015
Quantized inference time: 0.28209425

Iteration  4
Original inference time: 0.33758025
Quantized inference time: 0.20556

Iteration  5
Original inference time: 0.2386685
Quantized inference time: 0.2481165



# Compare original and quantized model - tiny.en

In [38]:
model_name = 'tinyen_quantized' # folder name
model = ORTModelForSpeechSeq2Seq.from_pretrained(model_name, export=False)
tokenizer = WhisperTokenizerFast.from_pretrained(model_name)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)


quantized_pipe = pipeline('automatic-speech-recognition', 
                model=model, 
                tokenizer=tokenizer, 
                feature_extractor=feature_extractor,
                return_timestamps=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Generation config file not found, using a generation config created from the model config.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [39]:
# Create pipeline with original model as baseline
original_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")


generation_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

In [40]:
for i in range(5):
    print("Iteration ", i+1)
    compare_time()
    print()

Iteration  1
Original inference time: 0.23535125
Quantized inference time: 0.2650155

Iteration  2
Original inference time: 0.2246255
Quantized inference time: 0.19538

Iteration  3
Original inference time: 0.26350375
Quantized inference time: 0.194812

Iteration  4
Original inference time: 0.230364
Quantized inference time: 0.26628375

Iteration  5
Original inference time: 0.24502225
Quantized inference time: 0.22437825



# Compare original and quantized model - small

In [42]:
model_name = 'small_quantized' # folder name
model = ORTModelForSpeechSeq2Seq.from_pretrained(model_name, export=False)
tokenizer = WhisperTokenizerFast.from_pretrained(model_name)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)


quantized_pipe = pipeline('automatic-speech-recognition', 
                model=model, 
                tokenizer=tokenizer, 
                feature_extractor=feature_extractor,
                return_timestamps=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Generation config file not found, using a generation config created from the model config.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [43]:
# Create pipeline with original model as baseline
original_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [44]:
for i in range(5):
    print("Iteration ", i+1)
    compare_time()
    print()

Iteration  1
Original inference time: 1.469625
Quantized inference time: 2.5000235

Iteration  2
Original inference time: 1.5378675
Quantized inference time: 1.218381

Iteration  3
Original inference time: 1.41255425
Quantized inference time: 1.2871545

Iteration  4
Original inference time: 1.4106405
Quantized inference time: 1.467281

Iteration  5
Original inference time: 1.46851925
Quantized inference time: 1.1491705

