# Wav2Vec2 (Speech Recognition) optimized to ONNX
**SageMaker Studio Kernel**: Data Science

In this tutorial you will learn how to convert a TF/Hugging Face transformer for Speech Recognition to ONNX. ONNX is an important optimization tool that allows you to run ML models everywhere (cloud, on-prem or at the edge) with good performance. These are the steps of this tutorial:
   - Install some required libraries
   - Compile the model with ONNX and then quantize the model
   - Run a benchmark to see how the performance was improved

## 1) Install required libraries

In [None]:
!apt-get update -y && apt install -y build-essential python-soundfile
%pip install tensorflow==2.5 onnx==1.7 onnxruntime==1.9.0 soundfile
%pip install -q git+https://github.com/vasudevgupta7/gsoc-wav2vec2@main

## 2) Run a SageMaker Processing Job to convert the TF2.5 model to ONNX 1.7
Convert the TensorFlow model to ONNX format with the SageMaker TensorFlowProcessor.

In [None]:
!mkdir -p code

In [None]:
%%writefile code/processing-script.py
import tensorflow as tf
import onnx
import os
import shutil
import tf2onnx

from wav2vec2 import Wav2Vec2Processor, Wav2Vec2ForCTC

# Pretrained, fine-tuned Keras model that gets exported to ONNX.
model = Wav2Vec2ForCTC.from_pretrained("vasudevgupta/finetuned-wav2vec2-960h")

# Fixed number of audio samples per example in the exported graph.
AUDIO_MAXLEN = 50000
ONNX_PATH = "wav2vec2.onnx"
# Container directory that the notebook's ProcessingOutput uploads to S3.
OUTPUT_DIR = "/opt/ml/processing/output"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# The exported graph takes a single float32 input named "speech" with a free
# leading dimension and AUDIO_MAXLEN samples per row.
input_signature = (tf.TensorSpec((None, AUDIO_MAXLEN), tf.float32, name="speech"),)

# No opset is passed, so the graph follows tf2onnx's default opset.
# from_keras serializes the converted model to output_path itself, so the file
# is written exactly once, directly into the uploaded directory.
tf2onnx.convert.from_keras(
    model,
    input_signature=input_signature,
    output_path=os.path.join(OUTPUT_DIR, ONNX_PATH),
)

In [None]:
%%writefile code/requirements.txt
tensorflow==2.5
tf2onnx==1.11.1
onnx==1.7
numpy==1.19.2
git+https://github.com/vasudevgupta7/gsoc-wav2vec2@main

In [None]:
import sagemaker
import boto3

from sagemaker.tensorflow import TensorFlowProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role


# SageMaker session; its default bucket holds the converted model.
sagemaker_session = sagemaker.Session()
BUCKET = sagemaker_session.default_bucket()

# Region of the current boto3 session, used to build the ECR image URI below.
region = boto3.session.Session().region_name

# S3 key prefix under which this experiment's output is stored.
S3_EXP_PATH = 'TF2ONNX'

# Processor that runs the conversion script on a single ml.m5.xlarge instance,
# using the TensorFlow 2.5 CPU training image from the current region.
tp = TensorFlowProcessor(
    framework_version='2.5',
    role=get_execution_role(),
    instance_type='ml.m5.xlarge',
    instance_count=1,
    base_job_name='ml-edge-tf25',
    image_uri=f'763104351884.dkr.ecr.{region}.amazonaws.com/tensorflow-training:2.5.0-cpu-py37-ubuntu18.04-v1.0',
    py_version='py37',
)

In [None]:
# Run the processing job: execute code/processing-script.py and upload
# whatever it leaves in the container's output directory to S3.
tp.run(
    code='processing-script.py',
    source_dir='code',
    outputs=[
        ProcessingOutput(
            output_name='wav2vec.onnx',
            # Container directory the script writes the ONNX file to.
            source='/opt/ml/processing/output',
            # S3 location the download cell reads the model from.
            destination=f's3://{BUCKET}/{S3_EXP_PATH}/output'
        )
    ]
)

### Download the ONNX model file. 
Also, let's download an audio sample

In [None]:
import os
# `import urllib` alone does not guarantee the `request` submodule is loaded;
# import it explicitly so `urllib.request.urlretrieve` always resolves.
import urllib.request

# Download the job output from S3 into the local ./model directory, which is
# where the later cells look for wav2vec2.onnx.
sagemaker_session.download_data('model', bucket=BUCKET, key_prefix=f'{S3_EXP_PATH}/output')

# Fetch the audio sample only once; re-running the cell reuses the local copy.
if not os.path.isfile('sample.wav'):
    urllib.request.urlretrieve('https://github.com/vasudevgupta7/gsoc-wav2vec2/raw/main/data/sample.wav', 'sample.wav')

### Load the audio sample and build a tensor (padding)

In [None]:
import soundfile as sf
# Imported here (not only in the later "Load the original TF model" cell) so
# that this cell works when the notebook is executed top to bottom.
import tensorflow as tf
from wav2vec2 import Wav2Vec2Processor

AUDIO_MAXLEN = 50000
FILENAME = "sample.wav"

# Same construction as in the TF model cell; needed here to prepare the audio.
processor = Wav2Vec2Processor(is_tokenizer=False)

speech, _ = sf.read(FILENAME)
speech = tf.constant(speech, dtype=tf.float32)
# Apply the processor, then add a leading axis so the tensor is 2-D like the
# (None, AUDIO_MAXLEN) input the ONNX model was exported with.
speech = processor(speech)[None]

# The exported model accepts exactly AUDIO_MAXLEN samples: truncate longer
# clips (otherwise the padding size below would be negative) and zero-pad
# shorter ones.
speech = speech[:, :AUDIO_MAXLEN]
padding = tf.zeros((speech.shape[0], AUDIO_MAXLEN - speech.shape[1]))
speech = tf.concat([speech, padding], axis=-1)
speech.shape

## 3) Benchmark on CPU - TF vs ONNX vs ONNX quantized

In [None]:
import time
from contextlib import contextmanager

@contextmanager
def timeit(prefix="Time taken:", iterations=1):
    """Time the enclosed block and print the average duration per iteration.

    Args:
        prefix: Label printed in front of the measurement.
        iterations: Number of repetitions executed inside the block; the
            elapsed time is divided by this value.

    Nothing is printed if the enclosed block raises.
    """
    # perf_counter is the clock meant for measuring intervals; time.time() is
    # a wall clock that can be adjusted while the benchmark runs.
    start = time.perf_counter()
    yield
    time_taken = (time.perf_counter() - start) / iterations
    print(prefix, time_taken, "seconds")

### Quantize ONNX model first

In [None]:
import onnx
from onnxruntime.quantization import quantize_dynamic

# https://github.com/microsoft/onnxruntime/issues/3130
def quantize_onnx_model(onnx_model_path, quantized_model_path):
    """Dynamically quantize an ONNX model and write the result to disk.

    Args:
        onnx_model_path: Path of the ONNX model to quantize.
        quantized_model_path: Path the quantized model is written to.
    """
    # The cell-level import only brings in quantize_dynamic, so QuantType is
    # imported here to keep the function self-contained.
    from onnxruntime.quantization import QuantType

    # quantize_dynamic reads the model from its path, so there is no need to
    # load it separately. Node include/exclude lists and extra options are
    # left at their defaults.
    quantize_dynamic(
        onnx_model_path,
        quantized_model_path,
        weight_type=QuantType.QUInt8,
    )
    print(f"quantized model saved to:{quantized_model_path}")


quantize_onnx_model("./model/wav2vec2.onnx", "./model/wav2vec2_quant.onnx")
!du -sh model/*.onnx

### Load the original TF model

In [None]:
import tensorflow as tf
from wav2vec2 import Wav2Vec2Processor, Wav2Vec2ForCTC

# Callable applied to the raw audio tensor before it is fed to the models.
processor = Wav2Vec2Processor(is_tokenizer=False)
# Used further down to decode predicted token ids back into text.
tokenizer = Wav2Vec2Processor(is_tokenizer=True)
# Original TensorFlow model; the baseline the ONNX models are compared against.
model_tf = Wav2Vec2ForCTC.from_pretrained("vasudevgupta/finetuned-wav2vec2-960h")

### Load ONNX & ONNX quantized models

In [None]:
import onnxruntime as rt

# Paths of the quantized and the original ONNX models inside ./model.
model_quant_onnx= "./model/wav2vec2_quant.onnx"
model_onnx= "./model/wav2vec2.onnx"

# One inference session per model, created with onnxruntime's defaults
# (no session options or providers are passed).
session_qt = rt.InferenceSession(model_quant_onnx)
session_x = rt.InferenceSession(model_onnx)

### Warmup models

In [None]:
# Run every model once so one-time initialization does not end up in the
# benchmark measurements.
y1 = model_tf(speech)
y2 = session_x.run(None, {"speech": speech.numpy()})[0]
y3 = session_qt.run(None, {"speech": speech.numpy()})[0]

### Benchmark all three models

In [None]:
iterations = 10

# Convert the input to NumPy once, outside the timed regions, so the ONNX
# timings measure inference only and not the tensor conversion.
onnx_inputs = {"speech": speech.numpy()}

with timeit(prefix="TF 2.5 model - time take:", iterations=iterations):
    for _ in range(iterations):
        model_tf(speech)

with timeit(prefix="ONNX time taken:", iterations=iterations):
    for _ in range(iterations):
        session_x.run(None, onnx_inputs)

with timeit(prefix="ONNX quantized time taken:", iterations=iterations):
    for _ in range(iterations):
        session_qt.run(None, onnx_inputs)

### Check output tensors

In [None]:
import numpy as np

tf_outputs = model_tf(speech)
onnx_outputs = session_x.run(None, {"speech": speech.numpy()})[0]
onnx_quant_outputs = session_qt.run(None, {"speech": speech.numpy()})[0]

# Compare everything as NumPy arrays.
tf_logits = np.asarray(tf_outputs)

print("max abs error TF vs ONNX:", np.max(np.abs(tf_logits - onnx_outputs)))
print("max abs error TF vs ONNX quantized:", np.max(np.abs(tf_logits - onnx_quant_outputs)))

# The float ONNX export must reproduce the TF outputs closely.
assert np.allclose(tf_logits, onnx_outputs, atol=1e-2)

# Quantization changes the raw outputs substantially (the original note on
# this check put the error at ~15), so a tight allclose cannot hold for the
# quantized model. Report how often the per-frame argmax, which is what the
# transcription is decoded from, still matches the TF model instead.
argmax_agreement = np.mean(
    np.argmax(tf_logits, axis=-1) == np.argmax(onnx_quant_outputs, axis=-1)
)
print("argmax agreement TF vs ONNX quantized:", argmax_agreement)

### Check transcriptions

In [None]:
import numpy as np
from IPython.display import Audio
# Same construction as in the TF model cell; decodes token ids into text below.
tokenizer = Wav2Vec2Processor(is_tokenizer=True)

#### TF Model

In [None]:
# Most likely token id per frame, flattened to a plain Python list.
token_ids = np.argmax(tf_outputs, axis=-1).squeeze().tolist()
# Turn the id sequence into text.
prediction = tokenizer.decode(token_ids)
print("prediction from TF:", prediction)
Audio(filename=FILENAME)

#### ONNX Transcription

In [None]:
# Most likely token id per frame, flattened to a plain Python list.
token_ids = np.argmax(onnx_outputs, axis=-1).squeeze().tolist()
# Turn the id sequence into text.
prediction = tokenizer.decode(token_ids)
print("prediction from ONNX:", prediction)
Audio(filename=FILENAME)

#### ONNX Quantized Transcription

In [None]:
# Most likely token id per frame, flattened to a plain Python list.
token_ids = np.argmax(onnx_quant_outputs, axis=-1).squeeze().tolist()
# Turn the id sequence into text.
prediction = tokenizer.decode(token_ids)
print("prediction from Quantized ONNX:", prediction)
Audio(filename=FILENAME)