In [15]:
import tensorflow as tf
from transformers import Wav2Vec2Processor, TFWav2Vec2ForCTC
from datasets import load_dataset
import soundfile as sf
import onnxruntime as rt
import tf2onnx
import numpy as np

In [2]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

2021-12-31 17:53:45.206338: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFWav2Vec2ForCTC.

All the layers of TFWav2Vec2ForCTC were initialized from the model checkpoint at facebook/wav2vec2-base-960h.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFWav2Vec2ForCTC for predictions without further training.


In [3]:
def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch

In [4]:
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)

Reusing dataset librispeech_asr (/Users/andreasgyascok/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)
Loading cached processed dataset at /Users/andreasgyascok/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc/cache-0aee31f7b335be94.arrow


In [5]:
input_values = processor(ds["speech"][1], return_tensors="tf").input_values  # Batch size 1

It is strongly recommended to pass the ``sampling_rate`` argument to this function.Failing to do so can result in silent errors that might be hard to debug.


In [6]:
input_values.shape

TensorShape([1, 104560])

In [7]:
logits = model(input_values).logits

In [8]:
predicted_ids = tf.argmax(logits, axis=-1)

In [30]:
predicted_ids[0].shape

TensorShape([326])

In [9]:
transcription = processor.decode(predicted_ids[0])
transcription

"SWEAT COVERED BRION'S BODY TRICKLING INTO THE TIGHT LOWING CLOTH THAT WAS THE ONLY GARMENT HE WORE"

In [10]:
model.summary()

Model: "tf_wav2vec2_for_ctc"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
wav2vec2 (TFWav2Vec2MainLaye multiple                  94371712  
_________________________________________________________________
dropout_50 (Dropout)         multiple                  0         
_________________________________________________________________
lm_head (Dense)              multiple                  24608     
Total params: 94,396,320
Trainable params: 94,396,320
Non-trainable params: 0
_________________________________________________________________


In [11]:
AUDIO_MAXLEN = 150000
ONNX_PATH = "onnx-wav2vec2-150k.onnx"

input_signature = (tf.TensorSpec((None, AUDIO_MAXLEN), tf.float32, name="speech"),)
_ = tf2onnx.convert.from_keras(model, input_signature=input_signature, output_path=ONNX_PATH)

2021-12-31 17:55:04.405611: I tensorflow/core/grappler/devices.cc:78] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2021-12-31 17:55:04.445949: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize
  function_optimizer: function_optimizer did nothing. time = 0.006ms.
  function_optimizer: function_optimizer did nothing. time = 0.001ms.



Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`


2021-12-31 17:55:24.342254: I tensorflow/core/grappler/devices.cc:78] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2021-12-31 17:55:30.773847: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1144] Optimization results for grappler item: graph_to_optimize
  constant_folding: Graph size after: 3179 nodes (-234), 4022 edges (-246), time = 3617.25903ms.
  function_optimizer: function_optimizer did nothing. time = 48.184ms.
  constant_folding: Graph size after: 3179 nodes (0), 4022 edges (0), time = 682.623ms.
  function_optimizer: function_optimizer did nothing. time = 53.344ms.



In [20]:
session = rt.InferenceSession(ONNX_PATH)
aspeech = np.array(ds["speech"][1], dtype=np.float32)
aspeech = np.expand_dims(aspeech, axis=0)
padding = np.zeros((aspeech.shape[0], AUDIO_MAXLEN - aspeech.shape[1]), dtype=np.float32)
speech = np.concatenate([aspeech, padding], axis=-1)
onnx_outputs = session.run(None, {"speech": speech})[0]

In [37]:
predicted = np.argmax(onnx_outputs, axis=-1)[0]
transcription = processor.decode(predicted_ids[0])
transcription

"SWEAT COVERED BRION'S BODY TRICKLING INTO THE TIGHT LOWING CLOTH THAT WAS THE ONLY GARMENT HE WORE"

## ONNX

In [None]:
!pip3 install -qU tf2onnx onnxruntime
!pip3 install -q git+https://github.com/vasudevgupta7/gsoc-wav2vec2@main

In [None]:
from wav2vec2 import Wav2Vec2ForCTC

model_id = "vasudevgupta/gsoc-wav2vec2-960h"
model = Wav2Vec2ForCTC.from_pretrained(model_id)

In [None]:
AUDIO_MAXLEN = 50000
ONNX_PATH = "onnx-wav2vec2.onnx"

In [None]:
input_signature = (tf.TensorSpec((None, AUDIO_MAXLEN), tf.float32, name="speech"),)
_ = tf2onnx.convert.from_keras(model, input_signature=input_signature, output_path=ONNX_PATH)

In [None]:
!wget https://github.com/vasudevgupta7/gsoc-wav2vec2/raw/main/data/sample.wav

In [None]:
from wav2vec2 import Wav2Vec2Processor

processor = Wav2Vec2Processor(is_tokenizer=False)

In [None]:
FILENAME = "sample.wav"

speech, _ = sf.read(FILENAME)
speech = tf.constant(speech, dtype=tf.float32)
speech = processor(speech)[None]

padding = tf.zeros((speech.shape[0], AUDIO_MAXLEN - speech.shape[1]))
speech = tf.concat([speech, padding], axis=-1)
speech.shape

In [None]:
session = rt.InferenceSession(ONNX_PATH)

In [None]:
@tf.function(jit_compile=True)
def jitted_forward(speech):
    return model(speech)

In [None]:
onnx_outputs = session.run(None, {"speech": speech.numpy()})[0]
tf_outputs = jitted_forward(speech)

assert np.allclose(onnx_outputs, tf_outputs.numpy(), atol=1e-2)

In [None]:
tokenizer = Wav2Vec2Processor(is_tokenizer=True)
prediction = np.argmax(onnx_outputs, axis=-1)
prediction = tokenizer.decode(prediction.squeeze().tolist())

In [None]:
from IPython.display import Audio
print("prediction:", prediction)
Audio(filename=FILENAME)

In [None]:
aspeech = np.array(ds["speech"][1], dtype=np.float32)
aspeech = np.expand_dims(aspeech, axis=0)[:,:50000]
onnx_outputs = session.run(None, {"speech": aspeech})[0]

In [None]:
prediction = np.argmax(onnx_outputs, axis=-1)
prediction = tokenizer.decode(prediction.squeeze().tolist())
prediction

In [None]:
speech.numpy().shape

In [None]:
aspeech = np.array(ds["speech"][1], dtype=np.float32)
aspeech = np.expand_dims(aspeech, axis=0)[:,:50000]
aspeech[:, 25000:50000] = 0


In [None]:
onnx_outputs = session.run(None, {"speech": aspeech})[0]
prediction = np.argmax(onnx_outputs, axis=-1)
prediction = tokenizer.decode(prediction.squeeze().tolist())
prediction

In [None]:
aspeech = np.array(ds["speech"][1], dtype=np.float32)
aspeech = np.expand_dims(aspeech, axis=0)[:,:40000]
onnx_outputs = session.run(None, {"speech": aspeech})[0]
prediction = np.argmax(onnx_outputs, axis=-1)
prediction = tokenizer.decode(prediction.squeeze().tolist())
prediction

In [None]:
sent = np.array(ds["speech"])

In [None]:
speech_length = []
for i in ds["speech"]:
    speech_length.append(len(i))
max(speech_length)