In [4]:
import numpy as np
import io
import librosa
import onnxruntime as ort
from transformers import Wav2Vec2FeatureExtractor

# Session options shared by every InferenceSession created in this notebook;
# request ONNX Runtime's ORT_ENABLE_ALL graph-optimization level.
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Display the execution providers this onnxruntime installation reports.
ort.get_available_providers()

['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']

In [2]:
# Feature extractor loaded from the local model directory; it turns a raw
# waveform into the `input_values` array fed to the ONNX model.
processor = Wav2Vec2FeatureExtractor.from_pretrained("models/finals_audio_model")

# ONNX Runtime session for the exported model. Providers are listed in
# preference order: CUDA first, CPU as the fallback.
model = ort.InferenceSession(
    'model.onnx',
    sess_options=sess_options,
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
)

In [35]:
# Class index returned by predict() -> human-readable label.
id2label = dict(enumerate(("angry_sad", "happy_neutral")))

def predict(filepath, sampling_rate=16000, verbose=True):
    """Classify one audio file with the ONNX model and return its class index.

    The file is first read fully into memory and then decoded from those
    bytes (instead of handing the path to librosa) to mimic how the TIL SDK
    supplies audio.

    Args:
        filepath: Path of the audio file to classify.
        sampling_rate: Sample rate in Hz passed both to ``librosa.load`` and
            to the feature extractor. The two must agree, so a single
            parameter drives both. Defaults to 16000.
        verbose: When true (the default, matching the original behaviour),
            print the raw model output before the argmax is taken.

    Returns:
        The argmax over the last axis of the model's first output, squeezed
        and converted with ``tolist()``. For a single clip this is a plain
        ``int`` that can be used as a key of ``id2label``.
    """
    with open(filepath, 'rb') as f:
        audio_bytes = f.read()  # simulates the behavior of the TIL SDK of supplying bytes

    # librosa.load returns (samples, rate); the rate is not needed here
    # because we asked for `sampling_rate` explicitly.
    speech_array, _ = librosa.load(io.BytesIO(audio_bytes), sr=sampling_rate, mono=True)
    features = processor(speech_array, sampling_rate=sampling_rate, return_tensors="np")

    # The session is fed through its first declared input; only the first
    # output of the run is used.
    input_name = model.get_inputs()[0].name
    raw_scores = model.run(None, {input_name: features.input_values})[0]
    if verbose:
        print(raw_scores)

    return np.argmax(raw_scores, axis=-1).squeeze().tolist()


# Smoke test: classify one clip from the happy_neutral folder and show its label.
id2label[predict('data/finals_combined_singaporean/happy_neutral/aeaacae349.wav')]

[[-4.227799  4.726426]]


'happy_neutral'

In [19]:
# Step-by-step version of what predict() does up to feature extraction,
# kept at top level so the intermediate values can be inspected.
with open('data/finals_combined_singaporean/happy_neutral/aeaacae349.wav', 'rb') as f:
    data = f.read()

# Decode the in-memory bytes at 16 kHz mono, then run the feature extractor
# and ask for NumPy output.
speech_array, sr = librosa.load(io.BytesIO(data), sr=16000, mono=True)
features = processor(speech_array, sampling_rate=16000, return_tensors="np")
# NOTE(review): this cell ends with an assignment, so it displays nothing when
# run; the recorded array output presumably came from an earlier revision that
# ended with `speech_array` — confirm with Restart & Run All.

array([ 0.00011178,  0.00182088,  0.00159959, ...,  0.00527226,
        0.0024491 , -0.00182516], dtype=float32)