In [8]:
!pip install openvino-dev[EXTRAS]
!pip install transformers==4.11.3
!pip install openvino



In [2]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the checkpoint and exported models used below
# are read from /content/gdrive/MyDrive/group_course_work.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
# Emotion classes in label-index order. The label count and both lookup tables are
# derived from this list so they can never disagree with each other.
emotions_list = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
# Number of output classes (7), computed rather than hard-coded.
num_labels = len(emotions_list)
# Label name -> class index.
class_to_idx = {emotion: idx for idx, emotion in enumerate(emotions_list)}
# Class index -> label name.
idx_to_class = {idx: emotion for idx, emotion in enumerate(emotions_list)}

In [10]:
import librosa
import torchaudio
import torch
from transformers import AutoConfig, Wav2Vec2Processor, AutoModelForAudioClassification

# PyTorch

In [5]:
# Name of the pretrained wav2vec2 checkpoint whose processor and config are loaded below.
model_name_or_path = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
# Pooling strategy name; it is stored on the config object in a later cell.
pooling_mode = "mean"
# Processor that turns raw waveforms into model inputs.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
# Sampling rate reported by the processor's feature extractor; audio is passed to it at this rate.
target_sampling_rate = processor.feature_extractor.sampling_rate

Downloading:   0%|          | 0.00/262 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/300 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

In [6]:
def preprocess_function_eval(speech_path, max_length=50000, return_tensors=None):
    """Load one audio file and convert it into inputs for the wav2vec2 model.

    Args:
        speech_path: Path of the audio file to load.
        max_length: Maximum number of samples kept; longer clips are truncated.
        return_tensors: Forwarded to the processor (for example ``"pt"`` to get
            PyTorch tensors). The default ``None`` keeps the processor's default
            output type.

    Returns:
        The processor output for the clip, including the attention mask.
    """
    # Resample to the rate the processor is told about below, instead of a separate
    # hard-coded 16000, so the two values cannot drift apart.
    speech_array, _ = librosa.load(speech_path, sr=target_sampling_rate)
    result = processor(
        speech_array,
        sampling_rate=target_sampling_rate,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors=return_tensors,
    )
    return result

In [7]:
# config
# Classifier configuration: load the base checkpoint's config and attach this
# notebook's label count, label maps and task name to it.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=class_to_idx,
    id2label=idx_to_class,
    finetuning_task="wav2vec2_clf",
)
# Record the pooling strategy as an extra attribute on the config.
config.pooling_mode = pooling_mode

In [9]:
# Load the fine-tuned emotion classifier from the training checkpoint.
# Point the path at the checkpoint folder (model config plus weight file).
model = AutoModelForAudioClassification.from_pretrained(
    '/content/gdrive/MyDrive/group_course_work/checkpoint-1365',
    # One output per emotion class; derived from the label map so it always
    # matches label2id / id2label (a literal 6 contradicted the seven labels).
    num_labels=len(idx_to_class),
    label2id=class_to_idx,
    id2label=idx_to_class,
)  # acc 0.415144

In [None]:
# Run the PyTorch classifier on one audio file and display its logits.
# PATH_TO_WAV_FILE is a placeholder: set it to the .wav file to classify before running this cell.
features = preprocess_function_eval(PATH_TO_WAV_FILE)
with torch.no_grad():  # inference only, no gradients needed
    outputs = model(
        # The model takes tensors by keyword, not the processor output object itself.
        input_values=torch.as_tensor(features["input_values"], dtype=torch.float32),
        attention_mask=torch.as_tensor(features["attention_mask"], dtype=torch.long),
    )
outputs.logits

In [18]:
# Model Optimizer command that converts the ONNX model to OpenVINO IR; set --data_type to FP16 or FP32 to choose the precision.
# !mo --input_model /content/gdrive/MyDrive/group_course_work/wav2vec2.onnx --data_type FP32 --static_shape

Model Optimizer arguments:
Common parameters:
	- Path to the Input Model: 	/content/gdrive/MyDrive/group_course_work/wav2vec2.onnx
	- Path for generated IR: 	/content/.
	- IR output name: 	wav2vec2
	- Log level: 	ERROR
	- Batch: 	Not specified, inherited from the model
	- Input layers: 	Not specified, inherited from the model
	- Output layers: 	Not specified, inherited from the model
	- Input shapes: 	Not specified, inherited from the model
	- Source layout: 	Not specified
	- Target layout: 	Not specified
	- Layout: 	Not specified
	- Mean values: 	Not specified
	- Scale values: 	Not specified
	- Scale factor: 	Not specified
	- Precision of IR: 	FP32
	- Enable fusing: 	True
	- User transformations: 	Not specified
	- Reverse input channels: 	False
	- Enable IR generation for fixed input shape: 	True
	- Use the transformations config file: 	None
Advanced parameters:
	- Force the usage of legacy Frontend of Model Optimizer for model conversion into IR: 	False
	- Force the usage of new Fron

# OpenVINO and ONNX

In [11]:
# Smoke test: confirm in a separate Python process that the OpenVINO runtime can be imported.
!python -c "from openvino.runtime import Core"

In [12]:
from openvino.runtime import Core

# OpenVINO runtime entry point, used below to list devices and to read and compile models.
ie = Core()

In [13]:
# Print each device the OpenVINO runtime reports, together with its full device name.
devices = ie.available_devices

for device in devices:
    device_name = ie.get_property(device_name=device, name="FULL_DEVICE_NAME")
    print(f"{device}: {device_name}")

CPU:            Intel(R) Xeon(R) CPU @ 2.20GHz


### ONNX model

In [14]:
# Dummy waveform used to compare the ONNX and OpenVINO IR models: batch of 1, 50000 samples.
# A dedicated seeded generator makes the input (and therefore the logits) reproducible
# across kernel restarts without touching torch's global RNG state.
input_data = torch.randn((1, 50000), generator=torch.Generator().manual_seed(0))
# Alternative with real audio (note the models below are reshaped to [1, 50000]):
# input_data = preprocess_function_eval(PATH_TO_WAV_FILE)

In [15]:
from openvino.runtime import Core

# Read the ONNX export directly with the OpenVINO runtime and compile it for CPU.
ie = Core()
onnx_model_path = "/content/gdrive/MyDrive/group_course_work/wav2vec2.onnx"  # ONNX file to check with the OpenVINO engine
model_onnx = ie.read_model(model=onnx_model_path)
# Fix the input shape to one 50000-sample clip. This must be called on the ONNX model
# that was just read; `model` is a different object (the PyTorch classifier loaded earlier).
model_onnx.reshape([1, 50000])
compiled_model_onnx = ie.compile_model(model=model_onnx, device_name="CPU")

In [16]:
# Run the compiled ONNX model. The torch tensor is converted to NumPy explicitly so the
# call does not rely on the runtime's implicit handling of torch.Tensor inputs.
result_onnx = compiled_model_onnx([input_data.numpy()])

In [17]:
# Show the first output of the ONNX run, i.e. the logits from the ONNX model.
next(iter(result_onnx.values()))

array([[ 0.21841261, -0.14379963, -0.06608591, -0.13532831,  0.28880668,
         0.02028291, -0.23602264]], dtype=float32)

# OpenVINO models (FP32 and FP16)

In [18]:
from openvino.runtime import Core

# Read the converted OpenVINO IR model and compile it for CPU.
ie = Core()
# Path to the IR .xml file; the matching .bin weight file must sit in the same folder.
classification_model_xml = "/content/gdrive/MyDrive/group_course_work/fp32_w2v2/wav2vec2.xml"
model = ie.read_model(model=classification_model_xml)
# Fix the input shape to one 50000-sample clip before compiling.
model.reshape([1, 50000])
compiled_model = ie.compile_model(model=model, device_name="CPU") 
# Handles to the first input and first output of the compiled model.
input_layer = compiled_model.input(0)
output_layer = compiled_model.output(0)

In [19]:
# Run the compiled IR model and pick its first output. The torch tensor is converted to NumPy
# explicitly so the call does not rely on the runtime's implicit handling of torch.Tensor inputs.
result = compiled_model([input_data.numpy()])[output_layer]

In [20]:
# Display the IR model's output for comparison with the ONNX logits shown earlier.
result

array([[ 0.2184112 , -0.14379936, -0.06608529, -0.13532735,  0.28880727,
         0.02028335, -0.23602293]], dtype=float32)