In [32]:
!pip install openvino-dev[EXTRAS]
!pip install transformers==4.11.3
!pip install openvino



In [33]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [34]:
num_labels = 6 
emotions_list = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
class_to_idx = {'Angry': 0, 'Disgust': 1, 'Fear': 2, 'Happy': 3, 'Neutral': 4, 'Sad': 5, 'Surprise': 6}
idx_to_class = {0: 'Angry', 1: 'Disgust', 2: 'Fear', 3: 'Happy', 4: 'Neutral', 5: 'Sad', 6: 'Surprise'}

In [35]:
import librosa
import torchaudio
import torch
import numpy as np
from transformers import AutoConfig, Wav2Vec2Processor, AutoModelForAudioClassification

#Pytorch

In [36]:
model_name_or_path = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
pooling_mode = "mean"
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate

In [37]:
def preprocess_function_eval(speech_path):   
    speech_array, sampling_rate = librosa.load(speech_path, sr = 16000)
    result = processor(speech_array, sampling_rate=target_sampling_rate, max_length=50000, padding=True, truncation=True, return_attention_mask=True)
    len_of_input_data = result['input_values'][0].shape[0]
    padded_array = np.pad(result['input_values'][0], ((0,50000-len_of_input_data)), constant_values=0)
    return padded_array

In [38]:
# config
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=class_to_idx,
    id2label=idx_to_class,
    finetuning_task="wav2vec2_clf",    
)
setattr(config, 'pooling_mode', pooling_mode)

In [77]:
model = AutoModelForAudioClassification.from_pretrained(
    '/content/gdrive/MyDrive/group_course_work/checkpoint-1365', # <- insert folder with xml, config and bin
    num_labels=6,
    label2id=class_to_idx,
    id2label=idx_to_class,
) # acc 0.415144

404 Client Error: Not Found for url: https://huggingface.co//content/gdrive/MyDrive/group_course_work/checkpoint-1365/resolve/main/config.json


OSError: ignored

In [78]:
model(preprocess_function_eval(PATH_TO_WAV_FILE)) #outputs logits

NameError: ignored

In [79]:
!mo --input_model /content/gdrive/MyDrive/group_course_work/wav2vec2.onnx --data_type FP16 --static_shape #line to transform onnx to fp16 or fp32

Model Optimizer arguments:
Common parameters:
	- Path to the Input Model: 	/content/gdrive/MyDrive/group_course_work/wav2vec2.onnx
	- Path for generated IR: 	/content/.
	- IR output name: 	wav2vec2
	- Log level: 	ERROR
	- Batch: 	Not specified, inherited from the model
	- Input layers: 	Not specified, inherited from the model
	- Output layers: 	Not specified, inherited from the model
	- Input shapes: 	Not specified, inherited from the model
	- Source layout: 	Not specified
	- Target layout: 	Not specified
	- Layout: 	Not specified
	- Mean values: 	Not specified
	- Scale values: 	Not specified
	- Scale factor: 	Not specified
	- Precision of IR: 	FP16
	- Enable fusing: 	True
	- User transformations: 	Not specified
	- Reverse input channels: 	False
	- Enable IR generation for fixed input shape: 	True
	- Use the transformations config file: 	None
Advanced parameters:
	- Force the usage of legacy Frontend of Model Optimizer for model conversion into IR: 	False
	- Force the usage of new Fron

#OpenVino and ONNX

In [None]:
!python -c "from openvino.runtime import Core"

In [6]:
from openvino.runtime import Core

ie = Core()

In [7]:
devices = ie.available_devices

for device in devices:
    device_name = ie.get_property(device_name=device, name="FULL_DEVICE_NAME")
    print(f"{device}: {device_name}")

CPU:            Intel(R) Xeon(R) CPU @ 2.30GHz


### ONNX model

In [None]:
# input_data = torch.randn((1,50000))
input_data = preprocess_function_eval('/content/gdrive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/Angry/000149120.wav')

In [None]:
from openvino.runtime import Core

ie = Core()
onnx_model_path = "/content/gdrive/MyDrive/group_course_work/wav2vec2.onnx" #<- check onnx file using opnvino engine
model_onnx = ie.read_model(model=onnx_model_path)
model.reshape([1, 50000])
compiled_model_onnx = ie.compile_model(model=model_onnx, device_name="CPU")

In [None]:
result_onnx = compiled_model_onnx([input_data])

In [None]:
[*result_onnx.values()][0] #logits from onnx model

array([[ 0.21841261, -0.14379963, -0.06608591, -0.13532831,  0.28880668,
         0.02028291, -0.23602264]], dtype=float32)

#OpenVino models (fp32 and fp16)

In [39]:
# input_data = preprocess_function_eval('/content/gdrive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/Angry/000149120.wav')

In [80]:
from openvino.runtime import Core

ie = Core()
classification_model_xml = "/content/gdrive/MyDrive/group_course_work/fp16_w2v2/wav2vec2.xml" #<- insert here xml file where folder contains .bin weight file
model = ie.read_model(model=classification_model_xml)
model.reshape([1, 50000])
compiled_model = ie.compile_model(model=model, device_name="CPU") 
input_layer = compiled_model.input(0)
output_layer = compiled_model.output(0)

In [81]:
result = compiled_model([np.expand_dims(input_data, 0)])[output_layer]

In [82]:
result

array([[ 0.25195688, -0.11307321,  0.0417524 , -0.18132648,  0.0805621 ,
         0.06590223, -0.16228062]], dtype=float32)

In [43]:
import pathlib

In [44]:
emotions_path = [*pathlib.Path('/content/gdrive/MyDrive/sum_paper/AFEW/Val_AFEW_audio').iterdir()][:7]

In [45]:
emotions_path

[PosixPath('/content/gdrive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/Sad'),
 PosixPath('/content/gdrive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/Happy'),
 PosixPath('/content/gdrive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/Surprise'),
 PosixPath('/content/gdrive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/Neutral'),
 PosixPath('/content/gdrive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/Angry'),
 PosixPath('/content/gdrive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/Disgust'),
 PosixPath('/content/gdrive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/Fear')]

In [46]:
all_val_paths = []
for i in emotions_path:
    for ii in i.iterdir():
      all_val_paths.append(ii)

In [47]:
len(all_val_paths)

383

In [83]:
predicted_values = []
for audio in all_val_paths:
  input_data = preprocess_function_eval(audio)
  result = np.argmax(compiled_model([np.expand_dims(input_data, 0)])[output_layer])
  predicted_values.append(result)

In [61]:
len(predicted_values)

383

In [58]:
val_labels_emotions = []
for i in emotions_path:
    for ii in i.iterdir():
      val_labels_emotions.append((str(ii.parent).split('/')[-1]))

In [62]:
val_labels_nums = []
for label_name in val_labels_emotions:
    val_labels_nums.append(class_to_idx.get(label_name))

In [84]:
len(val_labels_nums)

383

In [85]:
from sklearn.metrics import accuracy_score

In [86]:
accuracy_score(val_labels_nums, predicted_values)

0.412532637075718