In [None]:
!pip install transformers
!pip install torch
!pip install pydub
!pip install numpy
!apt-get install git-lfs
!git lfs install
!pip install librosa

In [None]:
!git clone https://github.com/c4sh4/vk_voice_local

In [None]:
ls

In [4]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from google.colab import drive
from pydub import AudioSegment
from IPython.display import Audio
from dataclasses import dataclass
from typing import Optional, Tuple
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.file_utils import ModelOutput
from transformers import Wav2Vec2Processor, AutoModelForAudioClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# my repo
from vk_voice_local.ravdess_fast_open import create_RAVDESS_df_with_labels

In [None]:
drive.mount(r'/content/drive', force_remount=True) # my google disc

In [None]:
df = create_RAVDESS_df_with_labels('drive/MyDrive/vk_voice/datasets/RAVDESS', 16000)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Wav2Vec2Processor.from_pretrained('patrickvonplaten/wavlm-libri-clean-100h-base-plus')
model = AutoModelForAudioClassification.from_pretrained("Zahra99/wavlm-large-finetuned-iemocap")

In [8]:
cpnfig_ = model.config

In [None]:
print("Классы, которые есть в Zahra99/wavlm-large-finetuned-iemocap: ")
for class_id, class_label in cpnfig_.id2label.items():
    print(f"Class ID: {class_id}, Label: {class_label}")

WavLM, предобученный для SER на AIMOCAP имеет 4 класса:  
* 0) Angry
* 1) happy
* 2) neutral
* 3) sad

Классы RAVDESS:

* 01 = neutral,   
* 02 = calm,  
* 03 = happy,   
* 04 = sad,   
* 05 = angry,   
* 06 = fearful,   
* 07 = disgust,   
* 08 = surprised  


In [15]:
def predict_emotion_wavlm(audio_np_array, processor, model, device, sampling_rate = 16000):
    model.to(device)
    # processor
    input_values = processor(audio_np_array, return_tensors="pt", sampling_rate=sampling_rate).input_values
    input_values = input_values.to(device)

    # prediciton
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_class_id = torch.argmax(logits, dim=-1)
    # return predicted_class_id.item()
    return model.config.id2label[predicted_class_id.item()]

In [31]:
df_test = df[df['label'].isin([1,3,4,5])].copy()

In [32]:
print(df_test)

                                                  audio label
4     [9.734966e-06, 4.337436e-07, -4.2628585e-07, 4...     1
5     [-5.684342e-14, -1.7053026e-13, 0.0, 3.410605e...     1
6     [-4.973799e-14, -2.629008e-13, -1.9895197e-13,...     1
7     [1.6705712e-07, 2.1253032e-05, 2.0601543e-05, ...     1
8     [-3.0412157e-06, -1.0787139e-05, -1.6587239e-0...     1
...                                                 ...   ...
1419  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...     5
1420  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...     5
1421  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...     5
1422  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...     5
1423  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...     5

[768 rows x 2 columns]


In [39]:
predicted_labels = []
true_labels = df_test['label'].tolist()

for index, row in df_test.iterrows():
    audio_data = row['audio']
    predicted_label = predict_emotion_wavlm(audio_data, processor, model, device, 16000)
    predicted_labels.append(predicted_label)

In [40]:
label_mapping = {
    "ang": 5,  #
    "hap": 3,  #
    "neu": 1,  #
    "sad": 4,  #
}

In [41]:
class_names = ["angry", "happy",  "neutral", "sad"]

In [42]:
mapped_predicted_labels = np.array([label_mapping[pred] for pred in predicted_labels])
mapped_predicted_labels[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 5,
       5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5,
       5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 3, 3, 1, 3, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 3, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5,

In [43]:
accuracy = accuracy_score(true_labels, mapped_predicted_labels)
f1_weighted = f1_score(true_labels, mapped_predicted_labels, average='weighted')
precision_weighted = precision_score(true_labels, mapped_predicted_labels, average='weighted', zero_division=0)
recall_weighted = recall_score(true_labels, mapped_predicted_labels, average='weighted')

f1_per_class = f1_score(true_labels, mapped_predicted_labels, average=None)
precision_per_class = precision_score(true_labels, mapped_predicted_labels, average=None, zero_division=0)
recall_per_class = recall_score(true_labels, mapped_predicted_labels, average=None)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1_weighted}")
print(f"Precision: {precision_weighted}")
print(f"Recall: {recall_weighted}")

print("\nMetrics per class:")
for class_idx, (f1, prec, rec) in enumerate(zip(f1_per_class, precision_per_class, recall_per_class)):
    print(f"Class {class_names[class_idx]} - F1: {f1}, Precision: {prec}, Recall: {rec}")
    # print(class_idx)

Accuracy: 0.3854166666666667
F1 Score: 0.27718252959331885
Precision: 0.4317479537433356
Recall: 0.3854166666666667

Metrics per class:
Class angry - F1: 0.6063829787234043, Precision: 0.6195652173913043, Recall: 0.59375
Class happy - F1: 0.04060913705583756, Precision: 0.8, Recall: 0.020833333333333332
Class neutral - F1: 0.0, Precision: 0.0, Recall: 0.0
Class sad - F1: 0.46173800259403375, Precision: 0.307426597582038, Recall: 0.9270833333333334
