In [1]:
!pip install transformers
!pip install torch
!pip install pydub
!pip install numpy
!apt-get install git-lfs
!git lfs install
!pip install librosa

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.
Git LFS initialized.


In [2]:
# Fetch the helper repository; the import cell below loads
# vk_voice_local.ravdess_fast_open from this checkout.
!git clone https://github.com/c4sh4/vk_voice_local

Cloning into 'vk_voice_local'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 12 (delta 2), reused 8 (delta 1), pack-reused 0[K
Receiving objects: 100% (12/12), 95.86 KiB | 691.00 KiB/s, done.
Resolving deltas: 100% (2/2), done.


In [3]:
ls

[0m[01;34msample_data[0m/  [01;34mvk_voice_local[0m/


In [4]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from google.colab import drive
from IPython.display import Audio
from dataclasses import dataclass
from typing import Optional, Tuple
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.file_utils import ModelOutput
from transformers import  Wav2Vec2FeatureExtractor, AutoConfig
from transformers.models.hubert.modeling_hubert import (
    HubertPreTrainedModel,
    HubertModel
)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# my repo
from vk_voice_local.ravdess_fast_open import create_RAVDESS_df_with_labels

In [5]:
# Mount Google Drive at /content/drive; the RAVDESS dataset is read from it in the next cell.
drive.mount(r'/content/drive', force_remount=True) # my Google Drive

Mounted at /content/drive


In [6]:
# Build the RAVDESS dataframe (columns used later: 'audio', 'label') from the copy on Google Drive.
# 16000 is presumably the target sampling rate in Hz (the same value is later passed to the
# feature extractor as sampling_rate) — TODO confirm against the helper's signature.
# NOTE(review): the path is relative, so it only points at the mounted drive while the
# working directory is /content. The recorded run of this cell took about 15 minutes.
df = create_RAVDESS_df_with_labels('drive/MyDrive/vk_voice/datasets/RAVDESS', 16000)

100%|██████████| 25/25 [14:59<00:00, 35.96s/it]



---


In [7]:
@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Container returned by ``HubertForSpeechClassification.forward`` when ``return_dict`` is truthy."""

    # Loss value; stays None when the model is called without labels.
    loss: Optional[torch.FloatTensor] = None
    # Raw class scores produced by the classification head (softmax is applied by the caller).
    logits: Optional[torch.FloatTensor] = None
    # Passed through unchanged from the encoder output's ``hidden_states`` attribute.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Passed through unchanged from the encoder output's ``attentions`` attribute.
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class HubertClassificationHead(nn.Module):
    """Two-layer classification head applied to a pooled HuBERT representation.

    Pipeline: dropout -> dense (hidden -> hidden) -> tanh -> dropout -> projection
    (hidden -> ``config.num_labels``). The same dropout module is used at both points.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        # Attribute names double as state-dict keys, so they must stay as they are.
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Map pooled features to class logits; extra keyword arguments are accepted and ignored."""
        hidden = self.dense(self.dropout(features))
        hidden = self.dropout(torch.tanh(hidden))
        return self.out_proj(hidden)


class HubertForSpeechClassification(HubertPreTrainedModel):
    """HuBERT encoder followed by pooling over dim 1 and a classification head.

    The encoder output is reduced with ``config.pooling_mode`` ('mean', 'sum' or 'max')
    and the pooled vector is passed to ``HubertClassificationHead`` to produce logits.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.hubert = HubertModel(config)
        self.classifier = HubertClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Call ``_freeze_parameters()`` on the encoder's feature extractor."""
        self.hubert.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Pool ``hidden_states`` over dim 1.

        Args:
            hidden_states: tensor to pool; dim 1 is presumably the frame/time axis
                of the encoder output — TODO confirm.
            mode: one of 'mean', 'sum' or 'max'.

        Returns:
            The tensor reduced over dim 1.

        Raises:
            ValueError: if ``mode`` is not one of the supported pooling modes.
        """
        if mode == "mean":
            return torch.mean(hidden_states, dim=1)
        if mode == "sum":
            return torch.sum(hidden_states, dim=1)
        if mode == "max":
            # torch.max over a dim returns (values, indices); only the values are needed.
            return torch.max(hidden_states, dim=1)[0]
        # ValueError is a subclass of Exception, so callers catching Exception still work.
        raise ValueError(
            "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Encode the input, pool it, classify it and optionally compute a loss.

        Args:
            input_values: model input forwarded to the HuBERT encoder.
            attention_mask: optional mask forwarded to the encoder. It is not used
                by the pooling step, so padded positions take part in the pooling.
            output_attentions: forwarded to the encoder.
            output_hidden_states: forwarded to the encoder.
            return_dict: when None, falls back to ``config.use_return_dict``.
            labels: optional targets; when given, a loss is computed according to
                ``config.problem_type`` (inferred on first use if it is None).

        Returns:
            ``SpeechClassifierOutput`` when ``return_dict`` is truthy, otherwise a tuple
            ``(loss?, logits, *encoder_outputs_after_the_first)``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.hubert(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                # Infer the task once from the number of labels and the label dtype.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # Squeeze both sides so (batch, 1) logits and (batch,) labels are compared
                    # element-wise instead of being broadcast to (batch, batch).
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # HubertModel's tuple output is (last_hidden_state, hidden_states?, attentions?):
            # everything after index 0 is an optional extra, so slice from 1 (slicing from 2
            # would silently drop the first optional output).
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

---


In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned emotion checkpoint into the custom HubertForSpeechClassification wrapper.
model = HubertForSpeechClassification.from_pretrained("Rajaram1996/Hubert_emotion")
# The feature extractor (input preprocessing) is taken from the base HuBERT repository,
# not from the fine-tuned checkpoint's repository.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of the model checkpoint at Rajaram1996/Hubert_emotion were not used when initializing HubertForSpeechClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSpeechClassification were not initialized from the model checkpoint at Rajaram1996/Hubert_emotion and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight

preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

In [9]:
# Move the model to the selected device; inputs are moved to the same device at prediction time.
model = model.to(device)

In [10]:
# Load the model configuration; its id2label table lists the classes of the checkpoint.
config = AutoConfig.from_pretrained("Rajaram1996/Hubert_emotion")
class_labels = config.id2label

# Print the header followed by one "Class ID / Label" line per class.
label_lines = [f"Class ID: {idx}, Label: {name}" for idx, name in class_labels.items()]
print("Классы, которые есть в Rajaram1996/Hubert_emotion: ", *label_lines, sep="\n")

Классы, которые есть в Rajaram1996/Hubert_emotion: 
Class ID: 0, Label: female_angry
Class ID: 1, Label: female_disgust
Class ID: 2, Label: female_fear
Class ID: 3, Label: female_happy
Class ID: 4, Label: female_neutral
Class ID: 5, Label: female_sad
Class ID: 6, Label: female_surprise
Class ID: 7, Label: male_angry
Class ID: 8, Label: male_disgust
Class ID: 9, Label: male_fear
Class ID: 10, Label: male_happy
Class ID: 11, Label: male_neutral
Class ID: 12, Label: male_sad
Class ID: 13, Label: male_surprise


Для теста на RAVDESS объединим женские и мужские классы модели (id женского, id мужского) по эмоциям:  
  * 0,7 - angry   
  * 1,8 - disgust  
  * 2,9 - fear  
  * 3,10 - happy   
  * 4,11 - neutral   
  * 5,12 - sad  
  * 6,13 - surprise   
  
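RAVDESS (после названия эмоции указан id соответствующего женского класса модели):    
  [neutral 4, happy 3, sad 5, angry 0, fearful 2, disgust 1, surprised 6]  
Соответствие «id классов модели (female, male) == метка RAVDESS»:  
    0, 7 == 4  (angry)  
    1, 8 == 6  (disgust)  
    2, 9 == 5  (fear)  
    3, 10 == 2  (happy)  
    4, 11 == 0  (neutral)  
    5, 12 == 3  (sad)  
    6, 13 == 7  (surprise)  
Метка 1 RAVDESS в тесте не используется: у модели нет соответствующего класса.  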


In [11]:
# Listen to one clip with RAVDESS label 3 (the id that "sad" predictions are mapped to later).
# The filter is computed once; the former bare `.iloc[0].audio` expression was removed because
# it was not the last expression of the cell, so its value was never displayed.
label_3_clips = df[df['label'] == 3]
Audio(data=label_3_clips.iloc[18].audio, rate=16000)

In [12]:
# Peek at the first ten rows (audio samples and their label ids).
df.head(10)

Unnamed: 0,audio,label
0,"[1.4078637e-07, -1.9284349e-07, 2.512112e-07, ...",0
1,"[-4.1346375e-06, -8.396215e-06, 1.9137656e-06,...",0
2,"[1.44035e-05, 2.7266444e-05, 3.2287273e-05, 2....",0
3,"[2.4156414e-05, 2.0974234e-05, -1.2405399e-06,...",0
4,"[9.734966e-06, 4.337436e-07, -4.2628585e-07, 4...",1
5,"[-5.684342e-14, -1.7053026e-13, 0.0, 3.410605e...",1
6,"[-4.973799e-14, -2.629008e-13, -1.9895197e-13,...",1
7,"[1.6705712e-07, 2.1253032e-05, 2.0601543e-05, ...",1
8,"[-3.0412157e-06, -1.0787139e-05, -1.6587239e-0...",1
9,"[-2.842171e-14, -4.405365e-13, 5.684342e-14, 0...",1


In [13]:
# Keep only the label ids that the model's classes are mapped to; label 1 has no counterpart.
has_model_class = df['label'].isin([0, 2, 3, 4, 5, 6, 7])
df_test = df.loc[has_model_class].copy()

In [14]:
def predict_emotion_hubert(sound_array, sampling_rate):
    """Return the model's label string for a single audio clip.

    Relies on the notebook-level ``feature_extractor``, ``model``, ``device`` and ``config``.

    Args:
        sound_array: audio samples handed to the feature extractor.
        sampling_rate: sampling rate passed to the feature extractor.

    Returns:
        The ``config.id2label`` entry of the highest-scoring class for the first
        (and, for a single clip, only) item of the batch.
    """
    encoded = feature_extractor(sound_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    # Every tensor produced by the extractor has to live on the same device as the model.
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        output = model(**model_inputs)

    probabilities = F.softmax(output.logits, dim=1).detach().cpu().numpy()
    best_class = np.argmax(probabilities[0])
    return config.id2label[best_class]

In [15]:
# Ground-truth label ids, in dataframe row order.
true_labels = df_test['label'].tolist()

# Raw model label strings for every clip, in the same row order; 16000 is passed as the sampling rate.
predicted_labels = [predict_emotion_hubert(clip, 16000) for clip in df_test['audio']]

In [16]:
# Spot-check one sample: the true label id next to the raw label string returned by the model
# (the string is converted to a label id in a later cell).
print(true_labels[3], predicted_labels[3])

0 male_neutral


In [17]:
# RAVDESS label id for each emotion name used in the model's class labels.
ravdess_id_by_emotion = {
    "angry": 4,
    "disgust": 6,
    "fear": 5,
    "happy": 2,
    "neutral": 0,
    "sad": 3,
    "surprise": 7,
}

# The model predicts "<gender>_<emotion>"; both genders collapse onto the same RAVDESS label id.
label_mapping = {
    f"{gender}_{emotion}": ravdess_id
    for emotion, ravdess_id in ravdess_id_by_emotion.items()
    for gender in ("female", "male")
}

In [18]:
# Display names for label ids 0, 2, 3, 4, 5, 6, 7 (in that order, per label_mapping);
# the per-class metrics cell pairs scores with these names by position.
class_names = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]

In [19]:
# Convert the model's label strings to RAVDESS label ids so they can be compared with true_labels.
predicted_ravdess_ids = [label_mapping[label_name] for label_name in predicted_labels]
mapped_predicted_labels = np.array(predicted_ravdess_ids)
mapped_predicted_labels

array([0, 0, 0, ..., 7, 7, 7])

In [20]:
# Label ids evaluated here, listed in the same order as class_names. Passing them to sklearn
# explicitly fixes the order and length of the per-class arrays instead of relying on the
# implicit "sorted labels found in the data" order.
eval_label_ids = [0, 2, 3, 4, 5, 6, 7]
assert len(eval_label_ids) == len(class_names), "class_names must have one name per evaluated label id"

# Overall scores (weighted by class support).
accuracy = accuracy_score(true_labels, mapped_predicted_labels)
f1_weighted = f1_score(true_labels, mapped_predicted_labels, average='weighted')
precision_weighted = precision_score(true_labels, mapped_predicted_labels, average='weighted', zero_division=0)
recall_weighted = recall_score(true_labels, mapped_predicted_labels, average='weighted')

# Per-class scores, one entry per id in eval_label_ids.
f1_per_class = f1_score(
    true_labels, mapped_predicted_labels, labels=eval_label_ids, average=None, zero_division=0)
precision_per_class = precision_score(
    true_labels, mapped_predicted_labels, labels=eval_label_ids, average=None, zero_division=0)
recall_per_class = recall_score(
    true_labels, mapped_predicted_labels, labels=eval_label_ids, average=None, zero_division=0)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1_weighted}")
print(f"Precision: {precision_weighted}")
print(f"Recall: {recall_weighted}")

print("\nMetrics per class:")
for class_name, f1, prec, rec in zip(class_names, f1_per_class, precision_per_class, recall_per_class):
    print(f"Class {class_name} - F1: {f1}, Precision: {prec}, Recall: {rec}")

Accuracy: 0.9302884615384616
F1 Score: 0.9310677783960717
Precision: 0.9357074643657416
Recall: 0.9302884615384616

Metrics per class:
Class neutral - F1: 0.8571428571428571, Precision: 0.75, Recall: 1.0
Class happy - F1: 0.9424083769633508, Precision: 0.9473684210526315, Recall: 0.9375
Class sad - F1: 0.8601583113456465, Precision: 0.8716577540106952, Recall: 0.8489583333333334
Class angry - F1: 0.9637305699481865, Precision: 0.9587628865979382, Recall: 0.96875
Class fearful - F1: 0.9146005509641874, Precision: 0.9707602339181286, Recall: 0.8645833333333334
Class disgust - F1: 0.9814323607427056, Precision: 1.0, Recall: 0.9635416666666666
Class surprised - F1: 0.961038961038961, Precision: 0.9585492227979274, Recall: 0.9635416666666666
