In [2]:
# References:
# https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb
# https://github.com/cristinalunaj/MMEmotionRecognition

import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys

In [3]:
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install jiwer
!pip install torchaudio
!pip install librosa

%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
%env CUDA_LAUNCH_BLOCKING=1

import os
os.environ['WANDB_DISABLED'] = 'true'

Collecting git+https://github.com/huggingface/datasets.git
  Cloning https://github.com/huggingface/datasets.git to /tmp/pip-req-build-2roguown
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/datasets.git /tmp/pip-req-build-2roguown
  Resolved https://github.com/huggingface/datasets.git to commit 90b1d94ef419cb26f0bb24d982897dca39aa8a46
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: datasets
  Building wheel for datasets (pyproject.toml) ... [?25ldone
[?25h  Created wheel for datasets: filename=datasets-2.21.1.dev0-py3-none-any.whl size=505366 sha256=94ad9764c434e2335352d637130fcf4b9bc1c7539bcd7be917aa1ae3f5652e0c
  Stored in directory: /tmp/pip-ephem-wheel-cache-ozeg7s74/wheels/57/f4/c4/53c677af89fec0ef3226c1e75a38367b37c2fa626f0544d3e4
Successfully built datasets
Installing collected packa

In [4]:
def prepare_RAVDESS_DS(path_audios):
    """
    Generation of the dataframe with the information of the dataset. The dataframe has the following structure:
     ______________________________________________________________________________________________________________________________
    |             name            |                     path                                   |     emotion      |     actor     |
    ______________________________________________________________________________________________________________________________
    |  01-01-01-01-01-01-01.wav   |    <RAVDESS_dir>/audios_16kHz/01-01-01-01-01-01-01.wav     |     Neutral      |     1         |
    ______________________________________________________________________________________________________________________________
    ...

    :param path_audios: Path to the folder that contains all the audios in .wav format, 16kHz and single-channel(mono)
    """
    dict_emotions_ravdess = {
        0: 'Neutral',
        1: 'Calm',
        2: 'Happy',
        3: 'Sad',
        4: 'Angry',
        5: 'Fear',
        6: 'Disgust',
        7: 'Surprise'
    }
    data = []
    for path in tqdm(Path(path_audios).glob("**/*.wav")):
        name = str(path).split('/')[-1].split('.')[0]
        label = dict_emotions_ravdess[int(name.split("-")[2]) - 1]  # Start emotions in 0
        actor = int(name.split("-")[-1])

        try:
            data.append({
                "name": name,
                "path": path,
                "emotion": label,
                "actor": actor
            })
        except Exception as e:
            # print(str(path), e)
            pass
    df = pd.DataFrame(data)
    return df


# running

DATASET_PATH = "/kaggle/input/ravdess-emotional-speech-audio"

df = prepare_RAVDESS_DS(DATASET_PATH)
df.head()

2880it [00:01, 2424.83it/s]


Unnamed: 0,name,path,emotion,actor
0,03-01-08-01-01-01-02,/kaggle/input/ravdess-emotional-speech-audio/A...,Surprise,2
1,03-01-01-01-01-01-02,/kaggle/input/ravdess-emotional-speech-audio/A...,Neutral,2
2,03-01-07-02-01-02-02,/kaggle/input/ravdess-emotional-speech-audio/A...,Disgust,2
3,03-01-07-01-01-02-02,/kaggle/input/ravdess-emotional-speech-audio/A...,Disgust,2
4,03-01-01-01-02-01-02,/kaggle/input/ravdess-emotional-speech-audio/A...,Neutral,2


In [5]:
# Looking at the labels in the dataset with what distribution

print("Labels: ", df["emotion"].unique())
print()
df.groupby("emotion").count()[["path"]]

Labels:  ['Surprise' 'Neutral' 'Disgust' 'Fear' 'Sad' 'Calm' 'Happy' 'Angry']



Unnamed: 0_level_0,path
emotion,Unnamed: 1_level_1
Angry,384
Calm,384
Disgust,384
Fear,384
Happy,384
Neutral,192
Sad,384
Surprise,384


In [6]:
# Look at a random sample audio and get a feel for how the data is

import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

idx = np.random.randint(0, len(df))
sample = df.iloc[idx]
path = sample["path"]
label = sample["emotion"]


print(f"ID Location: {idx}")
print(f"      Label: {label}")
print()

speech, sr = torchaudio.load(path)
speech = speech[0].numpy().squeeze()
speech = librosa.resample(np.asarray(speech), orig_sr=sr, target_sr=16_000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

ID Location: 1128
      Label: Neutral



In [7]:
def generate_train_test(df, save_path=""):
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["emotion"])

    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    if(save_path!=""):
        train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
        test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)
    
    return train_df, test_df


OUT_DIR_PATH = "/kaggle/working/"
data_path = os.path.join(OUT_DIR_PATH,"data")
os.makedirs(data_path, exist_ok=True)

train_df, test_df = generate_train_test(df, data_path)

print(train_df.shape)
print(test_df.shape)


(2304, 4)
(576, 4)


In [8]:
from datasets import load_dataset

# Loading data- this is loading data into HF Datasets. This differs from the previous cell where we load the input data into a pandas dataframe.
# The pandas dataframe helped us get the input data into a structured format and to generate training and test CSVs. We then load these CSVs 
# into HF dataset to work with models

data_files = {
    "train": os.path.join(data_path, "train.csv"),
    "validation": os.path.join(data_path, "test.csv"),
}

#Load data
dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

input_column = "path"
output_column = "emotion"

label_list = train_dataset.unique(output_column)
label_list.sort()  # Let's sort it for determinism
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

A classification problem with 8 classes: ['Angry', 'Calm', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']


In [9]:
from transformers import AutoConfig, Wav2Vec2Processor

# In order to preprocess data, we need to get the model config and attributes from the model we want to use.
# Here we get the model config and the sampling rate used by the baseline model which has been trained for English ASR


model_id = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
pooling_mode = "mean" #Type of pooling to apply to the embeddings generated ath the output of the transformer module to collapse all the timesteps of the recordingsinto a single vector


# LOAD PRE-TRAINED MODEL ON ASR
# config
config = AutoConfig.from_pretrained(
    model_id, #path to the model of HuggingFace lib. that we will use as baseline to fine-tune.
    num_labels=num_labels, # num classes
    label2id={label: i for i, label in enumerate(label_list)}, # dict that maps emotions -> numbers
    id2label={i: label for i, label in enumerate(label_list)}, # dict that maps numbers -> emotions
    finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', pooling_mode)


#Load the processor for the type of model (Wav2Vec2.0 in our case) and get the expected sampling rate (16kHZ in our case)
processor = Wav2Vec2Processor.from_pretrained(model_id, )
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

The target sampling rate: 16000




In [10]:
# We now need to extract features from the input audio files for us to be able to train the model to detect emotions
# Define all funcctions needed for preprocessing

def speech_file_to_array_fn(path):
    speech, sampling_rate = torchaudio.load(path)
    speech = speech[0].numpy().squeeze()
    speech = librosa.resample(np.asarray(speech), orig_sr=sr, target_sr=target_sampling_rate)
    return speech

def label_to_id(label, label_list):

    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1

    return label

def preprocess_function(examples):
    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
    target_list = [label_to_id(label, label_list) for label in examples[output_column]]

    result = processor(speech_list, sampling_rate=target_sampling_rate)
    result["labels"] = list(target_list)

    return result

In [11]:
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True,
    num_proc=4
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True,
    num_proc=4
)

Map (num_proc=4):   0%|          | 0/2304 [00:00<?, ? examples/s]

2024-08-21 01:04:35.992858: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-21 01:04:35.992858: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-21 01:04:35.992858: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-21 01:04:35.992858: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-21 01:04:35.992932: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory 

Map (num_proc=4):   0%|          | 0/576 [00:00<?, ? examples/s]

2024-08-21 01:05:13.116974: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-21 01:05:13.117052: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-21 01:05:13.120240: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-21 01:05:13.123084: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-21 01:05:13.123141: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

In [None]:
idx = 0
print(f"Training input_values: {train_dataset[idx]['input_values']}")
print(f"Training attention_mask: {train_dataset[idx]['attention_mask']}")
print(f"Training labels: {train_dataset[idx]['labels']} - {train_dataset[idx]['emotion']}")

In [13]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

MODEL********

In [14]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


In [15]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [16]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        d_type = torch.long if isinstance(label_features[0], int) else torch.float

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch

In [17]:
#MODEL
print("Training model...")
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
is_regression = False

Training model...


In [18]:
import numpy as np
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)

    if is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
    else:
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

In [19]:
# Loading pretrained checkpoint

model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_id,
    config=config,
)

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# This is setting the requires_grad=False for the feature extraction part as in the first few layers 
# of the model will not be finetuned because they are already good enough to understand audio signals

model.freeze_feature_extractor()

In [21]:
from transformers import TrainingArguments

model_path = os.path.join(OUT_DIR_PATH,"model")

training_args = TrainingArguments(
    output_dir=model_path,
    # output_dir="/content/gdrive/MyDrive/wav2vec2-xlsr-greek-speech-emotion-recognition"
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=1.0,
    fp16=True,
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-4,
    save_total_limit=2,
)

2024-08-21 01:05:25.442124: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-21 01:05:25.442183: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-21 01:05:25.443587: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [22]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
)


In [26]:
tokenizer_path = os.path.join(OUT_DIR_PATH,"tokenizer")
processor.feature_extractor.save_pretrained(tokenizer_path)

['/kaggle/working/tokenizer/preprocessor_config.json']

In [27]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
10,2.0932,2.057414,0.206597
20,2.0776,2.029046,0.203125
30,2.0172,2.034738,0.166667
40,1.9297,2.069156,0.203125
50,2.0734,1.908472,0.223958
60,1.8238,1.688324,0.407986
70,1.6871,1.583711,0.454861
80,1.4528,1.336201,0.512153
90,1.4886,1.436692,0.465278
100,1.4774,1.304058,0.536458


TrainOutput(global_step=288, training_loss=1.176349679629008, metrics={'train_runtime': 2437.6551, 'train_samples_per_second': 0.945, 'train_steps_per_second': 0.118, 'total_flos': 2.8421470544686848e+17, 'train_loss': 1.176349679629008, 'epoch': 1.0})

In [29]:
save_final_path = os.path.join(model_path,"finalModel")
save_final_path1 = os.path.join(model_path,"finalModelPretrained")
trainer.save_model(save_final_path)
# trainer.save_pretrained(save_final_path1)
trainer

<transformers.trainer.Trainer at 0x7a26389e22c0>

In [30]:
processor.feature_extractor.save_pretrained(save_final_path1)

['/kaggle/working/model/finalModelPretrained/preprocessor_config.json']

In [31]:
model.save_pretrained(save_final_path1)

**EVALUATION**

In [32]:
import librosa
from sklearn.metrics import classification_report
import os

In [33]:
OUT_DIR_PATH = "/kaggle/working/"
data_path = os.path.join(OUT_DIR_PATH,"data")

from datasets import load_dataset

test_path = os.path.join(data_path,"test.csv")
test_dataset = load_dataset("csv", data_files={"test": test_path}, delimiter="\t")["test"]
test_dataset

Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['name', 'path', 'emotion', 'actor'],
    num_rows: 576
})

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [40]:
# model_name_or_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
from tokenizers import Tokenizer

config = AutoConfig.from_pretrained(save_final_path)
# processor = AutoTokenizer.from_pretrained('/kaggle/working/model/finalModel/preprocessor_config.json')
processor = Wav2Vec2Processor.from_pretrained(model_id, )
model = Wav2Vec2ForSpeechClassification.from_pretrained(save_final_path).to(device)



In [41]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array[0].squeeze().numpy()
    speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=processor.feature_extractor.sampling_rate)
#     resampler = torchaudio.transforms.Resample(sampling_rate)
#     speech_array = resampler(speech_array).squeeze().numpy()
    
    batch["speech"] = speech_array
#     print(type(batch["speech"]))
    return batch

def predict(batch):
    features = processor(batch["speech"], sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt", padding=True)

    input_values = features.input_values.to(device)
    attention_mask = features.attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits 

    pred_ids = torch.argmax(logits, dim=-1).detach().cpu().numpy()
    batch["predicted"] = pred_ids
    return batch

In [42]:
test_dataset = test_dataset.map(speech_file_to_array_fn)

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

In [43]:
result = test_dataset.map(predict, batched=True, batch_size=8)



Map:   0%|          | 0/576 [00:00<?, ? examples/s]

In [44]:
label_names = [config.id2label[i] for i in range(config.num_labels)]
label_names

['Angry', 'Calm', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']

In [45]:
y_true = [config.label2id[name] for name in result["emotion"]]
y_pred = result["predicted"]

print(y_true[:5])
print(y_pred[:5])

[0, 2, 4, 0, 3]
[0, 2, 4, 0, 3]


In [46]:
print(classification_report(y_true, y_pred, target_names=label_names))

              precision    recall  f1-score   support

       Angry       0.92      0.92      0.92        76
        Calm       0.81      0.96      0.88        77
     Disgust       0.92      0.90      0.91        77
        Fear       0.87      0.96      0.91        77
       Happy       0.87      0.60      0.71        77
     Neutral       0.64      0.71      0.67        38
         Sad       0.93      0.74      0.83        77
    Surprise       0.81      0.97      0.88        77

    accuracy                           0.85       576
   macro avg       0.85      0.85      0.84       576
weighted avg       0.86      0.85      0.85       576



**PREDICTION**

In [47]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2Processor

import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd

sampling_rate= processor.feature_extractor.sampling_rate

In [48]:
def speech_file_to_array_fn(path, sampling_rate):
    speech_array, _sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech


def predict(path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    input_values = features.input_values.to(device)
    attention_mask = features.attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    outputs = [{"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
    return outputs


STYLES = """
<style>
div.display_data {
    margin: 0 auto;
    max-width: 500px;
}
table.xxx {
    margin: 50px !important;
    float: right !important;
    clear: both !important;
}
table.xxx td {
    min-width: 300px !important;
    text-align: center !important;
}
</style>
""".strip()

def prediction(df_row):
    path, emotion = df_row["path"], df_row["emotion"]
    df = pd.DataFrame([{"Emotion": emotion, "Sentence": "    "}])
    setup = {
        'border': 2,
        'show_dimensions': True,
        'justify': 'center',
        'classes': 'xxx',
        'escape': False,
    }
    ipd.display(ipd.HTML(STYLES + df.to_html(**setup) + "<br />"))
    speech, sr = torchaudio.load(path)
    speech = speech[0].numpy().squeeze()
    speech = librosa.resample(np.asarray(speech), orig_sr=sr, target_sr=sampling_rate)
    ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=sampling_rate))

    outputs = predict(path, sampling_rate)
    r = pd.DataFrame(outputs)
    ipd.display(ipd.HTML(STYLES + r.to_html(**setup) + "<br />"))

In [49]:
test = pd.read_csv(test_path, sep="\t")
test.head()

Unnamed: 0,name,path,emotion,actor
0,03-01-05-01-02-02-24,/kaggle/input/ravdess-emotional-speech-audio/A...,Angry,24
1,03-01-07-02-02-02-17,/kaggle/input/ravdess-emotional-speech-audio/a...,Disgust,17
2,03-01-03-01-01-02-07,/kaggle/input/ravdess-emotional-speech-audio/a...,Happy,7
3,03-01-05-01-02-01-08,/kaggle/input/ravdess-emotional-speech-audio/a...,Angry,8
4,03-01-06-02-02-02-11,/kaggle/input/ravdess-emotional-speech-audio/A...,Fear,11


In [50]:
prediction(test.iloc[0])

Unnamed: 0,Emotion,Sentence
0,Angry,


Unnamed: 0,Emotion,Score
0,Angry,89.5%
1,Calm,0.3%
2,Disgust,1.5%
3,Fear,0.7%
4,Happy,6.2%
5,Neutral,0.7%
6,Sad,0.2%
7,Surprise,0.7%


In [51]:
prediction(test.iloc[1])

Unnamed: 0,Emotion,Sentence
0,Disgust,


Unnamed: 0,Emotion,Score
0,Angry,2.4%
1,Calm,0.8%
2,Disgust,95.8%
3,Fear,0.0%
4,Happy,0.3%
5,Neutral,0.1%
6,Sad,0.5%
7,Surprise,0.1%


In [52]:
prediction(test.iloc[4])

Unnamed: 0,Emotion,Sentence
0,Fear,


Unnamed: 0,Emotion,Score
0,Angry,2.4%
1,Calm,0.1%
2,Disgust,0.2%
3,Fear,86.7%
4,Happy,6.8%
5,Neutral,0.2%
6,Sad,2.3%
7,Surprise,1.2%


In [62]:
model_name = 'Amulya/wav2vec2-xlsr-english-speech-emotion-recognition'
config_new = AutoConfig.from_pretrained(model_name)
# processor_new = Wav2Vec2Processor.from_pretrained(model_name)
model_new = Wav2Vec2ForSpeechClassification.from_pretrained(model_name).to(device)

model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

In [65]:
from transformers import Wav2Vec2FeatureExtractor

processor_new = Wav2Vec2FeatureExtractor.from_pretrained(model_name)