In [None]:
%%capture
%pip install librosa
%pip install jiwer
%pip install git+https://github.com/huggingface/datasets.git


In [None]:
%%capture
%pip install transformers[torch]
%pip install accelerate

In [None]:
import os
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Root directory that the os.walk loop below scans for audio files.
# NOTE(review): this path lives under /gdrive, but the only mount executed before
# this cell is drive.mount('/content/drive'); /gdrive is mounted much later in the
# notebook. os.walk() on a missing directory yields nothing, so on a fresh kernel
# the DataFrame built from it would silently be empty — confirm the mount point.
dataset_dir = "/gdrive/MyDrive/USC Course Work/csci 535/project/Audio_Speech_Actors_01-24"

In [None]:
# Filename example: 02-01-06-01-02-01-12.mp4

# Video-only (02)
# Speech (01)
# Fearful (06)
# Normal intensity (01)
# Statement "dogs" (02)
# 1st Repetition (01)
# 12th Actor (12)
# Female, as the actor ID number is even.

# Emotion codes (third field of the file name): 01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised.
emotion_dict = {'01': "neutral",
                '02': "calm",
                '03': "happy",
                '04': "sad",
                '05': "angry",
                '06': "fearful",
                '07': "disgust",
                '08': "surprised",
                }
def isEven(num):
  """Return True when ``num`` leaves no remainder on division by 2, else False."""
  return True if (num % 2) == 0 else False

emotion_list = []
emotion_intensity_list = []
actor_no_list = []
actor_gender_list = []
aud_path_list = []
skipped_file_list = []

from posixpath import dirname
# Walk the dataset tree and decode the metadata carried by each file name.
# Names are dash-separated numeric fields (see the example above): the third
# field is the emotion code, the fourth the intensity, the last the actor number.
for (dirpath, dirnames, filenames) in os.walk(dataset_dir):
  for name_file in filenames:
    split_list = name_file.split(".")[0].split("-")

    # Decode everything before appending, so a stray file (e.g. a hidden OS file
    # or a README) neither aborts the walk nor leaves the lists out of sync.
    try:
      emotion = emotion_dict[split_list[2]]
      intensity = int(split_list[3])
      actor_no = int(split_list[-1])
    except (IndexError, KeyError, ValueError):
      skipped_file_list.append(os.path.join(dirpath, name_file))
      continue

    emotion_list.append(emotion)
    emotion_intensity_list.append(intensity)
    actor_no_list.append(actor_no)
    # Even actor numbers are female, odd ones male.
    actor_gender_list.append("Female" if actor_no % 2 == 0 else "Male")
    aud_path_list.append(os.path.join(dirpath, name_file))

# Report skipped files instead of dropping them silently.
if skipped_file_list:
  print(f"Skipped {len(skipped_file_list)} file(s) whose names do not follow the naming scheme")


In [None]:
# One row per audio file: bare file name, full path and decoded emotion label.
file_names = [audio_path.rsplit("/", 1)[-1] for audio_path in aud_path_list]
df = pd.DataFrame(dict(name=file_names, path=aud_path_list, emotion=emotion_list))

In [None]:
df

Unnamed: 0,name,path,emotion
0,03-01-01-01-02-01-01.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,neutral
1,03-01-01-01-02-02-01.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,neutral
2,03-01-02-01-01-02-01.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,calm
3,03-01-01-01-01-02-01.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,neutral
4,03-01-02-01-02-01-01.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,calm
...,...,...,...
1435,03-01-08-02-01-02-24.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,surprised
1436,03-01-08-02-01-01-24.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,surprised
1437,03-01-08-01-02-01-24.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,surprised
1438,03-01-08-02-02-01-24.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,surprised


In [None]:
# Filter out rows whose audio file is missing on disk, then shuffle the rows.

print(f"Step 0: {len(df)}")

# Filter on the existence check itself. Dropping NaNs on "path" would never remove
# anything, because every row has a path string whether or not the file exists.
# astype(bool) keeps the mask boolean even when the frame is empty.
path_exists = df["path"].apply(os.path.exists).astype(bool)
df = df.loc[path_exists]
print(f"Step 1: {len(df)}")

# Shuffle with a fixed seed: the train/test split further down is seeded, but it
# is only reproducible if the row order it receives is reproducible too.
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)
df.head()

Step 0: 1440
Step 1: 1440


Unnamed: 0,name,path,emotion
0,03-01-01-01-01-01-06.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,neutral
1,03-01-04-01-01-01-08.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,sad
2,03-01-05-01-01-01-18.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,angry
3,03-01-05-01-01-02-06.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,angry
4,03-01-01-01-02-02-19.wav,/gdrive/MyDrive/USC Course Work/csci 535/proje...,neutral


In [None]:
df["path"][0]

'/gdrive/MyDrive/USC Course Work/csci 535/project/Audio_Speech_Actors_01-24/Actor_06/03-01-01-01-01-01-06.wav'

In [None]:
# Show the distinct labels, then the number of files per emotion.
print("Labels: ", df["emotion"].unique())
print()
df.groupby("emotion")[["path"]].count()

Labels:  ['neutral' 'sad' 'angry' 'disgust' 'calm' 'fearful' 'happy' 'surprised']



Unnamed: 0_level_0,path
emotion,Unnamed: 1_level_1
angry,192
calm,192
disgust,192
fearful,192
happy,192
neutral,96
sad,192
surprised,192


In [None]:
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

# Pick one random row to listen to (unseeded, so a different clip on every run).
idx = np.random.randint(0, len(df))
sample = df.iloc[idx]
path = sample["path"]
label = sample["emotion"]


print(f"ID Location: {idx}")
print(f"      Label: {label}")
print()

# Load the waveform together with the file's own sampling rate.
speech, sr = torchaudio.load(path)
# Keep only the first row of the loaded tensor — presumably the first audio
# channel; confirm for multi-channel files.
speech = speech[0].numpy().squeeze()
# Resample from the file's rate to 16 kHz, the rate passed to the player below.
speech = librosa.resample(np.asarray(speech), orig_sr= sr, target_sr= 16_000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

ID Location: 313
      Label: disgust



In [None]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys

In [None]:
# save_path = "/content/data"
save_path = "/gdrive/MyDrive/CS535_preprocessing/wav_csv"
# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs(save_path, exist_ok=True)

# 80/20 split, stratified on the emotion label so both parts keep the class balance.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["emotion"])

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Tab-separated files; the loading cell reads them back with delimiter="\t".
train_df.to_csv(f"{save_path}/train_ravdess.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test_ravdess.csv", sep="\t", encoding="utf-8", index=False)


print(train_df.shape)
print(test_df.shape)

(1152, 3)
(288, 3)


In [None]:
# Loading the created dataset using datasets
from datasets import load_dataset, load_metric

# NOTE(review): save_path is redefined here under /content/drive, whereas the split
# cell wrote its CSV files under /gdrive — confirm both mounts point to the same Drive.
save_path = "/content/drive/MyDrive/CS535_preprocessing/wav_csv"
# data_files = {
#     "train": "/content/data/train.csv",
#     "validation": "/content/data/test.csv",
# }
# NOTE(review): the active entries read train.csv / test.csv, not the
# train_ravdess.csv / test_ravdess.csv files written by the split cell (those
# entries are commented out) — confirm which dataset is meant to be loaded.
data_files = {
    # "train": os.path.join(save_path, "train_ravdess.csv"),
    "train": os.path.join(save_path, "train.csv"),
    # "validation": os.path.join(save_path, "test_ravdess.csv"),
    "validation": os.path.join(save_path, "test.csv"),
}

# The CSV files are tab-separated, hence the explicit delimiter.
dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 1672
})
Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 419
})


In [None]:
train_dataset = train_dataset.select(list(range(1000)))

In [None]:
print(type(train_dataset))
print(type(eval_dataset))

<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>


In [None]:
# We need to specify the input and output column
input_column = "path"
output_column = "emotion"

In [None]:
# load dataset
from datasets import load_from_disk
train_dataset = load_from_disk("/content/drive/MyDrive/CS535_preprocessing/wav_csv/ravdess_train")
eval_dataset = load_from_disk("/content/drive/MyDrive/CS535_preprocessing/wav_csv/ravdess_eval")

In [None]:
# Collect the distinct labels of the SER dataset; sorting keeps the
# label -> id mapping identical from run to run.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 8 classes: ['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised']


In [None]:
from transformers import AutoConfig, Wav2Vec2Processor

In [None]:
# model_name_or_path = "lighteternal/wav2vec2-large-xlsr-53-greek"
model_name_or_path = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
pooling_mode = "mean"

In [None]:
# Model config: the checkpoint's settings plus the label mappings and the
# pooling mode read by the custom classification model.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={name: index for index, name in enumerate(label_list)},
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
)
config.pooling_mode = pooling_mode

In [None]:
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


In [None]:
import librosa
def speech_file_to_array_fn(path):
    """Load an audio file as a mono waveform at the processor's sampling rate.

    Args:
        path: Path of the audio file to read.

    Returns:
        1-D float array sampled at ``target_sampling_rate``.
    """
    # Passing sr= makes librosa resample straight from the file's native rate.
    # Without it, librosa.load first converts everything to its 22,050 Hz default,
    # so every file would go through two lossy resampling passes.
    speech, _ = librosa.load(path, sr=target_sampling_rate)
    return speech

def label_to_id(label, label_list):
    """Map a label to its position in ``label_list``.

    Returns the index of ``label``, -1 when the label is not in a non-empty
    list, and the label itself unchanged when ``label_list`` is empty.
    """
    if not len(label_list):
        return label

    try:
        return label_list.index(label)
    except ValueError:
        return -1

def preprocess_function(examples):
    """Turn a batch of rows into processor features plus integer labels.

    Reads the audio paths from ``examples[input_column]`` and the label names
    from ``examples[output_column]``; returns the processor output with an
    added ``"labels"`` entry.
    """
    waveforms = []
    for audio_path in examples[input_column]:
        waveforms.append(speech_file_to_array_fn(audio_path))

    label_ids = [label_to_id(name, label_list) for name in examples[output_column]]

    batch = processor(waveforms, sampling_rate=target_sampling_rate)
    batch["labels"] = list(label_ids)
    return batch

In [None]:
# Extract features for both splits with identical settings.
map_options = dict(batched=True, batch_size=10, num_proc=4)
train_dataset = train_dataset.map(preprocess_function, **map_options)
eval_dataset = eval_dataset.map(preprocess_function, **map_options)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/419 [00:00<?, ? examples/s]

In [None]:
# Sanity-check one preprocessed example. The waveform has tens of thousands of
# samples, so only a short prefix is printed to keep the output readable.
idx = 0
example = train_dataset[idx]  # decode the row once rather than once per print
preview_len = 10
print(f"Training input_values (first {preview_len} of {len(example['input_values'])}): {example['input_values'][:preview_len]}")
print(f"Training attention_mask (first {preview_len}): {example['attention_mask'][:preview_len]}")
print(f"Training labels: {example['labels']} - {example['emotion']}")

Training input_values: [0.010326852090656757, 0.057940274477005005, 0.05968312546610832, 0.06341341882944107, 0.05729398876428604, 0.019035320729017258, 0.05394413694739342, 0.01563604362308979, 0.05146205052733421, 0.03857209533452988, -0.018696758896112442, 0.004191771615296602, -0.05246594920754433, -0.06705748289823532, -0.08463971316814423, -0.0624658428132534, -0.053845152258872986, -0.0048140487633645535, 0.02057669125497341, 0.019088732078671455, 0.08535435795783997, 0.027651919052004814, 0.04920604079961777, 0.053984276950359344, 0.06340887397527695, 0.08878695964813232, 0.03933245688676834, 0.012304781936109066, -0.0065348949283361435, 0.01778237521648407, 0.005363897420465946, 0.018341222777962685, 2.152684646716807e-05, 0.010029944591224194, 0.05101202055811882, 0.014824953861534595, 0.02118782326579094, 0.048021409660577774, 0.03736807778477669, 0.01687720976769924, -0.029318565502762794, -0.08722812682390213, -0.10578607767820358, -0.07765902578830719, -0.0845039561390876

In [None]:
train_dataset.save_to_disk("/gdrive/MyDrive/CS535_preprocessing/wav_csv/ravdess_train")
eval_dataset.save_to_disk("/gdrive/MyDrive/CS535_preprocessing/wav_csv/ravdess_eval")

Saving the dataset (0/2 shards):   0%|          | 0/1152 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/288 [00:00<?, ? examples/s]

In [None]:
train_dataset.save_to_disk("/content/drive/MyDrive/CS535_preprocessing/wav_csv/mosi_train")
eval_dataset.save_to_disk("/content/drive/MyDrive/CS535_preprocessing/wav_csv/mosi_eval")

Saving the dataset (0/2 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/419 [00:00<?, ? examples/s]

In [None]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# load dataset
from datasets import load_from_disk
train_dataset = load_from_disk("/gdrive/MyDrive/CS535_preprocessing/wav_csv/ravdess_train")
eval_dataset = load_from_disk("/gdrive/MyDrive/CS535_preprocessing/wav_csv/ravdess_eval")

In [None]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Output container returned by ``Wav2Vec2ForSpeechClassification.forward``.

    Attributes:
        loss: Training loss; stays ``None`` when no labels are passed to the model.
        logits: Raw scores produced by the classification head.
        hidden_states: Copied from the backbone output's ``hidden_states``.
        attentions: Copied from the backbone output's ``attentions``.
    """
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


In [None]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Classifier head: dropout -> linear -> tanh -> dropout -> linear projection to the labels."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Return one score per label for each feature vector in ``features``."""
        activated = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(activated))


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 backbone followed by pooling over dim 1 and a classification head.

    The pooling mode is read from ``config.pooling_mode`` and the number of
    output scores from ``config.num_labels``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # pooling_mode is not set by the config in this class; the caller must
        # attach it to the config before constructing the model.
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Freeze the backbone's feature extractor via its ``_freeze_parameters`` helper."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Pool ``hidden_states`` over dim 1 using ``mode`` ('mean', 'sum' or 'max').

        Raises:
            Exception: if ``mode`` is not one of the three supported values.
        """
        # NOTE(review): pooling runs over every position along dim 1; no attention
        # mask is consulted here, so padded positions (if any) are pooled too — confirm.
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            # torch.max over a dim returns (values, indices); keep the values.
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Run the backbone, pool its output, classify, and optionally compute a loss.

        Returns a ``SpeechClassifierOutput`` when ``return_dict`` is truthy,
        otherwise a tuple starting with the loss (if labels were given) followed
        by the logits and the backbone outputs from index 2 onward.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # First backbone output — assumed to be (batch, frames, hidden); TODO confirm.
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # Infer the problem type once from the label count and label dtype,
            # and cache it on the config for subsequent calls.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Tuple form: logits followed by the backbone outputs after the first two entries.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [None]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate already-extracted audio features into one padded batch with a label tensor.

    Fields:
        processor (Wav2Vec2Processor):
            Its ``pad`` method is used to pad the ``input_values`` of the batch.
        padding (bool or str, defaults to True):
            Forwarded unchanged to ``processor.pad`` as its ``padding`` argument.
        max_length (int, optional):
            Forwarded unchanged to ``processor.pad``.
        max_length_labels (int, optional):
            Accepted for interface compatibility; not read by ``__call__``.
        pad_to_multiple_of (int, optional):
            Forwarded unchanged to ``processor.pad``.
        pad_to_multiple_of_labels (int, optional):
            Accepted for interface compatibility; not read by ``__call__``.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Separate the audio inputs from the targets: only the former need padding.
        audio_inputs = []
        targets = []
        for item in features:
            audio_inputs.append({"input_values": item["input_values"]})
            targets.append(item["labels"])

        # Python-int targets become a long tensor; anything else becomes float.
        label_dtype = torch.float
        if isinstance(targets[0], int):
            label_dtype = torch.long

        batch = self.processor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch["labels"] = torch.tensor(targets, dtype=label_dtype)

        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
is_regression = False

In [None]:
import numpy as np
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Compute the evaluation metric from a prediction/label pair.

    Returns ``{"mse": ...}`` when the module-level ``is_regression`` flag is
    set, otherwise ``{"accuracy": ...}`` based on the argmax over axis 1.
    """
    raw = p.predictions
    if isinstance(raw, tuple):
        # Only the first element of a tuple of predictions is scored.
        raw = raw[0]

    if is_regression:
        preds = np.squeeze(raw)
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}

    preds = np.argmax(raw, axis=1)
    return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

In [None]:
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.freeze_feature_extractor()

In [None]:
from transformers import TrainingArguments

# training_args = TrainingArguments(
#     output_dir="/content/wav2vec2-xlsr-ravedess-speech-emotion-recognition",
#     # output_dir="/content/gdrive/MyDrive/wav2vec2-xlsr-greek-speech-emotion-recognition"
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     gradient_accumulation_steps=2,
#     evaluation_strategy="steps",
#     num_train_epochs=1.0,
#     fp16= True,
#     # fp16= False,
#     save_steps=10,
#     eval_steps=10,
#     logging_steps=10,
#     learning_rate=1e-4,
#     save_total_limit=2,
# )

# Training configuration passed to the Trainer below.
# NOTE(review): output_dir is named "...-mosi-..." although the datasets loaded
# for training are the ravdess_* ones, and the evaluation cell later loads a
# checkpoint from a different directory under /content — confirm the intended path.
training_args = TrainingArguments(
    # output_dir="/content/wav2vec2-xlsr-mosi-speech-emotion-recognition",
    output_dir="/content/drive/MyDrive/CS535_preprocessing/wav_csv/wav2vec2-xlsr-mosi-speech-emotion-recognition",
    # output_dir="/content/gdrive/MyDrive/wav2vec2-xlsr-greek-speech-emotion-recognition"
    per_device_train_batch_size= 4,
    per_device_eval_batch_size= 4,
    seed= 42,
    # adam_beta1= 0.9,
    # adam_beta2= 0.999,
    # adam_epsilon= 1e-08,
    # With a per-device batch of 4, accumulating 2 steps gives 8 samples per optimizer update.
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=3.0,
    fp16= True,
    # fp16= False,
    # Checkpoint and evaluate every 50 steps; save_total_limit caps how many checkpoints are kept.
    save_steps=50,
    eval_steps=50,
    logging_steps=10,
    learning_rate=0.0001,
    save_total_limit=2,
)



In [None]:
!rm -rf /content/wav2vec2-xlsr-ravedess-speech-emotion-recognition/

In [None]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    """Trainer whose training step always runs the forward pass under autocast."""

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.

        Return:
            :obj:`torch.Tensor`: The tensor with training loss on this batch.
        """

        model.train()
        inputs = self._prepare_inputs(inputs)
        # Mixed precision is forced on here, so the non-AMP branches below are
        # kept only as fallbacks and are not reached while this line is present.
        self.use_amp = True
        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        # Scale the loss so that gradients accumulated over several steps average out.
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.use_amp:
            # Backpropagate through the gradient scaler to avoid fp16 underflow.
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()


In [None]:
# trainer = CTCTrainer(
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
)

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
50,1.875,1.803257,0.340278
100,1.4232,1.433918,0.451389
150,1.3051,1.007143,0.628472
200,0.8181,0.976267,0.645833
250,0.8353,0.76653,0.722222
300,0.4441,0.607779,0.795139
350,0.5616,0.695995,0.743056
400,0.5488,0.521315,0.819444


TrainOutput(global_step=432, training_loss=0.9899182981914945, metrics={'train_runtime': 587.8363, 'train_samples_per_second': 5.879, 'train_steps_per_second': 0.735, 'total_flos': 4.282845847275744e+17, 'train_loss': 0.9899182981914945, 'epoch': 3.0})

In [None]:
trainer.save_model("/content/bruh")

In [None]:
# Evaluation

In [None]:
import librosa
from sklearn.metrics import classification_report

In [None]:
# test_dataset = load_dataset("csv", data_files={"test": "/content/data/test.csv"}, delimiter="\t")["test"]
test_dataset = load_from_disk("/gdrive/MyDrive/CS535_preprocessing/wav_csv/ravdess_eval")
test_dataset

Dataset({
    features: ['name', 'path', 'emotion', 'input_values', 'attention_mask', 'labels'],
    num_rows: 288
})

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [None]:
# model_name_or_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
# The processor comes from the original pretrained checkpoint; the model weights
# come from a fine-tuned checkpoint directory.
og_model_name_or_path = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
# model_name_or_path = "/content/wav2vec2-xlsr-ravedess-speech-emotion-recognition"
# NOTE(review): this directory is the one an earlier cell removes with `rm -rf`,
# and it differs from the output_dir given to TrainingArguments — confirm that
# checkpoint-400 actually exists here on a fresh run.
model_name_or_path = "/content/wav2vec2-xlsr-ravedess-speech-emotion-recognition/checkpoint-400"


# og_model_name_or_path = model_name_or_path = "/content/bruh"
# config = AutoConfig.from_pretrained(model_name_or_path)

# Rebuild the config with the label mappings. It is not passed to the model
# loader below; later cells read config.id2label / config.label2id from it.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', pooling_mode)

# config = config
processor = Wav2Vec2Processor.from_pretrained(og_model_name_or_path)
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)

In [None]:
import torchaudio
def speech_file_to_array_fn(batch):
    """Load the audio referenced by ``batch["path"]`` and store it in ``batch["speech"]``.

    Args:
        batch: A dataset row containing a ``"path"`` entry.

    Returns:
        The same row with a ``"speech"`` entry holding the mono waveform
        sampled at the processor's sampling rate.
    """
    target_sr = processor.feature_extractor.sampling_rate
    # Passing sr= makes librosa resample straight from the file's native rate.
    # Without it, librosa.load first converts to its 22,050 Hz default, so the
    # audio would be resampled twice.
    speech_array, _ = librosa.load(batch["path"], sr=target_sr)

    batch["speech"] = speech_array
    return batch


def predict(batch):
    """Classify the waveforms in ``batch["speech"]`` and store the class ids in ``batch["predicted"]``."""
    sampling_rate = processor.feature_extractor.sampling_rate
    encoded = processor(batch["speech"], sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    model_inputs = encoded.input_values.to(device)
    model_mask = encoded.attention_mask.to(device)

    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        outputs = model(model_inputs, attention_mask=model_mask)

    batch["predicted"] = torch.argmax(outputs.logits, dim=-1).detach().cpu().numpy()
    return batch

In [None]:
test_dataset = test_dataset.map(speech_file_to_array_fn)


In [None]:
result = test_dataset.map(predict, batched=True, batch_size=8)

Map:   0%|          | 0/288 [00:00<?, ? examples/s]

In [None]:
label_names = [config.id2label[i] for i in range(config.num_labels)]
label_names

['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised']

In [None]:
y_true = [config.label2id[name] for name in result["emotion"]]
y_pred = result["predicted"]

print(y_true[:5])
print(y_pred[:5])

[6, 0, 4, 2, 3]
[6, 0, 7, 2, 3]


In [None]:
print(classification_report(y_true, y_pred, target_names=label_names))

              precision    recall  f1-score   support

       angry       0.94      0.87      0.90        38
        calm       0.82      0.95      0.88        39
     disgust       0.83      0.92      0.88        38
     fearful       0.97      0.79      0.87        39
       happy       0.77      0.71      0.74        38
     neutral       0.70      0.84      0.76        19
         sad       0.82      0.74      0.78        38
   surprised       0.83      0.90      0.86        39

    accuracy                           0.84       288
   macro avg       0.84      0.84      0.83       288
weighted avg       0.85      0.84      0.84       288

