In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import sys
import gc

In [2]:
train_data = []

for path in tqdm(Path("data/combined_normalized_16khz/").glob("**/*.wav")):
    name = str(path).split(os.path.sep)[-1].split('.')[0]
    label = str(path).split(os.path.sep)[-2]
    
    try:
        # There are some broken files
        s = torchaudio.load(path)
        train_data.append({
            "name": name,
            "path": path,
            "emotion": label
        })
    except Exception as e:
        print(str(path), e)
        pass

    # break

961it [00:00, 2900.68it/s]


In [3]:
val_data = []

for path in tqdm(Path("data/TIL_ans/").glob("**/*.wav")):
    name = str(path).split(os.path.sep)[-1].split('.')[0]
    label = str(path).split(os.path.sep)[-2]

    try:
        # There are some broken files
        s = torchaudio.load(path)
        val_data.append({
            "name": name,
            "path": path,
            "emotion": label
        })
    except Exception as e:
        print(str(path), e)
        pass

    # break

600it [00:00, 2133.29it/s]


In [4]:
train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)

In [5]:
# Filter broken and non-existed paths

print(f"Step 0: {len(train_df)}")

train_df["status"] = train_df["path"].apply(lambda path: True if os.path.exists(path) else None)
train_df = train_df.dropna(subset=["path"])
train_df = train_df.drop("status", 1)
print(f"Step 1: {len(train_df)}")

train_df = train_df.sample(frac=1)
train_df = train_df.reset_index(drop=True)
train_df.head()

print(f"Step 0: {len(val_df)}")

val_df["status"] = val_df["path"].apply(lambda path: True if os.path.exists(path) else None)
val_df = val_df.dropna(subset=["path"])
val_df = val_df.drop("status", 1)
print(f"Step 1: {len(val_df)}")

val_df = val_df.sample(frac=1)
val_df = val_df.reset_index(drop=True)
val_df.head()

Step 0: 961
Step 1: 961
Step 0: 600
Step 1: 600


  train_df = train_df.drop("status", 1)
  val_df = val_df.drop("status", 1)


Unnamed: 0,name,path,emotion
0,807b4e2e04,data\TIL_ans\sad\807b4e2e04.wav,sad
1,14636ee568,data\TIL_ans\neutral\14636ee568.wav,neutral
2,8f297e8c60,data\TIL_ans\angry\8f297e8c60.wav,angry
3,bb361bb999,data\TIL_ans\fear\bb361bb999.wav,fear
4,8551c019da,data\TIL_ans\angry\8551c019da.wav,angry


For training purposes, we need to split data into train test sets; in this specific example, we break with a `20%` rate for the test set.

In [6]:
save_path = "data/wav2vec2temp"

# train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=101, stratify=train_df["emotion"])

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
val_df.to_csv(f"{save_path}/valid.csv", sep="\t", encoding="utf-8", index=False)

print(train_df.shape)
print(val_df.shape)

(961, 3)
(600, 3)


## Prepare Data for Training

In [7]:
# Loading the created dataset using datasets
from datasets import load_dataset, load_metric


data_files = {
    "train": os.path.join(save_path, 'train.csv'),
    "validation": os.path.join(save_path, 'valid.csv'),
}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

print(train_dataset)
print(eval_dataset)

Using custom data configuration default-b2a2c85353c6b66f


Downloading and preparing dataset csv/default to C:\Users\alien\.cache\huggingface\datasets\csv\default-b2a2c85353c6b66f\0.0.0\51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to C:\Users\alien\.cache\huggingface\datasets\csv\default-b2a2c85353c6b66f\0.0.0\51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 961
})
Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 600
})


In [8]:
# We need to specify the input and output column
input_column = "path"
output_column = "emotion"
# we need to distinguish the unique labels in our SER dataset
label_list = train_dataset.unique(output_column)
label_list.sort()  # Let's sort it for determinism
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 5 classes: ['angry', 'fear', 'happy', 'neutral', 'sad']


In [9]:
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor

model_name_or_path = "wav2vec2-large-xlsr-53/"
pooling_mode = "mean"
# config
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', pooling_mode)


processor = Wav2Vec2FeatureExtractor.from_pretrained('wav2vec2-large-xlsr-53/')
target_sampling_rate = processor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


# Preprocess Data

So far, we downloaded, loaded, and split the SER dataset into train and test sets. The instantiated our strategy configuration for using context representations in our classification problem SER. Now, we need to extract features from the audio path in context representation tensors and feed them into our classification model to determine the emotion in the speech.

Since the audio file is saved in the `.wav` format, it is easy to use **[Librosa](https://librosa.org/doc/latest/index.html)** or others, but we suppose that the format may be in the `.mp3` format in case of generality. We found that the **[Torchaudio](https://pytorch.org/audio/stable/index.html)** library works best for reading in `.mp3` data.

An audio file usually stores both its values and the sampling rate with which the speech signal was digitalized. We want to store both in the dataset and write a **map(...)** function accordingly. Also, we need to handle the string labels into integers for our specific classification task in this case, the **single-label classification** you may want to use for your **regression** or even **multi-label classification**.

In [10]:
def speech_file_to_array_fn(path):
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def label_to_id(label, label_list):

    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1

    return label

def preprocess_function(examples):
    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
    target_list = [label_to_id(label, label_list) for label in examples[output_column]]

    result = processor(speech_list, sampling_rate=target_sampling_rate)
    result["labels"] = list(target_list)

    return result


train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=128,
    batched=True,
    num_proc=1
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=128,
    batched=True,
    num_proc=1
)



  0%|          | 0/8 [00:00<?, ?ba/s]

  tensor = as_tensor(value)


  0%|          | 0/5 [00:00<?, ?ba/s]

In [11]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


In [12]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


## Training

The data is processed so that we are ready to start setting up the training pipeline. We will make use of 🤗's [Trainer](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer) for which we essentially need to do the following:

- Define a data collator. In contrast to most NLP models, XLSR-Wav2Vec2 has a much larger input length than output length. *E.g.*, a sample of input length 50000 has an output length of no more than 100. Given the large input sizes, it is much more efficient to pad the training batches dynamically meaning that all training samples should only be padded to the longest sample in their batch and not the overall longest sample. Therefore, fine-tuning XLSR-Wav2Vec2 requires a special padding data collator, which we will define below

- Evaluation metric. During training, the model should be evaluated on the word error rate. We should define a `compute_metrics` function accordingly

- Load a pretrained checkpoint. We need to load a pretrained checkpoint and configure it correctly for training.

- Define the training configuration.

After having fine-tuned the model, we will correctly evaluate it on the test data and verify that it has indeed learned to correctly transcribe speech.

### Set-up Trainer

Let's start by defining the data collator. The code for the data collator was copied from [this example](https://github.com/huggingface/transformers/blob/9a06b6b11bdfc42eea08fa91d0c737d1863c99e3/examples/research_projects/wav2vec2/run_asr.py#L81).

Without going into too many details, in contrast to the common data collators, this data collator treats the `input_values` and `labels` differently and thus applies to separate padding functions on them (again making use of XLSR-Wav2Vec2's context manager). This is necessary because in speech input and output are of different modalities meaning that they should not be treated by the same padding function.
Analogous to the common data collators, the padding tokens in the labels with `-100` so that those tokens are **not** taken into account when computing the loss.

In [13]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        d_type = torch.long if isinstance(label_features[0], int) else torch.float

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch

In [14]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

Next, the evaluation metric is defined. There are many pre-defined metrics for classification/regression problems, but in this case, we would continue with just **Accuracy** for classification and **MSE** for regression. You can define other metrics on your own.

In [15]:
is_regression = False

In [16]:
import numpy as np
from transformers import EvalPrediction
from sklearn.metrics import f1_score

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
    if is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
    else:
        result =  {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item(),
                "f1score": f1_score(p.label_ids,preds,average = 'weighted')}#f1_score(y_true, y_pred)
        print(result)
        return result

Now, we can load the pretrained XLSR-Wav2Vec2 checkpoint into our classification model with a pooling strategy.

In [17]:
model_name_or_path = "wav2vec2-large-xlsr-53/"
model_name_or_path = "models/10epoch finetune on egg 7028/" #Continue training

In [18]:
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf",
    final_dropout=0.2,
)
setattr(config, 'pooling_mode', pooling_mode)

In [19]:
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

The first component of XLSR-Wav2Vec2 consists of a stack of CNN layers that are used to extract acoustically meaningful - but contextually independent - features from the raw speech signal. This part of the model has already been sufficiently trained during pretraining and as stated in the [paper](https://arxiv.org/pdf/2006.13979.pdf) does not need to be fine-tuned anymore. 
Thus, we can set the `requires_grad` to `False` for all parameters of the *feature extraction* part.

In [20]:
model.freeze_feature_extractor()

In [None]:
# from google.colab import drive

# drive.mount('/gdrive')

In [None]:
#!mv "/content/gdrive/MyDrive/2nd-epoch-jonatasgrosman-wav2vec2-large-xlsr-53" "/content/drive/MyDrive/2nd-epoch-jonatasgrosman-wav2vec2-large-xlsr-53"

In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="models/wav2vec2",
    overwrite_output_dir =True,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=4, #Originally 4
    gradient_accumulation_steps=3,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    num_train_epochs=10,
    fp16=True,
    save_steps=1,
    eval_steps=1,
    logging_steps=10,
    learning_rate=2e-5, # 1e-4 for 10 then 2e-5
    # lr_scheduler_type = 'cosine',
    save_total_limit=2,
    load_best_model_at_end =True,
    metric_for_best_model ='f1score',
    greater_is_better =True,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


For future use we can create our training script, we do it in a simple way. You can add more on you own.

In [22]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.

        Return:
            :obj:`torch.Tensor`: The tensor with training loss on this batch.
        """

        model.train()
        inputs = self._prepare_inputs(inputs)

        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.use_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()


Now, all instances can be passed to Trainer and we are ready to start training!

In [23]:
# opt = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
#opt = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='max', factor=0.1, patience=10, verbose=True)
# Need convert above to lamdba lr
trainer = CTCTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
    #optimizers=opt
)


Using amp half precision backend


### Training

Training will take between 10 and 60 minutes depending on the GPU allocated to this notebook. 

In case you want to use this google colab to fine-tune your model, you should make sure that your training doesn't stop due to inactivity. A simple hack to prevent this is to paste the following code into the console of this tab (right mouse click -> inspect -> Console tab and insert code).

```javascript
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
setInterval(ConnectButton,60000);
```

In [24]:
gc.collect()

99

In [25]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 961
  Num Epochs = 10
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 18
  Gradient Accumulation steps = 3
  Total optimization steps = 530


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4


{'accuracy': 0.7250000238418579, 'f1score': 0.7235702412373151}


Saving model checkpoint to models/wav2vec2\checkpoint-53
Configuration saved in models/wav2vec2\checkpoint-53\config.json
Model weights saved in models/wav2vec2\checkpoint-53\pytorch_model.bin
Feature extractor saved in models/wav2vec2\checkpoint-53\preprocessor_config.json
Deleting older checkpoint [models\wav2vec2\checkpoint-265] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4


{'accuracy': 0.7049999833106995, 'f1score': 0.7032682826341176}


Saving model checkpoint to models/wav2vec2\checkpoint-106
Configuration saved in models/wav2vec2\checkpoint-106\config.json
Model weights saved in models/wav2vec2\checkpoint-106\pytorch_model.bin
Feature extractor saved in models/wav2vec2\checkpoint-106\preprocessor_config.json
Deleting older checkpoint [models\wav2vec2\checkpoint-530] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4


{'accuracy': 0.7166666388511658, 'f1score': 0.7143317032672644}


Saving model checkpoint to models/wav2vec2\checkpoint-159
Configuration saved in models/wav2vec2\checkpoint-159\config.json
Model weights saved in models/wav2vec2\checkpoint-159\pytorch_model.bin
Feature extractor saved in models/wav2vec2\checkpoint-159\preprocessor_config.json
Deleting older checkpoint [models\wav2vec2\checkpoint-106] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4


{'accuracy': 0.721666693687439, 'f1score': 0.7201835133420705}


Saving model checkpoint to models/wav2vec2\checkpoint-212
Configuration saved in models/wav2vec2\checkpoint-212\config.json
Model weights saved in models/wav2vec2\checkpoint-212\pytorch_model.bin
Feature extractor saved in models/wav2vec2\checkpoint-212\preprocessor_config.json
Deleting older checkpoint [models\wav2vec2\checkpoint-159] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4


{'accuracy': 0.7133333086967468, 'f1score': 0.7112511737980914}


Saving model checkpoint to models/wav2vec2\checkpoint-265
Configuration saved in models/wav2vec2\checkpoint-265\config.json
Model weights saved in models/wav2vec2\checkpoint-265\pytorch_model.bin
Feature extractor saved in models/wav2vec2\checkpoint-265\preprocessor_config.json
Deleting older checkpoint [models\wav2vec2\checkpoint-212] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4


{'accuracy': 0.70333331823349, 'f1score': 0.7005090698854535}


Saving model checkpoint to models/wav2vec2\checkpoint-318
Configuration saved in models/wav2vec2\checkpoint-318\config.json
Model weights saved in models/wav2vec2\checkpoint-318\pytorch_model.bin
Feature extractor saved in models/wav2vec2\checkpoint-318\preprocessor_config.json
Deleting older checkpoint [models\wav2vec2\checkpoint-265] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4


{'accuracy': 0.7200000286102295, 'f1score': 0.7176406998152753}


Saving model checkpoint to models/wav2vec2\checkpoint-371
Configuration saved in models/wav2vec2\checkpoint-371\config.json
Model weights saved in models/wav2vec2\checkpoint-371\pytorch_model.bin
Feature extractor saved in models/wav2vec2\checkpoint-371\preprocessor_config.json
Deleting older checkpoint [models\wav2vec2\checkpoint-318] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4


{'accuracy': 0.7049999833106995, 'f1score': 0.7019336002509569}


Saving model checkpoint to models/wav2vec2\checkpoint-424
Configuration saved in models/wav2vec2\checkpoint-424\config.json
Model weights saved in models/wav2vec2\checkpoint-424\pytorch_model.bin
Feature extractor saved in models/wav2vec2\checkpoint-424\preprocessor_config.json
Deleting older checkpoint [models\wav2vec2\checkpoint-371] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4


{'accuracy': 0.7099999785423279, 'f1score': 0.7075555280206409}


Saving model checkpoint to models/wav2vec2\checkpoint-477
Configuration saved in models/wav2vec2\checkpoint-477\config.json
Model weights saved in models/wav2vec2\checkpoint-477\pytorch_model.bin
Feature extractor saved in models/wav2vec2\checkpoint-477\preprocessor_config.json
Deleting older checkpoint [models\wav2vec2\checkpoint-424] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: emotion, path, name. If emotion, path, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4


{'accuracy': 0.7016666531562805, 'f1score': 0.698958182395587}


Saving model checkpoint to models/wav2vec2\checkpoint-530
Configuration saved in models/wav2vec2\checkpoint-530\config.json
Model weights saved in models/wav2vec2\checkpoint-530\pytorch_model.bin
Feature extractor saved in models/wav2vec2\checkpoint-530\preprocessor_config.json
Deleting older checkpoint [models\wav2vec2\checkpoint-477] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from models/wav2vec2\checkpoint-53 (score: 0.7235702412373151).


TrainOutput(global_step=530, training_loss=0.03381567717231107, metrics={'train_runtime': 815.7956, 'train_samples_per_second': 11.78, 'train_steps_per_second': 0.65, 'total_flos': 9.953736170101179e+17, 'train_loss': 0.03381567717231107, 'epoch': 9.99})

In [27]:
import gc
gc.collect()
torch.cuda.empty_cache()

# Prediction

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor

import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd

In [27]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_name_or_path = 'models/16_epoch_9799'
# config = AutoConfig.from_pretrained(model_name_or_path)
processor = Wav2Vec2FeatureExtractor.from_pretrained("wav2vec2-large-xlsr-53")
sampling_rate = processor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained('models/20epoch finetuned on egg 6989').to(device)
#model2 = Wav2Vec2ForSpeechClassification.from_pretrained('models/21epoch combined 7005 on val real7115').to(device)

loading feature extractor configuration file wav2vec2-large-xlsr-53\preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

loading configuration file models/20epoch finetuned on egg 6989\config.json
Model config Wav2Vec2Config {
  "_name_or_path": "models/10epoch finetune on egg 7028/",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSpeechClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    

In [36]:
sampling_rate = 16000
device = 'cuda'

In [37]:
def speech_file_to_array_fn(path, sampling_rate):
    speech_array, _sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech


def predict(model, path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    input_values = features.input_values.to(device)
    attention_mask = features.attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    #outputs = [{"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
    outputs = scores
    return outputs


STYLES = """
<style>
div.display_data {
    margin: 0 auto;
    max-width: 500px;
}
table.xxx {
    margin: 50px !important;
    float: right !important;
    clear: both !important;
}
table.xxx td {
    min-width: 300px !important;
    text-align: center !important;
}
</style>
""".strip()

def prediction(model, df_row):
    path = df_row["path"]
    outputs = predict(model, path, sampling_rate)
    return outputs


In [38]:
import os
fns = os.listdir("data/TIL_test_normalized/")
fns.sort()
test = pd.DataFrame()
test['name'] = [x.split('.')[0] for x in fns]
test['path'] = ["data/TIL_test_normalized/"+x for x in fns]
test['emotion'] = ['unknown' for x in fns]
emos = ['angry', 'fear', 'happy', 'neutral', 'sad']

In [39]:
from tqdm import trange
pred1 = np.array([prediction(model, test.iloc[i]) for i in trange(len(test))])
# pred2 = np.array([prediction(model2, test.iloc[i]) for i in trange(len(test))])

100%|██████████| 600/600 [00:16<00:00, 37.31it/s]


In [40]:
#pred = 0.5 * pred1 + 0.5 * pred2
pred = pred1
test['emotion'] = [emos[np.argmax(pred[i])] for i in trange(len(pred))]

100%|██████████| 600/600 [00:00<00:00, 599328.98it/s]


In [41]:
test['filename'] = test['name']+'.wav'

In [42]:
test[['filename','emotion']].to_csv("wav2vec2 20epoch finetuned on egg 6989.csv", index=False, header=False)