# Fine-tune HuBERT for Emotion Recognition on nEMO
This notebook uses Hugging Face Transformers to fine-tune a HuBERT model for speech emotion recognition on the nEMO dataset.

In [4]:
import numpy as np
import torch
from datasets import load_dataset, Audio
from transformers import (
    Wav2Vec2FeatureExtractor,
    HubertForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import evaluate
from sklearn.metrics import accuracy_score, f1_score

In [5]:
# --- Model / data locations ---
model_name_or_path = 'facebook/hubert-base-ls960'  # pretrained checkpoint to fine-tune
dataset_name = 'amu-cai/nEMO'                      # dataset identifier passed to load_dataset
output_dir = './hubert-nemo-emotion'               # checkpoints and the final model are written here

# --- Optimisation hyperparameters ---
learning_rate = 3e-5
num_train_epochs = 5
per_device_train_batch_size = per_device_eval_batch_size = 8

# --- Evaluation / checkpoint cadence (in training steps) ---
# Kept equal so that every evaluated step also has a saved checkpoint.
eval_steps = save_steps = 200

In [6]:
def prepare_dataset(batch):
    """Convert one dataset row into HuBERT model inputs.

    Despite the parameter name, this function is applied per example
    (``Dataset.map`` is called without ``batched=True``), so ``batch`` is a
    single row.

    Relies on the notebook-level ``feature_extractor`` and ``label2id``; both
    are resolved at call time, so they must be defined before ``map`` runs.

    Args:
        batch: A row with a decoded ``audio`` entry (providing ``array`` and
            ``sampling_rate``) and an ``emotion`` entry.

    Returns:
        The same row with ``input_values``, ``attention_mask`` and ``labels``
        added.

    Raises:
        KeyError: If the row's ``emotion`` is not a key of ``label2id``.
    """
    audio = batch['audio']
    features = feature_extractor(
        audio['array'],
        # Pass the rate the audio was actually decoded at instead of a
        # hard-coded constant, so the extractor's own sampling-rate check can
        # catch a mismatch rather than silently processing mis-sampled audio.
        sampling_rate=audio['sampling_rate'],
        return_attention_mask=True,
        # NumPy output avoids a needless torch tensor round-trip; the values
        # stored below are the same plain Python lists as before.
        return_tensors='np',
    )
    # The extractor returns a leading batch axis of size 1; drop it.
    batch['input_values'] = features.input_values[0].tolist()
    batch['attention_mask'] = features.attention_mask[0].tolist()
    batch['labels'] = label2id[batch['emotion']]
    return batch

In [7]:
def compute_metrics(pred):
    """Score an evaluation pass with accuracy and macro-averaged F1.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference class ids).

    Returns:
        A dict with the keys ``accuracy`` and ``f1_macro``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
    }

In [8]:
# Load the 'train' split, then carve out a reproducible 10% hold-out set
# (fixed seed so the partition is the same on every run).
raw = load_dataset(dataset_name, split='train')
ds = raw.train_test_split(test_size=0.1, seed=42)

# Declare the audio column at 16 kHz on both halves, matching the sampling
# rate the feature extractor is configured with.
train_ds, eval_ds = (
    ds[part].cast_column('audio', Audio(sampling_rate=16_000))
    for part in ('train', 'test')
)

In [9]:

# Build the label vocabulary from the full dataset rather than only the
# training split: an emotion that happened to land exclusively in the hold-out
# split would otherwise be missing from label2id and raise a KeyError in
# prepare_dataset. Sorting makes the label -> id assignment deterministic.
unique_emotions = sorted(set(raw['emotion']))
label2id = {emo: i for i, emo in enumerate(unique_emotions)}
id2label = {i: emo for emo, i in label2id.items()}

# Feature extractor for the checkpoint, set to 16 kHz input and to return an
# attention mask alongside the input values.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    model_name_or_path,
    sampling_rate=16_000,
    return_attention_mask=True,
)

# Pretrained HuBERT with a sequence-classification head sized to the label
# set; the label maps are stored in the model config.
model = HubertForSequenceClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(unique_emotions),
    label2id=label2id,
    id2label=id2label,
    problem_type='single_label_classification',
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Run prepare_dataset over both splits with 4 worker processes. Each split's
# original columns are removed, so only the fields written by prepare_dataset
# remain afterwards.
train_ds, eval_ds = (
    split.map(
        prepare_dataset,
        remove_columns=split.column_names,
        num_proc=4,
    )
    for split in (train_ds, eval_ds)
)

In [11]:
# Collator that pads the examples of each batch via the feature extractor.
data_collator = DataCollatorWithPadding(feature_extractor, padding=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    # Optimisation
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
    # Evaluation, logging and checkpointing cadence
    eval_strategy='steps',
    eval_steps=eval_steps,
    save_steps=save_steps,
    save_total_limit=2,
    logging_steps=100,
    # Model selection: reload the checkpoint with the highest macro F1
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
)

In [None]:
# Wire the model, data, collator and metrics into the Trainer.
# The feature extractor is passed as `processing_class`: the older `tokenizer`
# keyword is deprecated in Trainer.__init__ and emits a warning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    processing_class=feature_extractor,
    compute_metrics=compute_metrics,
)

# Fine-tune, then write the resulting model to output_dir.
trainer.train()
trainer.save_model(output_dir)

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33madasta146[0m ([33madasta146-agh-university-of-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1 Macro
200,1.4524,1.367296,0.463252,0.368158
400,1.0516,0.89752,0.668151,0.634232
600,0.7901,0.732244,0.759465,0.734412
800,0.5977,0.567662,0.786192,0.772072
1000,0.4671,0.614862,0.81069,0.799238
1200,0.3831,0.432717,0.88196,0.877537
1400,0.2921,0.331783,0.917595,0.914685
1600,0.2653,0.610934,0.868597,0.854757
1800,0.2583,0.333507,0.926503,0.923422


[34m[1mwandb[0m: Adding directory to artifact (./hubert-nemo-emotion/checkpoint-200)... Done. 4.7s
[34m[1mwandb[0m: Adding directory to artifact (./hubert-nemo-emotion/checkpoint-400)... Done. 3.8s
[34m[1mwandb[0m: Adding directory to artifact (./hubert-nemo-emotion/checkpoint-600)... Done. 13.7s
[34m[1mwandb[0m: Adding directory to artifact (./hubert-nemo-emotion/checkpoint-800)... Done. 15.6s
[34m[1mwandb[0m: Adding directory to artifact (./hubert-nemo-emotion/checkpoint-1000)... Done. 12.1s
[34m[1mwandb[0m: Adding directory to artifact (./hubert-nemo-emotion/checkpoint-1200)... Done. 9.6s
[34m[1mwandb[0m: Adding directory to artifact (./hubert-nemo-emotion/checkpoint-1400)... Done. 15.7s
[34m[1mwandb[0m: Adding directory to artifact (./hubert-nemo-emotion/checkpoint-1600)... Done. 22.9s
[34m[1mwandb[0m: Adding directory to artifact (./hubert-nemo-emotion/checkpoint-1800)... 