# Fine-tune HuBERT for Emotion Recognition on nEMO
This notebook uses Hugging Face Transformers to fine-tune a HuBERT model for speech emotion recognition on the nEMO dataset.

In [1]:
import numpy as np
import torch
from datasets import load_dataset, Audio
from transformers import (
    Wav2Vec2FeatureExtractor,
    HubertForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import evaluate
from sklearn.metrics import accuracy_score, f1_score

In [2]:
model_name_or_path = 'facebook/hubert-base-ls960'
dataset_name = 'amu-cai/nEMO'
output_dir = './hubert-nemo-emotion'
per_device_train_batch_size = 8
per_device_eval_batch_size = 8
learning_rate = 3e-5
num_train_epochs = 5
eval_steps = 200
save_steps = 200

In [3]:
def prepare_dataset(batch):
    audio = batch['audio']['array']
    features = feature_extractor(
        audio,
        sampling_rate=16_000,
        return_attention_mask=True,
        return_tensors='pt'
    )
    batch['input_values'] = features.input_values[0].numpy().tolist()
    batch['attention_mask'] = features.attention_mask[0].numpy().tolist()
    batch['labels'] = label2id[batch['emotion']]
    return batch

In [4]:
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=-1)
    acc = accuracy_score(pred.label_ids, preds)
    f1 = f1_score(pred.label_ids, preds, average='macro')
    return {'accuracy': acc, 'f1_macro': f1}

In [5]:
raw = load_dataset(dataset_name, split='train')
ds = raw.train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = ds['train'], ds['test']
train_ds = train_ds.cast_column('audio', Audio(sampling_rate=16_000))
eval_ds = eval_ds.cast_column('audio', Audio(sampling_rate=16_000))

In [6]:

unique_emotions = sorted(set(train_ds['emotion']))
label2id = {emo: i for i, emo in enumerate(unique_emotions)}
id2label = {i: emo for emo, i in label2id.items()}

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    model_name_or_path,
    sampling_rate=16_000,
    return_attention_mask=True,
)
model = HubertForSequenceClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(unique_emotions),
    label2id=label2id,
    id2label=id2label,
    problem_type='single_label_classification',
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
train_ds = train_ds.map(
    prepare_dataset,
    remove_columns=train_ds.column_names
)
eval_ds = eval_ds.map(
    prepare_dataset,
    remove_columns=eval_ds.column_names
)

In [8]:
data_collator = DataCollatorWithPadding(feature_extractor, padding=True)
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    eval_strategy='steps',
    eval_steps=eval_steps,
    save_steps=save_steps,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    logging_steps=10000,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    report_to="none"
)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model(output_dir)

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro
200,No log,1.260865,0.492205,0.437525
400,No log,0.945259,0.628062,0.568439
600,No log,0.639579,0.77951,0.772459
800,No log,0.374178,0.893096,0.889634


RuntimeError: [enforce fail at inline_container.cc:659] . unexpected pos 527565056 vs 527564948