# Install dependencies

In [None]:
!pip install datasets[audio]
!pip install git+https://github.com/huggingface/transformers
!pip install evaluate
!pip install fsspec==2023.9.2

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-_p0gcd5x
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-_p0gcd5x
  Resolved https://github.com/huggingface/transformers to commit 43d3b1931a7d3cddac9947adcb19bb3b1f8abedb
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


# Load dataset


In [None]:
from datasets import load_dataset

# Load the dataset identified by "amnesiackid/ravdess-emotion-intensity".
ravdess = load_dataset("amnesiackid/ravdess-emotion-intensity")
# Bare expression so the notebook displays the loaded DatasetDict summary.
ravdess

DatasetDict({
    train: Dataset({
        features: ['audio', 'emotion_labels', 'intensity'],
        num_rows: 1440
    })
})

In [None]:
from datasets import Audio
# Cast the "audio" column to a 16 000 Hz Audio feature so clips are resampled
# to the rate the HuBERT model requires.
ravdess = ravdess.cast_column("audio", Audio(sampling_rate=16000))


In the dataset, labels are strings such as "angry", so we need to map them to machine-readable numerical values. For convenience, this id-label mapping follows the emotion ordering of the original RAVDESS dataset.

In [None]:
# Emotion names listed in id order: the position of each name is its numeric id.
id2label = dict(
    enumerate(
        [
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ]
    )
)
# Reverse lookup: emotion name -> numeric id.
label2id = {name: idx for idx, name in id2label.items()}


def numerize_label(examples):
    """Swap the string emotion label of a single example for its numeric id.

    Args:
        examples: mapping with an "emotion_labels" entry holding an emotion name.

    Returns:
        The same mapping object, with "emotion_labels" replaced by the id
        looked up in ``label2id``.

    Raises:
        KeyError: if the emotion name is not a key of ``label2id``.
    """
    emotion_id = label2id[examples["emotion_labels"]]
    examples["emotion_labels"] = emotion_id
    return examples

# Run numerize_label over the dataset so "emotion_labels" holds numeric ids.
ravdess = ravdess.map(numerize_label)
# Bare expression so the notebook displays the resulting DatasetDict summary.
ravdess

DatasetDict({
    train: Dataset({
        features: ['audio', 'emotion_labels', 'intensity'],
        num_rows: 1440
    })
})

# Generate train / evaluation split


In [None]:
# Shuffle the single "train" split and hold out 20% of it as a "test" split;
# the fixed seed keeps the split the same across runs.
ravdess = ravdess["train"].train_test_split(seed=42, shuffle=True, test_size=0.2)
# Bare expression so the notebook displays the new train/test DatasetDict.
ravdess

DatasetDict({
    train: Dataset({
        features: ['audio', 'emotion_labels', 'intensity'],
        num_rows: 1152
    })
    test: Dataset({
        features: ['audio', 'emotion_labels', 'intensity'],
        num_rows: 288
    })
})

# Preprocess

In [None]:
from transformers import AutoFeatureExtractor

# Checkpoint id; reused later when loading the model and naming the output dir.
model_id = "ntu-spml/distilhubert"
# Load the feature extractor for that checkpoint, requesting input
# normalization and attention masks.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

In [None]:
import numpy as np
max_duration = 4.5  # seconds of audio kept per clip before truncation


def preprocess_function(examples):
    """Run the feature extractor over a batch of audio examples.

    Args:
        examples: batch whose "audio" entry is a sequence of items, each
            exposing the raw waveform under the "array" key.

    Returns:
        Whatever ``feature_extractor`` returns for the batch, with clips
        truncated to ``max_duration`` seconds, attention masks requested and
        NumPy tensors as the output format.
    """
    sampling_rate = feature_extractor.sampling_rate
    # Longest clip allowed, expressed in samples rather than seconds.
    max_samples = int(sampling_rate * max_duration)
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=sampling_rate,
        max_length=max_samples,
        truncation=True,
        return_attention_mask=True,
        return_tensors="np",
    )

In [None]:
# Encode every split in batches of 100 examples using a single process; the
# raw "audio" and "intensity" columns are dropped from the result.
ravdess_encoded = ravdess.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns=["audio", "intensity"],
)
# Bare expression so the notebook displays the encoded DatasetDict summary.
ravdess_encoded

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Map:   0%|          | 0/288 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['emotion_labels', 'input_values', 'attention_mask'],
        num_rows: 1152
    })
    test: Dataset({
        features: ['emotion_labels', 'input_values', 'attention_mask'],
        num_rows: 288
    })
})

In [None]:
# Rename the "emotion_labels" column to "label".
# NOTE(review): presumably because the Trainer looks for a "label" column — confirm.
ravdess_encoded = ravdess_encoded.rename_column("emotion_labels", "label")


Check that the columns were processed correctly and that their values have the expected types. The expected output is:

 {'label': <class 'int'>, 'input_values': <class 'list'>, 'attention_mask': <class 'list'>}

In [None]:
# Inspect the first encoded training example: its column names and the Python
# type stored under each column.
sample = ravdess_encoded["train"][0]
print(
    sample.keys(),
    {column: type(value) for column, value in sample.items()},
)

dict_keys(['label', 'input_values', 'attention_mask']) {'label': <class 'int'>, 'input_values': <class 'list'>, 'attention_mask': <class 'list'>}


In [None]:
# Numeric id -> emotion name; ids are the positions in this tuple.
id2label = {
    idx: name
    for idx, name in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        )
    )
}
# Emotion name -> numeric id (inverse of id2label).
label2id = dict(zip(id2label.values(), id2label.keys()))
id2label[7]  # check if mapping is correct


'surprised'

# Finetune pretrained model


In [None]:
from transformers import AutoModelForAudioClassification

# One output class per entry of the id -> label mapping.
num_labels = len(id2label)

# Load the checkpoint with an audio-classification head sized to num_labels,
# and store both label mappings on the model.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Hyperparameters referenced by the TrainingArguments below.
num_train_epochs = 16
gradient_accumulation_steps = 1
batch_size = 8
# Checkpoint name without the organisation prefix; used for the output dir.
model_name = model_id.rsplit("/", 1)[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-ravdess",
    # Optimisation schedule.
    num_train_epochs=num_train_epochs,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # NOTE(review): fp16 presumably needs a GPU runtime — confirm before
    # running on CPU.
    fp16=True,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best accuracy when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=5,
    # Set to False to skip uploading to the Hugging Face Hub.
    push_to_hub=True,
)

In [None]:
import evaluate
import numpy as np

# Accuracy metric object shared by every call to compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a set of predictions with the accuracy metric.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores, reduced
            with argmax over axis 1) and ``label_ids`` (reference labels).

    Returns:
        The result of ``metric.compute`` for the predicted class ids against
        the reference labels.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=reference_ids)

Downloading builder script: 0.00B [00:00, ?B/s]

To share the model, you need to log in to your Hugging Face account. Alternatively, you can skip this step and set `push_to_hub=False` in `training_args`.

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import Trainer

# Wire the model, training arguments, encoded splits and metric into a Trainer.
# The feature extractor is passed as `processing_class`: the older `tokenizer`
# keyword is deprecated in Trainer.__init__ and scheduled for removal, which
# matters here because transformers is installed from the main branch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ravdess_encoded["train"],
    eval_dataset=ravdess_encoded["test"],
    processing_class=feature_extractor,
    compute_metrics=compute_metrics,
)

# Start fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.9773,1.923018,0.326389
2,1.4414,1.497025,0.503472
3,1.1158,1.185425,0.5625
4,0.9079,0.757831,0.784722
5,0.4068,0.619608,0.784722
6,0.2004,0.630255,0.788194
7,0.0515,0.490312,0.840278
8,0.0183,0.520624,0.833333
9,0.0086,0.588227,0.836806
10,0.0054,0.530053,0.850694


TrainOutput(global_step=2304, training_loss=0.4298569751489494, metrics={'train_runtime': 1402.7747, 'train_samples_per_second': 13.14, 'train_steps_per_second': 1.642, 'total_flos': 1.754537446330107e+17, 'train_loss': 0.4298569751489494, 'epoch': 16.0})

#

# Apply this model in a pipeline


In [None]:
from transformers import pipeline
# Build an audio-classification pipeline around the fine-tuned checkpoint.
classifier = pipeline(
    task="audio-classification",
    model="amnesiackid/distilhubert-finetuned-ravdess",
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/94.8M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Path of the audio file to classify.
audio = "/content/1001_DFA_SAD_XX.wav" # replace with your test audio path
# Run the pipeline on the file; the captured output is a list of
# {'score', 'label'} dicts.
result = classifier(audio)
# Bare expression so the notebook displays the predictions.
result

[{'score': 0.8490888476371765, 'label': 'sad'},
 {'score': 0.11445887386798859, 'label': 'disgust'},
 {'score': 0.016835223883390427, 'label': 'fearful'},
 {'score': 0.014721513725817204, 'label': 'surprised'},
 {'score': 0.003034199122339487, 'label': 'happy'},
 {'score': 0.0011882490944117308, 'label': 'neutral'},
 {'score': 0.0005517051904462278, 'label': 'calm'},
 {'score': 0.00012133312702644616, 'label': 'angry'}]