# 1. Install Dependencies (including PEFT for LoRA)

In [1]:
!pip install transformers datasets librosa soundfile evaluate accelerate peft kagglehub --quiet

import os
import glob
import numpy as np
import pandas as pd
import librosa
import torch
import kagglehub
import evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from transformers import (
    Wav2Vec2FeatureExtractor,
    HubertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import get_peft_model, LoraConfig, TaskType

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

# 2. Download Datasets


In [2]:
print("Downloading Datasets...")
ravdess_path = kagglehub.dataset_download("orvile/ravdess-dataset")
crema_path = kagglehub.dataset_download("ejlok1/cremad")
tess_path = kagglehub.dataset_download("ejlok1/toronto-emotional-speech-set-tess")

def get_combined_data(ravdess, crema, tess):
    paths = []
    emotions = []

    # 1. RAVDESS
    e_map_r = {"01": "neutral", "02": "neutral", "03": "happy", "04": "sad",
               "05": "angry", "06": "fearful", "07": "disgust", "08": "happy"}
    for p in glob.glob(os.path.join(ravdess, "**", "*.wav"), recursive=True):
        code = os.path.basename(p).split('-')[2]
        if code in e_map_r:
            paths.append(p)
            emotions.append(e_map_r[code])

    # 2. CREMA-D
    e_map_c = {"A": "angry", "D": "disgust", "F": "fearful", "H": "happy", "N": "neutral", "S": "sad"}
    for p in glob.glob(os.path.join(crema, "**", "*.wav"), recursive=True):
        code = os.path.basename(p).split('_')[2]
        if code[0] in e_map_c:
            paths.append(p)
            emotions.append(e_map_c[code[0]])

    # 3. TESS
    for p in glob.glob(os.path.join(tess, "**", "*.wav"), recursive=True):
        filename = os.path.basename(p)
        emotion = filename.split('_')[-1].replace('.wav', '').lower()
        if emotion == "ps": emotion = "happy" # pleasant surprise -> happy
        paths.append(p)
        emotions.append(emotion)

    df = pd.DataFrame({"path": paths, "emotion": emotions})
    df['emotion'] = df['emotion'].replace({
        "fear": "fearful",  # Merge TESS 'fear' into 'fearful'
        "sadness": "sad",   # Just in case TESS uses 'sadness'
        "happiness": "happy"
    })

    return df

df = get_combined_data(ravdess_path, crema_path, tess_path)
print(f"Total samples: {len(df)}")

# Label Mapping
label2id = {label: i for i, label in enumerate(df['emotion'].unique())}
id2label = {i: label for label, i in label2id.items()}
df['label'] = df['emotion'].map(label2id)

# Split (Stratified to ensure balanced classes)
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['emotion'])

Downloading Datasets...
Downloading from https://www.kaggle.com/api/v1/datasets/download/orvile/ravdess-dataset?dataset_version_number=1...


100%|██████████| 23.9G/23.9G [05:25<00:00, 78.8MB/s]

Extracting files...





Using Colab cache for faster access to the 'cremad' dataset.
Using Colab cache for faster access to the 'toronto-emotional-speech-set-tess' dataset.
Total samples: 15494


# 3. Optimized Dataset Class

In [3]:
# Initialize Feature Extractor
model_id = "facebook/hubert-large-ls960-ft"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)

class SERDataset(Dataset):
    def __init__(self, dataframe, processor, target_sr=16000, max_seconds=5, augment=False):
        self.paths = dataframe['path'].tolist()
        self.labels = dataframe['label'].tolist()
        self.processor = processor
        self.target_sr = target_sr
        self.max_len = target_sr * max_seconds
        self.augment = augment  # New flag

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        speech, sr = librosa.load(self.paths[idx], sr=self.target_sr)

        # 1. Augmentation: Add Gaussian Noise (Only for training)
        if self.augment:
            noise_amp = 0.005 * np.random.uniform() * np.amax(speech)
            speech = speech + noise_amp * np.random.normal(size=speech.shape[0])

        # 2. Variable Length Padding (Truncate or Pad)
        if len(speech) > self.max_len:
            speech = speech[:self.max_len]
        else:
            speech = np.pad(speech, (0, self.max_len - len(speech)))

        inputs = self.processor(speech, sampling_rate=self.target_sr, return_tensors="pt")

        return {
            "input_values": inputs.input_values[0],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Update dataset initialization to use augmentation ONLY on train
train_ds = SERDataset(train_df, feature_extractor, augment=True) # Enable augment
test_ds = SERDataset(test_df, feature_extractor, augment=False)  # Keep validation clean

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

# 4. Model Setup with LoRA

In [4]:
from peft import get_peft_model, LoraConfig

# Load Base Model
model = HubertForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True
)

peft_config = LoraConfig(
    inference_mode=False,
    r=32,                 # Increase rank from 8 to 32 (more capacity)
    lora_alpha=64,        # Usually set alpha = 2 * r
    lora_dropout=0.1,

    # Target ALL attention and linear layers, not just q/v
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],

    modules_to_save=["classifier", "projector"]
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,555,398 || all params: 322,258,060 || trainable%: 2.0342


# 5. Training

In [5]:
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/hubert_lora_ser",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=20,
    warmup_ratio=0.1,
    fp16=True,
    save_total_limit=2,

    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

    metric_for_best_model="accuracy",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

print("Starting Training with LoRA...")
trainer.train()

Downloading builder script: 0.00B [00:00, ?B/s]

Starting Training with LoRA...


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6845,0.922079,0.689462
2,0.7093,0.58826,0.803441
3,0.5941,0.666579,0.807312
4,0.483,0.470696,0.849462
5,0.4264,0.430605,0.868387
6,0.4039,0.472825,0.849892
7,0.3611,0.43602,0.859355
8,0.3516,0.413063,0.866667
9,0.3502,0.429292,0.874409
10,0.3104,0.53007,0.862366


TrainOutput(global_step=16480, training_loss=0.38724750963229576, metrics={'train_runtime': 16151.2312, 'train_samples_per_second': 16.307, 'train_steps_per_second': 1.02, 'total_flos': 4.074063736454398e+19, 'train_loss': 0.38724750963229576, 'epoch': 20.0})

# 6. Final Evaluation

In [6]:
preds = trainer.predict(test_ds)
print(classification_report(preds.label_ids, np.argmax(preds.predictions, axis=1), target_names=list(label2id.keys())))

              precision    recall  f1-score   support

     fearful       0.90      0.83      0.86       367
       happy       0.95      0.96      0.96       516
     neutral       0.85      0.96      0.91       368
         sad       0.81      0.82      0.81       367
       angry       0.94      0.95      0.94       367
     disgust       0.94      0.84      0.88       340

    accuracy                           0.90      2325
   macro avg       0.90      0.89      0.89      2325
weighted avg       0.90      0.90      0.90      2325



In [9]:
from google.colab import drive

# 2. Save the model
trainer.save_model("/content/drive/MyDrive/hubert_lora_ser_final")
feature_extractor.save_pretrained("/content/drive/MyDrive/hubert_lora_ser_final")

['/content/drive/MyDrive/hubert_lora_ser_final/preprocessor_config.json']