In [56]:
!mkdir -p ~/.kaggle
!mv "/content/kaggle (1).json" ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

mv: cannot stat '/content/kaggle (1).json': No such file or directory


In [57]:

# Collect TESS audio dataset from Kaggle
!kaggle datasets download -d ejlok1/toronto-emotional-speech-set-tess --unzip

Dataset URL: https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess
License(s): Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)
Downloading toronto-emotional-speech-set-tess.zip to /content
 81% 345M/428M [00:02<00:01, 78.8MB/s]
100% 428M/428M [00:02<00:00, 152MB/s] 


In [58]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
# This allows normalization for the specgrams which allows for clearer visualizations
from matplotlib.colors import Normalize
import seaborn as sns

import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [59]:
# Put the tess directory in a list
tess = os.listdir('/content/TESS Toronto emotional speech set data/')
# Make a list for emotion labels and a list for path to audio files
emotions = []
paths = []
# Loop through all the audio file directories
for dir in tess:
    # Loop through all the files in each directory
    for file in os.listdir('/content/TESS Toronto emotional speech set data/' + dir):
        # Extract the emotion label from the file name
        emotion = file.split('.')[0]
        emotion = emotion.split('_')[2]
        if emotion == 'ps':
            emotion = 'surprise'
        elif emotion == 'sad':
            emotion = 'sadness'
        elif emotion == 'disgust':
            emotion = 'disgust'
        elif emotion == 'angry':
            emotion = 'anger'
        elif emotion == 'happy':
            emotion = 'happiness'
        elif emotion == 'neutral':
            emotion = 'neutral'
        elif emotion == 'fear':
            emotion = 'fear'
        else:
            emotion = 'Unknown'
        # Extract the path
        path = '/content/TESS Toronto emotional speech set data/' + dir + '/' + file
        # Append the emotion and path to their lists
        emotions.append(emotion)
        paths.append(path)

In [60]:
df = pd.DataFrame(emotions, columns = ['Emotions'])

In [61]:
df['Paths'] = paths

In [62]:
df.shape

(2800, 2)

In [63]:
import torch
import soundfile as sf
import torchaudio
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [64]:
TARGET_SR = 16000

def ensure_16k(path, out_path=None):
    y, sr = librosa.load(path, sr=None)
    if sr != TARGET_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
    if out_path is not None:
        sf.write(out_path, y, TARGET_SR)
        return out_path
    else:
        return path

In [65]:
resampled_dir = "/content/drive/MyDrive/resampled_audio_16k"
os.makedirs(resampled_dir, exist_ok=True)

new_paths = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    fname = os.path.basename(row['Paths'])
    new_path = os.path.join(resampled_dir, fname)
    new_paths.append(ensure_16k(row['Paths'], new_path))

df['Paths'] = new_paths

  0%|          | 0/2800 [00:00<?, ?it/s]

In [66]:
df.shape

(2800, 2)

In [67]:
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['Emotions'])

label2id = {label: i for i, label in enumerate(le.classes_)}
id2label = {i: label for label, i in label2id.items()}
num_labels = len(label2id)

print("Label mapping:", label2id)

Label mapping: {'anger': 0, 'disgust': 1, 'fear': 2, 'happiness': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6}


In [68]:
hf_ds = Dataset.from_pandas(df[['Paths', 'Emotions']])
hf_ds = hf_ds.train_test_split(test_size=0.2, seed=42)
train_ds = hf_ds['train']
eval_ds = hf_ds['test']

In [69]:
model_name = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label
)


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
def prepare_batch(batch):
    speech, sr = torchaudio.load(batch["Paths"])
    speech = speech.squeeze().numpy()

    inputs = processor(
        speech,
        sampling_rate=TARGET_SR,
        padding="longest",   # ✅ dynamically pad
        truncation=True,
        max_length=16000*5,
        return_tensors="pt",
        return_attention_mask=True
    )

    batch["input_values"] = inputs.input_values[0]
    batch["attention_mask"] = inputs.attention_mask[0]
    batch["labels"] = torch.tensor(label2id[batch["Emotions"]])
    return batch

train_ds = train_ds.map(prepare_batch, remove_columns=["Paths","Emotions"])
eval_ds = eval_ds.map(prepare_batch, remove_columns=["Paths","Emotions"])


Map:   0%|          | 0/2240 [00:00<?, ? examples/s]

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

In [71]:
training_args = TrainingArguments(
    output_dir="./wav2vec2_emotion_model",
    per_device_train_batch_size=4,    # lower if OOM
    per_device_eval_batch_size=4,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    learning_rate=1e-4,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=torch.cuda.is_available()
)

In [72]:
# =========================================================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

In [73]:
from dataclasses import dataclass
from typing import Dict, List, Union
import torch

@dataclass
class CustomDataCollator:
    processor: any

    def __call__(self, features):
        # Extract input_values and labels
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [f["labels"] for f in features]

        # Pad variable-length audio sequences
        batch = self.processor.pad(
            input_features,
            padding=True,
            return_tensors="pt"
        )

        # Convert labels to tensor
        batch["labels"] = torch.tensor(label_features, dtype=torch.long)
        return batch



data_collator = CustomDataCollator(processor=processor)

In [74]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [75]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1466,0.00388,1.0
2,0.0333,0.001056,1.0
3,0.0007,0.000361,1.0
4,0.0003,0.000232,1.0
5,0.0003,0.0002,1.0


TrainOutput(global_step=2800, training_loss=0.15182550223810332, metrics={'train_runtime': 874.1326, 'train_samples_per_second': 12.813, 'train_steps_per_second': 3.203, 'total_flos': 2.43584598644166e+17, 'train_loss': 0.15182550223810332, 'epoch': 5.0})

In [77]:
# Evaluate on evaluation set & print classification report
predictions = trainer.predict(eval_ds)
preds = np.argmax(predictions.predictions, axis=-1)
true = predictions.label_ids
print("Accuracy:", accuracy_score(true, preds))
print(classification_report(true, preds, target_names=le.classes_))


Accuracy: 1.0
              precision    recall  f1-score   support

       anger       1.00      1.00      1.00        92
     disgust       1.00      1.00      1.00        80
        fear       1.00      1.00      1.00        80
   happiness       1.00      1.00      1.00        73
     neutral       1.00      1.00      1.00        72
     sadness       1.00      1.00      1.00        79
    surprise       1.00      1.00      1.00        84

    accuracy                           1.00       560
   macro avg       1.00      1.00      1.00       560
weighted avg       1.00      1.00      1.00       560

