# Simple Neural Network for Emotion Recognition on nEMO with PyTorch

This notebook demonstrates how to train and evaluate a simple feedforward neural network on the nEMO dataset for speech emotion recognition using PyTorch and Torchaudio.

We will:
1. Load and preprocess the nEMO dataset.
2. Extract MFCC features from raw audio.
3. Define a simple neural network.
4. Train and evaluate the model, reporting accuracy and macro F1 score.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
from datasets import load_dataset, Audio
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SAMPLE_RATE = 16_000
BATCH_SIZE = 16
LEARNING_RATE = 1e-3
NUM_EPOCHS = 25
N_MFCC = 40


In [3]:
raw_ds = load_dataset('amu-cai/nEMO', split='train')
ds = raw_ds.train_test_split(test_size=0.1, seed=42)
train_raw = ds['train']
eval_raw = ds['test']


train_raw = train_raw.cast_column('audio', Audio(sampling_rate=SAMPLE_RATE))
eval_raw = eval_raw.cast_column('audio', Audio(sampling_rate=SAMPLE_RATE))


emotions = sorted(set(train_raw['emotion']))
label2id = {e:i for i,e in enumerate(emotions)}
id2label = {i:e for e,i in label2id.items()}
NUM_LABELS = len(emotions)
print(f"Found {NUM_LABELS} emotion classes: {emotions}")

Found 6 emotion classes: ['anger', 'fear', 'happiness', 'neutral', 'sadness', 'surprised']


In [4]:
mfcc_transform = torchaudio.transforms.MFCC(sample_rate=SAMPLE_RATE, n_mfcc=N_MFCC)

class NemoDataset(Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        waveform = torch.tensor(sample['audio']['array']).float()
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)
        mfcc = mfcc_transform(waveform)
        feats = mfcc.mean(dim=2).squeeze()
        label = label2id[sample['emotion']]
        return feats, label

torch_train = NemoDataset(train_raw)
torch_eval = NemoDataset(eval_raw)

train_loader = DataLoader(torch_train, batch_size=BATCH_SIZE, shuffle=True)
eval_loader = DataLoader(torch_eval, batch_size=BATCH_SIZE)



In [5]:
class SimpleEmotionNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

model = SimpleEmotionNet(input_dim=N_MFCC, hidden_dim=128, num_classes=NUM_LABELS).to(DEVICE)

In [6]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [7]:
best_val_loss = float('inf')
patience = 3
counter = 0

for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for feats, labels in train_loader:
        feats, labels = feats.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(feats)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * feats.size(0)
    avg_loss = total_loss / len(train_loader.dataset)

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for feats, labels in eval_loader:
            feats, labels = feats.to(DEVICE), labels.to(DEVICE)
            outputs = model(feats)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * feats.size(0)
    avg_val_loss = val_loss / len(eval_loader.dataset)

    print(f"Epoch {epoch}/{NUM_EPOCHS}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
        torch.save(model.state_dict(), 'best_model.pt')  # Save best model
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            break

Epoch 1/25, Train Loss: 3.9447, Val Loss: 1.4273
Epoch 2/25, Train Loss: 1.4264, Val Loss: 1.2533
Epoch 3/25, Train Loss: 1.2703, Val Loss: 1.1281
Epoch 4/25, Train Loss: 1.1404, Val Loss: 1.0104
Epoch 5/25, Train Loss: 1.0820, Val Loss: 0.9117
Epoch 6/25, Train Loss: 1.0086, Val Loss: 0.8519
Epoch 7/25, Train Loss: 0.9508, Val Loss: 0.7788
Epoch 8/25, Train Loss: 0.8993, Val Loss: 0.7222
Epoch 9/25, Train Loss: 0.8589, Val Loss: 0.7925
Epoch 10/25, Train Loss: 0.8402, Val Loss: 0.6491
Epoch 11/25, Train Loss: 0.7944, Val Loss: 0.6352
Epoch 12/25, Train Loss: 0.7683, Val Loss: 0.5876
Epoch 13/25, Train Loss: 0.7361, Val Loss: 0.6036
Epoch 14/25, Train Loss: 0.7436, Val Loss: 0.5471
Epoch 15/25, Train Loss: 0.7083, Val Loss: 0.5476
Epoch 16/25, Train Loss: 0.6948, Val Loss: 0.5500
Epoch 17/25, Train Loss: 0.6769, Val Loss: 0.5230
Epoch 18/25, Train Loss: 0.6644, Val Loss: 0.5290
Epoch 19/25, Train Loss: 0.6495, Val Loss: 0.5147
Epoch 20/25, Train Loss: 0.6489, Val Loss: 0.5140
Epoch 21/

In [9]:

model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for feats, labels in eval_loader:
        feats = feats.to(DEVICE)
        logits = model(feats)
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

acc = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average='macro')
print(f"Evaluation Accuracy: {acc:.4f}")
print(f"Evaluation Macro F1: {f1:.4f}")

Evaluation Accuracy: 0.8441
Evaluation Macro F1: 0.8388
