In [2]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
import librosa
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

def extract_log_mel_spectrogram(audio_path, n_mels=128, duration=3, sr=22050):
    signal, sr = librosa.load(audio_path, sr=sr, duration=duration)
    mel_spec = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels, fmax=8000)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    return log_mel_spec

import torch.nn.functional as F

class EmotionDataset(Dataset):
    def __init__(self, file_paths, labels, n_mels=128, duration=3, sr=22050, max_width=None):
        self.file_paths = file_paths
        self.labels = labels
        self.n_mels = n_mels
        self.duration = duration
        self.sr = sr
        self.max_width = max_width

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # Extract log-mel spectrogram
        log_mel_spec = extract_log_mel_spectrogram(file_path, n_mels=self.n_mels, duration=self.duration, sr=self.sr)
        log_mel_spec = torch.tensor(log_mel_spec, dtype=torch.float32).unsqueeze(0)  # Add channel dim

        # Pad or truncate to max_width
        if self.max_width is not None:
            log_mel_spec = self._pad_or_truncate(log_mel_spec, self.max_width)

        return log_mel_spec, label

    def _pad_or_truncate(self, spec, max_width):
        _, _, width = spec.shape
        if width < max_width:
            # Pad with zeros along the time dimension
            pad_width = max_width - width
            spec = F.pad(spec, (0, pad_width))  # Pad last dimension (time)
        else:
            # Truncate along the time dimension
            spec = spec[:, :, :max_width]
        return spec



In [23]:
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm

def train_model(model, train_loader, val_loader, num_epochs, device):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    best_val_accuracy = 0.0

    for epoch in range(num_epochs):
        # Training loop
        model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Validation loop
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_accuracy = correct / total
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

        # Save best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), "best_emotion_model.pth")

    print("Training complete. Best validation accuracy:", best_val_accuracy)


In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm

# Example: Prepare dataset
emo_dict = {0: 'Angry', 1: 'Happy', 2: 'Neutral', 3: 'Sad', 4:'Surprise'}
emo_dict_r = {'Angry': 0, 'Happy': 1, 'Neutral': 2, 'Sad': 3, 'Surprise': 4}
file_paths = []
labels = []

for i in range(11,21):
    path_to_open = f"./dataset/EmotionSpeechDataset/00{i}/00{i}.txt"
    with open(path_to_open) as f:
        for line in f.readlines():
            label = line.split()[-1]
            file_name = line.split()[0]
            labels.append(emo_dict_r.get(label, 2))
            file = f"./dataset/EmotionSpeechDataset/00{i}/{label}/{file_name}.wav"
            file_paths.append(file)


print(len(file_paths))

# Train-test split
all_spectrograms = [extract_log_mel_spectrogram(path).shape[1] for path in tqdm(file_paths)]
max_width = max(all_spectrograms)
train_paths, val_paths, train_labels, val_labels = train_test_split(file_paths, labels, test_size=0.2, random_state=42)

# Create datasets and loaders
train_dataset = EmotionDataset(train_paths, train_labels, max_width=max_width)
val_dataset = EmotionDataset(val_paths, val_labels, max_width=max_width)


train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Train the model



17500




In [None]:
print(max_width)

In [24]:
from EmotionRecognitionModel import EmotionRecognitionModel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = EmotionRecognitionModel(num_classes=5).to(device)
train_model(model, train_loader, val_loader, num_epochs=20, device=device)

cuda


100%|██████████| 875/875 [12:20<00:00,  1.18it/s]


Epoch 1/20, Loss: 1247.9646, Val Loss: 240.7929, Val Acc: 0.4794


100%|██████████| 875/875 [12:12<00:00,  1.19it/s]


Epoch 2/20, Loss: 823.6377, Val Loss: 152.6079, Val Acc: 0.7291


100%|██████████| 875/875 [12:12<00:00,  1.19it/s]


Epoch 3/20, Loss: 579.4706, Val Loss: 113.8442, Val Acc: 0.7983


100%|██████████| 875/875 [12:05<00:00,  1.21it/s]


Epoch 4/20, Loss: 444.2037, Val Loss: 111.4494, Val Acc: 0.8046


100%|██████████| 875/875 [12:07<00:00,  1.20it/s]


Epoch 5/20, Loss: 352.8500, Val Loss: 89.2825, Val Acc: 0.8500


100%|██████████| 875/875 [12:11<00:00,  1.20it/s]


Epoch 6/20, Loss: 295.1114, Val Loss: 85.5815, Val Acc: 0.8566


100%|██████████| 875/875 [10:00<00:00,  1.46it/s]


Epoch 7/20, Loss: 243.5537, Val Loss: 76.7304, Val Acc: 0.8789


100%|██████████| 875/875 [09:28<00:00,  1.54it/s]


Epoch 8/20, Loss: 205.4444, Val Loss: 64.3326, Val Acc: 0.9000


100%|██████████| 875/875 [09:28<00:00,  1.54it/s]


Epoch 9/20, Loss: 169.2570, Val Loss: 52.0982, Val Acc: 0.9214


100%|██████████| 875/875 [09:33<00:00,  1.53it/s]


Epoch 10/20, Loss: 133.4178, Val Loss: 67.1986, Val Acc: 0.8977


100%|██████████| 875/875 [09:26<00:00,  1.55it/s]


Epoch 11/20, Loss: 137.1244, Val Loss: 75.5279, Val Acc: 0.8877


100%|██████████| 875/875 [09:23<00:00,  1.55it/s]


Epoch 12/20, Loss: 112.7111, Val Loss: 52.3865, Val Acc: 0.9209


100%|██████████| 875/875 [09:23<00:00,  1.55it/s]


Epoch 13/20, Loss: 90.7871, Val Loss: 61.7235, Val Acc: 0.9114


100%|██████████| 875/875 [09:27<00:00,  1.54it/s]


Epoch 14/20, Loss: 84.0062, Val Loss: 63.7010, Val Acc: 0.9211


100%|██████████| 875/875 [09:27<00:00,  1.54it/s]


Epoch 15/20, Loss: 74.6583, Val Loss: 69.3418, Val Acc: 0.9100


100%|██████████| 875/875 [09:28<00:00,  1.54it/s]


Epoch 16/20, Loss: 62.6586, Val Loss: 62.6880, Val Acc: 0.9200


100%|██████████| 875/875 [09:32<00:00,  1.53it/s]


Epoch 17/20, Loss: 56.2329, Val Loss: 58.6429, Val Acc: 0.9243


100%|██████████| 875/875 [09:32<00:00,  1.53it/s]


Epoch 18/20, Loss: 53.2488, Val Loss: 46.4857, Val Acc: 0.9386


100%|██████████| 875/875 [09:26<00:00,  1.54it/s]


Epoch 19/20, Loss: 45.2096, Val Loss: 87.6889, Val Acc: 0.8826


100%|██████████| 875/875 [09:25<00:00,  1.55it/s]


Epoch 20/20, Loss: 46.4132, Val Loss: 45.9564, Val Acc: 0.9386
Training complete. Best validation accuracy: 0.9385714285714286


In [25]:
model.load_state_dict(torch.load("best_emotion_model.pth"))
model.eval()

  model.load_state_dict(torch.load("best_emotion_model.pth"))


EmotionRecognitionModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(2048, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=5, bias=True)
)

In [26]:
torch.save(model, "best.pt")