In [15]:
import os
import torch
import torch.nn as nn
import torchaudio
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

class RavdessDataset(Dataset):
    """Map-style dataset that lazily loads audio clips from disk.

    Each item is ``(features, label)``. ``features`` is the tensor returned by
    ``torchaudio.load`` for the file, optionally resampled, then passed through
    ``transform`` when one is supplied.

    Args:
        file_list: Sequence of audio file paths.
        labels: Sequence of labels aligned index-for-index with ``file_list``.
        transform: Optional callable applied to the loaded waveform.
        target_sample_rate: Optional sample rate in Hz. When given, a clip whose
            native rate differs is resampled before ``transform`` runs, so a
            transform configured for one fixed rate sees that rate for every
            file. ``None`` (the default) leaves the waveform exactly as loaded.

    Raises:
        ValueError: If ``file_list`` and ``labels`` have different lengths.
    """

    def __init__(self, file_list, labels, transform=None, target_sample_rate=None):
        if len(file_list) != len(labels):
            raise ValueError(
                f"file_list has {len(file_list)} entries but labels has {len(labels)}"
            )
        self.file_list = file_list
        self.labels = labels
        self.transform = transform
        self.target_sample_rate = target_sample_rate

    def __len__(self):
        """Return the number of clips in the dataset."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(features, label)``."""
        file_path = self.file_list[idx]
        label = self.labels[idx]

        waveform, sample_rate = torchaudio.load(file_path)

        # torchaudio.load returns the file's native rate; only resample when the
        # caller asked for a specific rate and the file does not already match.
        if self.target_sample_rate is not None and sample_rate != self.target_sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, sample_rate, self.target_sample_rate
            )

        # Compare against None explicitly: a callable that defines __len__
        # (e.g. an empty nn.Sequential) is falsy and would be skipped otherwise.
        if self.transform is not None:
            waveform = self.transform(waveform)

        return waveform, label

def load_data(dataset_path):
    """Collect ``.wav`` files under ``dataset_path`` with their emotion labels.

    The label of a file is the third dash-separated field of its filename,
    parsed as an integer. The raw codes are then passed through
    ``LabelEncoder.fit_transform``.

    Directories and files are visited in sorted order. ``os.walk`` otherwise
    yields entries in filesystem order, which would make any seeded split of
    the returned lists differ from machine to machine.

    Args:
        dataset_path: Root directory that is searched recursively.

    Returns:
        tuple: ``(file_list, labels)`` where ``file_list`` is a list of file
        paths and ``labels`` is the encoder output, aligned with ``file_list``.

    Raises:
        IndexError: If a ``.wav`` filename has fewer than three dash-separated
            fields.
        ValueError: If the third field is not an integer.
    """
    file_list = []
    labels = []

    for root, dirs, files in os.walk(dataset_path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for filename in sorted(files):
            if not filename.lower().endswith(".wav"):
                continue
            file_list.append(os.path.join(root, filename))
            # Extract the emotion code: third component of the filename.
            labels.append(int(filename.split('-')[2]))

    labels = LabelEncoder().fit_transform(labels)

    return file_list, labels

# --- Configuration -----------------------------------------------------------
SEED = 42
TARGET_SAMPLE_RATE = 22050  # rate the mel filterbank below is built for
N_MELS = 128
N_FRAMES = 256  # time frames per example; the model sizes its classifier for 128 x 256 inputs

torch.manual_seed(SEED)  # makes DataLoader shuffling repeatable

dataset_path = "../data/RAVDESS"
file_list, labels = load_data(dataset_path)

train_files, test_files, train_labels, test_labels = train_test_split(
    file_list, labels, test_size=0.2, random_state=SEED
)

# torchaudio.load returns audio at the file's native rate, so probe one clip
# and resample to the rate the mel transform is configured for.
# NOTE(review): assumes every clip shares the first file's sample rate - confirm.
_, native_sample_rate = torchaudio.load(train_files[0])
_resample = torchaudio.transforms.Resample(
    orig_freq=native_sample_rate, new_freq=TARGET_SAMPLE_RATE
)
_mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=TARGET_SAMPLE_RATE, n_mels=N_MELS
)


def transform(waveform):
    """Turn a ``(channels, time)`` waveform into an ``(N_MELS, N_FRAMES)`` mel spectrogram.

    Clips differ in length (and possibly channel count), so the default
    DataLoader collation cannot stack their raw spectrograms. Every clip is
    downmixed to mono, resampled, converted to a mel spectrogram and then
    cropped or zero-padded on the right to exactly ``N_FRAMES`` frames.
    """
    mono = waveform.mean(dim=0, keepdim=True)            # (1, time)
    spec = _mel_spectrogram(_resample(mono)).squeeze(0)  # (N_MELS, frames)
    spec = spec[..., :N_FRAMES]
    return nn.functional.pad(spec, (0, N_FRAMES - spec.shape[-1]))


train_dataset = RavdessDataset(train_files, train_labels, transform=transform)
test_dataset = RavdessDataset(test_files, test_labels, transform=transform)

# Batches come out as (batch, N_MELS, N_FRAMES); the training and evaluation
# loops add the channel axis themselves via inputs.unsqueeze(1).
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)



In [None]:
import torch
import torch.nn as nn

from torch.optim.adam import Adam
from sklearn.metrics import accuracy_score

class EmotionRecognitionModel(nn.Module):
    """Small CNN classifier over single-channel 2-D inputs (e.g. mel spectrograms).

    Two Conv-BatchNorm-ReLU-MaxPool stages feed a two-layer fully connected
    head. The loss function and optimizer are stored on the model so that
    ``train_model`` / ``evaluate_model`` are self-contained.

    Args:
        input_shape: ``(height, width)`` of one input example. The first linear
            layer is sized for exactly this shape. Defaults to ``(128, 256)``.
        num_classes: Number of output logits. Defaults to 8 emotions.
        lr: Learning rate for the Adam optimizer. Defaults to 0.001.
    """

    def __init__(self, input_shape=(128, 256), num_classes=8, lr=0.001):
        super().__init__()
        self.input_shape = tuple(input_shape)

        self.feature_extractor = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        )

        self.flattened_size = self._infer_flattened_size()

        self.classifier = nn.Sequential(
            nn.Linear(self.flattened_size, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = Adam(self.parameters(), lr=lr)

    def _infer_flattened_size(self):
        """Return the feature count the conv stack emits for one ``input_shape`` example."""
        probe = torch.zeros(1, 1, *self.input_shape)
        was_training = self.feature_extractor.training
        # Probe in eval mode: a training-mode forward pass would fold this
        # all-zero dummy batch into the BatchNorm running statistics.
        self.feature_extractor.eval()
        try:
            with torch.no_grad():
                return self.feature_extractor(probe).flatten(1).shape[1]
        finally:
            self.feature_extractor.train(was_training)

    def forward(self, x):
        """Map a ``(batch, 1, height, width)`` tensor to ``(batch, num_classes)`` logits."""
        features = self.feature_extractor(x)
        return self.classifier(torch.flatten(features, start_dim=1))

    def train_model(self, train_loader, epochs=10):
        """Train the model using the provided training DataLoader.

        Each batch is expected as ``(inputs, labels)`` with ``inputs`` lacking
        the channel axis; it is inserted here at dimension 1.

        Returns:
            list[float]: Mean training loss of each epoch (``nan`` for an
            epoch that saw no batches).
        """
        self.train()
        epoch_losses = []
        for epoch in range(epochs):
            running_loss = 0.0
            epoch_total = 0.0
            batch_count = 0
            for i, (inputs, labels) in enumerate(train_loader):
                inputs = inputs.unsqueeze(1)

                self.optimizer.zero_grad()
                outputs = self(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()

                batch_loss = loss.item()
                running_loss += batch_loss
                epoch_total += batch_loss
                batch_count += 1
                # Report the average of the last 10 batches, then restart the window.
                if i % 10 == 9:
                    print(f'Epoch [{epoch+1}/{epochs}], Step [{i+1}], Loss: {running_loss / 10:.4f}')
                    running_loss = 0.0

            epoch_losses.append(epoch_total / batch_count if batch_count else float('nan'))

        print('Finished Training')
        return epoch_losses

    def evaluate_model(self, test_loader):
        """Evaluate the model using the provided test DataLoader.

        Prints the accuracy over all batches.

        Returns:
            The value computed by ``accuracy_score`` over all predictions.
        """
        self.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = self(inputs.unsqueeze(1))
                predicted = outputs.argmax(dim=1)
                all_preds.extend(predicted.cpu().tolist())
                all_labels.extend(torch.as_tensor(labels).cpu().tolist())

        accuracy = accuracy_score(all_labels, all_preds)
        print(f'Accuracy: {accuracy:.4f}')
        return accuracy



# Build the classifier with its default configuration, fit it on the
# training batches for 10 epochs, then report accuracy on the held-out split.
model = EmotionRecognitionModel()

model.train_model(train_loader=train_loader, epochs=10)
model.evaluate_model(test_loader=test_loader)


In [None]:
import shap

def _as_model_input(spec, n_frames=256):
    """Coerce one dataset item to the ``(1, n_mels, n_frames)`` layout the model consumes.

    A 3-D item is treated as ``(channels, n_mels, time)`` and averaged over the
    channel axis. The time axis is then cropped or zero-padded on the right to
    ``n_frames`` so that items of different length can be stacked.
    ``n_frames`` defaults to 256, the width the model's classifier is sized for.
    """
    if spec.dim() == 3:
        spec = spec.mean(dim=0)
    spec = spec[..., :n_frames]
    spec = nn.functional.pad(spec, (0, n_frames - spec.shape[-1]))
    return spec.unsqueeze(0)


# Explain the model in inference mode so BatchNorm uses its running statistics.
model.eval()

# Background set: up to 100 training items, stacked to (N, 1, n_mels, n_frames).
n_background = min(100, len(train_dataset))
background = torch.stack(
    [_as_model_input(train_dataset[i][0]) for i in range(n_background)]
)
explainer = shap.DeepExplainer(model, background)

# Single example to explain, with a leading batch axis: (1, 1, n_mels, n_frames).
sample_input, _ = test_dataset[0]
sample_input = _as_model_input(sample_input).unsqueeze(0)
shap_values = explainer.shap_values(sample_input)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Output class whose attributions are shown.
CLASS_INDEX = 0

# Pick the attribution map for CLASS_INDEX without overwriting `shap_values`,
# so this cell can be re-run safely.
# NOTE(review): handles both a list with one array per output and a single
# array with the outputs on the last axis - confirm against the installed
# shap version.
if isinstance(shap_values, list):
    class_attribution = np.asarray(shap_values[CLASS_INDEX])
else:
    class_attribution = np.asarray(shap_values)[..., CLASS_INDEX]
shap_heatmap = class_attribution.squeeze()

# `sample_input` is the mel spectrogram fed to the model, not a raw waveform,
# so it is drawn as an image rather than as a line plot.
mel_spectrogram = sample_input.detach().cpu().squeeze().numpy()

if shap_heatmap.shape != mel_spectrogram.shape:
    raise ValueError(
        f"SHAP map shape {shap_heatmap.shape} does not match "
        f"input shape {mel_spectrogram.shape}"
    )

fig, ax = plt.subplots(figsize=(10, 4))

# Spectrogram in dB as a grayscale backdrop; the small offset avoids log(0)
# on zero-padded frames.
ax.imshow(10 * np.log10(mel_spectrogram + 1e-10), origin='lower', aspect='auto', cmap='gray')

# Symmetric colour limits keep zero attribution at the centre of the colormap.
limit = float(np.abs(shap_heatmap).max()) or 1.0
overlay = ax.imshow(
    shap_heatmap, origin='lower', aspect='auto', alpha=0.5,
    cmap='coolwarm', vmin=-limit, vmax=limit,
)
fig.colorbar(overlay, ax=ax, label='SHAP Value')
ax.set_title('SHAP Heatmap on Mel Spectrogram')
ax.set_xlabel('Time frame')
ax.set_ylabel('Mel bin')
plt.show()