In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [19]:
class MultiLabelMLP(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_classes):
        """
        Initializes the MultiLabelMLP model with multiple hidden layers.

        Parameters:
        - input_size: The size of the input features.
        - hidden_sizes: A list of sizes for each hidden layer.
        - num_classes: The number of classes (output size).
        """
        super(MultiLabelMLP, self).__init__()
        
        # Create the first hidden layer
        self.hidden_layers = nn.ModuleList([nn.Linear(input_size, hidden_sizes[0])])
        
        # Add any additional hidden layers
        layer_sizes = zip(hidden_sizes[:-1], hidden_sizes[1:])
        self.hidden_layers.extend([nn.Linear(h1, h2) for h1, h2 in layer_sizes])
        
        # Create the output layer
        self.output_layer = nn.Linear(hidden_sizes[-1], num_classes)
    
    def forward(self, x):
        # Apply each hidden layer with ReLU activation
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
        
        # Output layer
        x = self.output_layer(x)
        return x

In [20]:


# Assuming MultiLabelMLP is defined as in the previous example

class FrameDataset(Dataset):
    def __init__(self, npz_path):
        self.data = np.load(npz_path)
        self.keys = [k for k in self.data.keys() if k.endswith('_data')]
    
    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        data_key = self.keys[idx]
        data = self.data[data_key]
        labels = self.data[f'{data_key[:-5]}_labels']
        # Assuming data is already normalized and just needs to be reshaped/flattened
        data = data.reshape(data.shape[0], -1)  # Reshape to (n, 96*87)
        return torch.tensor(data, dtype=torch.float32), torch.tensor(labels, dtype=torch.float32)

def train_model(model, dataloader, criterion, optimizer, epochs=5):
    for epoch in range(epochs):
        total_loss = 0
        correct_predictions = 0
        total_predictions = 0
        for data, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            predicted = torch.sigmoid(outputs) > 0.5  # Apply sigmoid and threshold
            correct_predictions += (predicted == labels).float().sum()
            total_predictions += torch.numel(labels)

        accuracy = correct_predictions / total_predictions
        print(f'Epoch {epoch+1}, Loss: {total_loss}, Accuracy: {accuracy.item()}')

npz_path = 'processed_audio.npz'
dataset = FrameDataset(npz_path)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Define model, criterion, and optimizer
input_size = 96 * 87  # Flattened frame size
hidden_sizes = [128, 64]  # Example hidden layer sizes
num_classes = 10  # Adjust based on your label dimensionality
model = MultiLabelMLP(input_size, hidden_sizes, num_classes)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_model(model, dataloader, criterion, optimizer, epochs=10)

RuntimeError: stack expects each tensor to be equal size, but got [3, 8352] at entry 0 and [5, 8352] at entry 1