In [1]:
# Audio / feature-extraction settings
AUDIO_DIR = "/Users/zainhazzouri/projects/Bachelor_Thesis/Data/Kaggle"
SAMPLE_RATE = 22050  # sample rate of the audio file
bit_depth = 16  # bit depth of the audio file
hop_length = 512
n_mfcc = 20  # number of MFCCs features
# window size -- previously written as `n_fft=1024,`; the trailing comma made
# this the one-element tuple (1024,) instead of an integer.
n_fft = 1024
n_mels = 256  # number of mel bands to generate
win_length = None  # window length


# Training parameters
batch_size = 16
learning_rate = 0.001
num_epochs = 20




In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import glob
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import nbimporter
from CNN_Model import AttentionResidualWaveUNet

class SpeechMusicDataset(Dataset):
    """Dataset of audio files labelled as music (0) or speech (1).

    Args:
        music_waves: list of paths to the music recordings (label 0).
        speech_waves: list of paths to the speech recordings (label 1).
        transform: optional callable applied to every loaded waveform.
    """

    def __init__(self, music_waves, speech_waves, transform=None):
        self.music_waves = music_waves
        self.speech_waves = speech_waves
        self.transform = transform
        self.file_list = self.music_waves + self.speech_waves
        # The label is decided by the list a file was passed in, not by
        # searching the path for "music_wav": a substring test mislabels files
        # whenever a parent directory or file name happens to contain (or
        # lack) that text.
        self.labels = [0] * len(self.music_waves) + [1] * len(self.speech_waves)

    def __len__(self):
        """Return the total number of music and speech files."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load the file at position `idx` and return `(waveform, label)`.

        The sample rate returned by `torchaudio.load` is discarded.
        """
        file_path = self.file_list[idx]
        waveform, _ = torchaudio.load(file_path)
        label = self.labels[idx]

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, label

# Preprocessing function for the dataset
def preprocess(waveform, target_length=8000, sample_rate=SAMPLE_RATE, n_mfcc=n_mfcc):
    """Pad or crop a waveform to a fixed number of samples and compute its MFCCs.

    Args:
        waveform: tensor whose dimension 1 is the sample axis; presumably the
            (channels, samples) tensor returned by `torchaudio.load` -- TODO confirm.
        target_length: number of samples kept along dimension 1.
        sample_rate: sample rate handed to the MFCC transform.
        n_mfcc: number of MFCC coefficients to compute.

    Returns:
        The output of `torchaudio.transforms.MFCC` for the fixed-length waveform.
    """
    waveform_length = waveform.size(1)

    if waveform_length < target_length:
        # Zero-pad on the right of the sample axis. F.pad keeps the channel
        # count, dtype and device of the input; the previous
        # torch.zeros(1, n) + torch.cat only worked for single-channel input.
        num_padding = target_length - waveform_length
        waveform = torch.nn.functional.pad(waveform, (0, num_padding))
    elif waveform_length > target_length:
        waveform = waveform[:, :target_length]

    mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc)(waveform)
    return mfcc

# Set device: prefer CUDA, then Apple-silicon MPS, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
# is_available() checks that MPS can actually be used on this machine;
# is_built() only reports that PyTorch was compiled with MPS support.
elif torch.backends.mps.is_available():  # if you have apple silicon mac
    device = "mps"  # if it doesn't work try device = torch.device('mps')
else:
    device = "cpu"
print(f"Using {device}")

# Set the path to the folder containing the music and speech datasets
AUDIO_DIR = "/Users/zainhazzouri/projects/Bachelor_Thesis/Data/Kaggle/"

# Load the dataset
# os.path.join builds the same pattern whether or not AUDIO_DIR ends with a
# slash (plain string concatenation silently matched nothing without it), and
# sorting removes the dependence on the arbitrary order glob returns.
music_waves = sorted(glob.glob(os.path.join(AUDIO_DIR, "music_wav", "*.wav")))
speech_waves = sorted(glob.glob(os.path.join(AUDIO_DIR, "speech_wav", "*.wav")))
transform = preprocess

dataset = SpeechMusicDataset(music_waves, speech_waves, transform=transform)






def pad_waveform(waveform, desired_length):
    """Right-pad the last dimension of `waveform` up to `desired_length`.

    Padding uses `torch.nn.functional.pad` with its default fill. A waveform
    that is already at least `desired_length` long is returned unchanged
    (it is never cropped).
    """
    missing = desired_length - waveform.shape[-1]
    if missing <= 0:
        # Nothing to add: hand back the very same tensor.
        return waveform
    return torch.nn.functional.pad(waveform, (0, missing))






# `dataset` (with `music_waves`, `speech_waves` and `transform`) is already
# built earlier in this cell; the identical second construction was removed.

# Split the dataset into training and validation sets (80% / 20%).
# A seeded generator makes the split identical from run to run.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)








# Create DataLoader: both splits share the batch size; only the training
# split is reshuffled.
loader_options = {"batch_size": batch_size}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

# Initialize model, loss, and optimizer
model = AttentionResidualWaveUNet()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


Using mps


In [5]:
# Training loop: one optimisation pass over `train_loader` per epoch, printing
# the mean of the per-batch losses at the end of each epoch.
for epoch in range(num_epochs):
    print(f"Epoch: {epoch+1}/{num_epochs}")

    model.train()
    running_loss = 0.0

    # tqdm wraps the loader to show a progress bar; the batch index is not
    # needed, so the loader is iterated directly instead of through enumerate.
    for inputs, targets in tqdm(train_loader, desc="Training", ncols=100):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Calculate loss
        loss = criterion(outputs, targets)

        # Backward pass
        loss.backward()

        # Optimize
        optimizer.step()

        # Update loss (.item() extracts the Python float from the loss tensor)
        running_loss += loss.item()

    # Calculate average loss for the epoch (sum of batch losses / number of batches)
    epoch_loss = running_loss / len(train_loader)
    print(f"Loss: {epoch_loss:.4f}")

print("Training finished.")



Epoch: 1/20


Training:   0%|                                                               | 0/7 [00:02<?, ?it/s]


RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 4 but got size 5 for tensor number 1 in the list.

In [10]:
# Evaluation function

# 1. Average validation loss: This metric is calculated using the same loss function (`criterion`) used during training, which is CrossEntropyLoss in this case. The average validation loss is computed by summing the loss returned for each validation batch and then dividing by the number of batches. A lower average validation loss indicates better performance.
#
# 2. Validation accuracy: This metric measures the percentage of correctly classified samples in the validation set. The accuracy is calculated by counting the number of correct predictions, i.e., when the predicted label matches the true label, and then dividing by the total number of validation samples. A higher validation accuracy indicates better performance.
#
# These two metrics together provide a good evaluation of the model's performance on the validation set. The average validation loss helps assess the model's ability to minimize the loss function, while the validation accuracy measures how well the model is classifying the samples.


def evaluate(val_loader, model, criterion, device):
    """Compute the average batch loss and the accuracy of `model` on `val_loader`.

    Args:
        val_loader: sized iterable of `(inputs, targets)` batches.
        model: module called as `model(inputs)`; predicted labels are the
            argmax of its output over dimension 1.
        criterion: callable `criterion(outputs, targets)` returning a scalar
            loss tensor.
        device: device the batches are moved to before the forward pass.

    Returns:
        `(avg_loss, accuracy)` where `avg_loss` is the sum of the per-batch
        losses divided by the number of batches and `accuracy` is the
        percentage of samples whose predicted label equals the target.
        Both values are NaN when the loader yields no samples.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Forward pass
            outputs = model(inputs)

            # Calculate loss
            loss = criterion(outputs, targets)

            # Update loss
            running_loss += loss.item()

            # Calculate accuracy; `.data` is unnecessary inside no_grad()
            predicted = outputs.argmax(dim=1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    # With no samples the metrics are undefined; report NaN instead of
    # failing with ZeroDivisionError.
    if total == 0:
        return float("nan"), float("nan")

    # Calculate average loss and accuracy
    avg_loss = running_loss / len(val_loader)
    accuracy = 100 * correct / total

    return avg_loss, accuracy

# Evaluate the model on the validation split and report both metrics.
validation_metrics = evaluate(val_loader, model, criterion, device)
val_loss, val_accuracy = validation_metrics
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.2f}%")

Validation Loss: 0.7501
Validation Accuracy: 38.46%


In [5]:

# Save the trained model: only the state dict (the weights) is written,
# to a file in the current working directory.
checkpoint_path = "waveunet_speech_music_discrimination.pth"
model_weights = model.state_dict()
torch.save(model_weights, checkpoint_path)
print("Model saved.")




Model saved.


In [6]:
# Show model summary
try:
    from torchsummary import summary
except ImportError:
    print("\nPlease install torchsummary to display the model summary. Use `pip install torchsummary`.")
else:
    print("\nModel summary:")

    # MPS is not supported by torchsummary, so summarise on the CPU in that
    # case. A separate name is used so the global `device` is never changed.
    summary_device = "cpu" if device == "mps" else device
    model.to(summary_device)
    try:
        summary(model, input_size=(1, 40, 431), device=summary_device)
    finally:
        # Always move the model back, even if summary() raises; otherwise the
        # model would stay on the CPU for every later cell.
        model.to(device)


Model summary:
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 40, 20, 216]           1,040
              ReLU-2          [-1, 40, 20, 216]               0
            Conv2d-3          [-1, 80, 10, 108]          80,080
              ReLU-4          [-1, 80, 10, 108]               0
            Conv2d-5           [-1, 160, 5, 54]         320,160
              ReLU-6           [-1, 160, 5, 54]               0
   ConvTranspose2d-7          [-1, 80, 10, 108]         320,080
              ReLU-8          [-1, 80, 10, 108]               0
   ConvTranspose2d-9          [-1, 40, 20, 216]          80,040
             ReLU-10          [-1, 40, 20, 216]               0
  ConvTranspose2d-11          [-1, 40, 40, 432]          40,040
             ReLU-12          [-1, 40, 40, 432]               0
AdaptiveAvgPool2d-13             [-1, 40, 1, 1]               0
           Linear-14   