In [1]:
import os
import json
import librosa
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
from torch.utils.data import Dataset, DataLoader

In [3]:
class AudioMNISTDataset(Dataset):
    def __init__(self, data_path):
        self.data_path = data_path
        self.file_list = os.listdir(self.data_path)
        
    def __len__(self):
        return len(self.file_list)
    
    def __getitem__(self, idx):
        file_name = os.path.join(self.data_path, self.file_list[idx])
        data = np.load(file_name)
        
        # print(self.file_list[idx])
        label, subfolder_index, data_index = self.file_list[idx].split("_")
        
        # Extracting data
        metadata = data["metadata"]
        audio = data["audio"]
        mel_spec = data["mel_spec"]
        mel_spec_db = data["mel_spec_db"]

        # Assuming your data is a numpy array, you can convert it to a PyTorch tensor
        tensor_audio = torch.from_numpy(audio).reshape(-1, 1)
        tensor_meta = torch.tensor(metadata).reshape(-1, 1)
        tensor_label = torch.nn.functional.one_hot(torch.tensor(int(label)), num_classes=10).reshape(-1, 1)
        tensor_mel_spec = torch.tensor(mel_spec)
        tensor_mel_spec_db = torch.tensor(mel_spec_db)
        return tensor_audio, tensor_meta, tensor_label, tensor_mel_spec, tensor_mel_spec_db


In [4]:
dataset = AudioMNISTDataset(data_path="./AudioMNIST/preprocessed_data")

In [5]:
# Create a DataLoader for batching and shuffling
batch_size = 32
shuffle = True

In [6]:
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

### **Baseline Model**

In [32]:
# Define the Generator
class Generator(nn.Module):
    def __init__(self, noise_dim, metadata_dim, audio_dim):
        super(Generator, self).__init__()
        self.noise_dim = noise_dim
        self.metadata_dim = metadata_dim
        self.audio_dim = audio_dim

        # Generator layers
        self.fc1 = nn.Linear(noise_dim + metadata_dim, 128)
        self.fc2 = nn.Linear(128, audio_dim)

    def forward(self, noise, metadata):
        # Concatenate noise and metadata
        x = torch.cat((noise, metadata), dim=1)
        # Pass through generator layers
        x = torch.relu(self.fc1(x))
        x = torch.tanh(self.fc2(x))  # Use tanh activation for audio generation
        return x

In [33]:
# Define the Discriminator
class Discriminator(nn.Module):
    def __init__(self, audio_dim):
        super(Discriminator, self).__init__()
        self.audio_dim = audio_dim

        # Discriminator layers for real/fake classification
        self.fc1 = nn.Linear(audio_dim, 128)
        self.fc2 = nn.Linear(128, 1)

        # Discriminator layers for classifying the generated number
        self.fc3 = nn.Linear(audio_dim, 128)
        self.fc4 = nn.Linear(128, 10)  # Output is 10 classes (0-9)

    def forward(self, audio):
        # Discriminator for real/fake classification
        # print(audio.shape)
        audio = torch.transpose(audio, 1, 2)
        x1 = torch.relu(self.fc1(audio))
        validity = torch.sigmoid(self.fc2(x1))

        # Discriminator for classifying the generated number
        x2 = torch.relu(self.fc3(audio))
        generated_number = torch.softmax(self.fc4(x2), dim=1)

        return validity, generated_number

In [104]:
# Define the training function for GAN
def train_gan(generator, discriminator, data_loader, num_epochs, noise_dim, metadata_dim, device):
    for epoch in range(num_epochs):
        for i, (real_audio, meta, labels, mel_spec, mel_spec_db) in enumerate(data_loader):
            # Move data to device
            real_audio = real_audio.to(device)
            labels = labels.to(device)

            # Generate random noise and metadata
            noise = torch.randn(real_audio.size(0), noise_dim, device=device)
            metadata = torch.randint(0, 10, (real_audio.size(0), metadata_dim), dtype=torch.float32, device=device)

            # Generate fake audio samples
            generated_audio = generator(noise, metadata)

            # Train Discriminator
            optimizer_D.zero_grad()

            # Real audio
            real_labels = torch.ones(real_audio.size(0), 1, device=device)
            real_validity, real_generated_number = discriminator(real_audio)
            
            # Squeeze the tensor
            real_validity = torch.squeeze(real_validity, dim=-1)
            # print(real_validity.shape)
            
            d_loss_real = adversarial_loss(real_validity, real_labels)
            # print(real_generated_number.shape)
            # print(labels.shape)
            real_generated_number = torch.transpose(real_generated_number, 1, 2)
            d_loss_real_classification = classification_loss(real_generated_number, labels.float())

            # Fake audio
            fake_labels = torch.zeros(real_audio.size(0), 1, device=device)

            generated_audio = torch.unsqueeze(generated_audio, dim=-1)
            fake_validity, fake_generated_number = discriminator(generated_audio.detach())
            
            # Squeeze the tensor
            fake_validity = torch.squeeze(fake_validity, dim=-1)
            
            d_loss_fake = adversarial_loss(fake_validity, fake_labels)
            
            fake_generated_number = torch.transpose(fake_generated_number, 1, 2)
            d_loss_fake_classification = classification_loss(fake_generated_number, labels.float())

            d_loss = d_loss_real + d_loss_fake + d_loss_real_classification + d_loss_fake_classification
            d_loss.backward()
            optimizer_D.step()

            # Train Generator
            optimizer_G.zero_grad()
            validity, generated_number = discriminator(generated_audio)
            validity = torch.squeeze(validity, dim=-1)
            
            g_loss = adversarial_loss(validity, real_labels)
            
            generated_number = torch.transpose(generated_number, 1, 2)
            # g_loss_classification = classification_loss(generated_number, labels.float())

            # g_loss_total = g_loss + g_loss_classification
            g_loss_total = g_loss
            g_loss_total.backward()
            optimizer_G.step()

            # Print training progress
            if i % 100 == 0:
                print(
                    "[Epoch %d/%d] [Batch %d/%d] [D loss: %.4f] [G loss: %.4f]" # [D classification loss: %.4f] [G classification loss: %.4f]"
                    % (epoch, num_epochs, i, len(data_loader), d_loss.item(), g_loss.item()) #, d_loss_real_classification.item(), g_loss_classification.item())
                )
                

In [105]:
# Define hyperparameters
input_size = 8  # Size of input noise vector
noise_dim = input_size
metadata_size = 5  # Size of metadata array
metadata_dim = metadata_size
output_size = dataset[0][0].shape[0]  # Size of output audio vector
num_epochs = 2
batch_size = 64
learning_rate = 0.0002
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [106]:
# Initialize models
generator = Generator(input_size, metadata_size, output_size).to(device)
discriminator = Discriminator(output_size).to(device)


In [107]:
adversarial_loss = nn.BCELoss()  # Binary cross-entropy loss for real/fake classification
classification_loss = nn.CrossEntropyLoss()  # Cross-entropy loss for classifying the generated number

optimizer_G = optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

In [108]:
train_gan(generator, discriminator, dataloader, num_epochs, noise_dim, metadata_dim, device)

[Epoch 0/2] [Batch 0/938] [D loss: 5.9786] [G loss: 9.6873]
[Epoch 0/2] [Batch 100/938] [D loss: 5.2992] [G loss: 7.6101]
[Epoch 0/2] [Batch 200/938] [D loss: 7.1094] [G loss: 28.0421]
[Epoch 0/2] [Batch 300/938] [D loss: 5.2196] [G loss: 25.0088]
[Epoch 0/2] [Batch 400/938] [D loss: 5.1895] [G loss: 6.8425]
[Epoch 0/2] [Batch 500/938] [D loss: 5.1622] [G loss: 5.8792]
[Epoch 0/2] [Batch 600/938] [D loss: 5.1313] [G loss: 5.1715]
[Epoch 0/2] [Batch 700/938] [D loss: 5.0916] [G loss: 5.2993]
[Epoch 0/2] [Batch 800/938] [D loss: 5.0500] [G loss: 5.3134]
[Epoch 0/2] [Batch 900/938] [D loss: 5.0136] [G loss: 5.4749]
[Epoch 1/2] [Batch 0/938] [D loss: 4.9912] [G loss: 6.1473]
[Epoch 1/2] [Batch 100/938] [D loss: 4.9618] [G loss: 5.3526]
[Epoch 1/2] [Batch 200/938] [D loss: 4.9384] [G loss: 7.8755]
[Epoch 1/2] [Batch 300/938] [D loss: 4.9436] [G loss: 25.8652]
[Epoch 1/2] [Batch 400/938] [D loss: 4.9574] [G loss: 7.7471]
[Epoch 1/2] [Batch 500/938] [D loss: 4.9199] [G loss: 7.1869]
[Epoch 1/

### **Upgrade1: Dynamic Time Warping Loss**

In [122]:
from fastdtw import fastdtw

In [123]:
class DynamicTimeWarpingLoss(torch.nn.Module):
    def __init__(self):
        super(DynamicTimeWarpingLoss, self).__init__()

    def forward(self, generated_audio, real_audio):
        # Convert torch tensors to numpy arrays
        generated_audio_np = generated_audio.detach().cpu().numpy().squeeze()
        real_audio_np = real_audio.detach().cpu().numpy().squeeze()
        
        # Compute DTW distance
        dtw_distance, _ = fastdtw(generated_audio_np, real_audio_np)
        
        # Convert distance to torch tensor
        dtw_distance_tensor = torch.tensor(dtw_distance, dtype=torch.float32, device=generated_audio.device)
        
        return dtw_distance_tensor

In [124]:
# Example usage:
loss_fn = DynamicTimeWarpingLoss()

# Generate example sequences x and y
x = torch.randn(100)  # Example sequence 1
y = torch.randn(150)  # Example sequence 2

# Compute DTW loss
dtw_loss = loss_fn(x, y)
print("DTW Loss:", dtw_loss.item())

DTW Loss: 85.52569580078125


In [125]:
# Define the Generator
class GeneratorDTW(nn.Module):
    def __init__(self, noise_dim, metadata_dim, audio_dim):
        super(GeneratorDTW, self).__init__()
        self.noise_dim = noise_dim
        self.metadata_dim = metadata_dim
        self.audio_dim = audio_dim

        # Generator layers
        self.fc1 = nn.Linear(noise_dim + metadata_dim, 128)
        self.fc2 = nn.Linear(128, audio_dim)

    def forward(self, noise, metadata):
        # Concatenate noise and metadata
        x = torch.cat((noise, metadata), dim=1)
        # Pass through generator layers
        x = torch.relu(self.fc1(x))
        x = torch.tanh(self.fc2(x))  # Use tanh activation for audio generation
        return x

In [126]:
# Define the Discriminator
class DiscriminatorDTW(nn.Module):
    def __init__(self, audio_dim):
        super(DiscriminatorDTW, self).__init__()
        self.audio_dim = audio_dim

        # Discriminator layers for real/fake classification
        self.fc1 = nn.Linear(audio_dim, 128)
        self.fc2 = nn.Linear(128, 1)

        # Discriminator layers for classifying the generated number
        self.fc3 = nn.Linear(audio_dim, 128)
        self.fc4 = nn.Linear(128, 10)  # Output is 10 classes (0-9)

    def forward(self, audio):
        # Discriminator for real/fake classification
        # print(audio.shape)
        audio = torch.transpose(audio, 1, 2)
        x1 = torch.relu(self.fc1(audio))
        validity = torch.sigmoid(self.fc2(x1))

        # Discriminator for classifying the generated number
        x2 = torch.relu(self.fc3(audio))
        generated_number = torch.softmax(self.fc4(x2), dim=1)

        return validity, generated_number

In [142]:
# Define the training function for GAN
def train_gan_dtw(generator, discriminator, data_loader, num_epochs, noise_dim, metadata_dim, device):
    for epoch in range(num_epochs):
        for i, (real_audio, meta, labels, mel_spec, mel_spec_db) in enumerate(data_loader):
            # Move data to device
            real_audio = real_audio.to(device)
            labels = labels.to(device)

            # Generate random noise and metadata
            noise = torch.randn(real_audio.size(0), noise_dim, device=device)
            metadata = torch.randint(0, 10, (real_audio.size(0), metadata_dim), dtype=torch.float32, device=device)

            # Generate fake audio samples
            generated_audio = generator(noise, metadata)

            # Train Discriminator
            optimizer_D.zero_grad()

            # Real audio
            real_labels = torch.ones(real_audio.size(0), 1, device=device)
            real_validity, real_generated_number = discriminator(real_audio)
            
            # Squeeze the tensor
            real_validity = torch.squeeze(real_validity, dim=-1)
            # print(real_validity.shape)
            
            d_loss_real = adversarial_loss(real_validity, real_labels)
            # print(real_generated_number.shape)
            # print(labels.shape)
            real_generated_number = torch.transpose(real_generated_number, 1, 2)
            d_loss_real_classification = classification_loss(real_generated_number, labels.float())

            # Fake audio
            fake_labels = torch.zeros(real_audio.size(0), 1, device=device)

            generated_audio = torch.unsqueeze(generated_audio, dim=-1)
            fake_validity, fake_generated_number = discriminator(generated_audio.detach())
            
            # Squeeze the tensor
            fake_validity = torch.squeeze(fake_validity, dim=-1)
            
            d_loss_fake = adversarial_loss(fake_validity, fake_labels)
            
            fake_generated_number = torch.transpose(fake_generated_number, 1, 2)
            d_loss_fake_classification = classification_loss(fake_generated_number, labels.float())

            d_loss = d_loss_real + d_loss_fake + d_loss_real_classification + d_loss_fake_classification
            d_loss.backward()
            optimizer_D.step()

            # Train Generator
            optimizer_G.zero_grad()
            validity, generated_number = discriminator(generated_audio)
            validity = torch.squeeze(validity, dim=-1)
            
            # print(generated_audio.shape)
            # print(real_audio.shape)
            
            g_loss = g_adversarial_loss(generated_audio, real_audio)
            
            generated_number = torch.transpose(generated_number, 1, 2)
            # g_loss_classification = classification_loss(generated_number, labels.float())

            # g_loss_total = g_loss + g_loss_classification
            g_loss_total = g_loss
            g_loss_total.backward()
            optimizer_G.step()

            # Print training progress
            if i % 100 == 0:
                print(
                    "[Epoch %d/%d] [Batch %d/%d] [D loss: %.4f] [G loss: %.4f]" # [D classification loss: %.4f] [G classification loss: %.4f]"
                    % (epoch, num_epochs, i, len(data_loader), d_loss.item(), g_loss.item()) #, d_loss_real_classification.item(), g_loss_classification.item())
                )
                

In [143]:
# Define hyperparameters
input_size = 8  # Size of input noise vector
noise_dim = input_size
metadata_size = 5  # Size of metadata array
metadata_dim = metadata_size
output_size = dataset[0][0].shape[0]  # Size of output audio vector
num_epochs = 2
batch_size = 64
learning_rate = 0.0002
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [144]:
# Initialize models
generator = GeneratorDTW(input_size, metadata_size, output_size).to(device)
discriminator = DiscriminatorDTW(output_size).to(device)


In [145]:
adversarial_loss = nn.BCELoss()  # Binary cross-entropy loss for real/fake classification\
g_adversarial_loss = DynamicTimeWarpingLoss()
classification_loss = nn.CrossEntropyLoss()  # Cross-entropy loss for classifying the generated number

optimizer_G = optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

In [146]:
train_gan_dtw(generator, discriminator, dataloader, num_epochs, noise_dim, metadata_dim, device)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn