In [20]:
from src.utils import *

In [21]:
# delete 75% of files in the directory
import os
import random

def delete_files(directory, percentage=0.75):
    files = os.listdir(directory)
    files = random.sample(files, int(len(files) * percentage))
    for file in files:
        os.remove(os.path.join(directory, file))

# delete_files('data/train/audio_yes_no/no', 0.5)
# delete_files('data/train/audio_yes_no/yes', 0.5)

In [22]:
model_name = "LSTM/custom_LSTM"


In [23]:
import torch
import torchaudio
import torchaudio.transforms as transforms

class MFCC(torch.nn.Module):
    def __init__(self, sample_rate=16000, n_mfcc=13, melkwargs=None):
        super(MFCC, self).__init__()
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
        self.melkwargs = melkwargs if melkwargs else {}

        self.mfcc_transform = transforms.MFCC(
            sample_rate=self.sample_rate,
            n_mfcc=self.n_mfcc,
            melkwargs=self.melkwargs
        )

    def forward(self, waveform):
        """
        Args:
            waveform (torch.Tensor): Tensor of audio of shape (channel, time)

        Returns:
            torch.Tensor: MFCC of audio of shape (channel, n_mfcc, time).
        """
        mfcc = self.mfcc_transform(waveform)
        # MFCC transformation yields (channel, n_mfcc, time) tensor
        return mfcc[0]

# Example Usage:
sample_rate = 16000
n_mfcc = 13
melkwargs = {'n_fft': 400, 'hop_length': 160, 'n_mels': 23}
feature_extractor = MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc, melkwargs=melkwargs)




In [24]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import os

class CustomAudioDataset(Dataset):
    def __init__(self, data_dir, transform=None, fixed_length=None):
        self.data_dir = data_dir
        self.file_list, self.labels = self._get_file_list_and_labels()
        self.transform = transform
        self.fixed_length = fixed_length

    def _get_file_list_and_labels(self):
        file_list = []
        labels = []
        for root, dirs, files in os.walk(self.data_dir):
            for file in files:
                if file.endswith(".wav"):  # Adjust file extension if needed
                    file_list.append(root + "/" + file)
                    labels.append(os.path.basename(root))  # Extract label from directory name
        return file_list, labels

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]
        waveform, sample_rate = torchaudio.load(file_path)
        
        
        
        if self.fixed_length:
            waveform = self._pad_waveform(waveform, self.fixed_length)

        label = self.labels[idx]

        

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, sample_rate, label

    def _pad_waveform(self, waveform, target_length):
        length_diff = target_length - waveform.size(1)
        if length_diff > 0:
            padding = torch.zeros((1, length_diff))
            waveform = torch.cat([waveform, padding], dim=1)
        return waveform

# Example usage
data_dir = "data/train/audio_small/"
# data_dir = "data/train/audio_yes_no/"
transform = feature_extractor  # You can define transformations if needed
fixed_length = 16000  # Assuming you want to fix the length to 16000 samples
sampling_rate = fixed_length  # Assuming you want to fix the sampling rate to 16000 Hz




batch_size = 4


data_loader = DataLoader(
    CustomAudioDataset(data_dir, transform=transform, fixed_length=fixed_length),
    batch_size=batch_size,
    shuffle=True
)

next(iter(data_loader))[0].shape

torch.Size([4, 13, 101])

In [25]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, use_conv=False):
        super(LSTMClassifier, self).__init__()
        self.use_conv = use_conv

        lstm_input_size = 101
        if use_conv:
            self.conv_layers = nn.Sequential(
                nn.Conv1d(in_channels=input_size, out_channels=32, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=2, stride=2),
                nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=2, stride=2)
            )
            # Calculate the input size for the LSTM based on the output size of the last conv layer
            input_size = lstm_input_size//4

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        if self.use_conv:
            x = self.conv_layers(x)

        # print(x.shape)

        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Take the last output of the sequence
        out = self.fc(lstm_out)
        return out

def create_lstm_classifier(input_size, hidden_size, num_layers, num_classes, use_conv=False):
    model = LSTMClassifier(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes, use_conv=use_conv)
    return model

In [26]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def freeze_layers_except_last_n(model, n):
    # Get all parameters
    parameters = list(model.parameters())
    total_layers = len(parameters)

    # Freeze all layers except the last n
    for i, param in enumerate(parameters):
        if i < total_layers - n:
            param.requires_grad = False




In [27]:
num_epochs = 10
perc = 0.05
num_classes = 2 if data_dir == "data/train/audio_yes_no/" else 11

In [28]:
import random
import numpy as np

use_convs_ = [False, True]
num_layers_ = [2, 4, 6, 10]
hidden_sizes_ = [32, 64, 128]

for use_conv in tqdm(use_convs_, desc='use_conv loop'):
    for num_layers in tqdm(num_layers_, desc= 'num_layers loop'):
        for hidden_size in tqdm(hidden_sizes_, desc= 'hidden_sizes loop'):


            for i in tqdm(np.arange (0, 5, 1), desc='Training loop (5 times)'):
            
                random.seed(int(i))
                torch.manual_seed(i)
                torch.cuda.manual_seed(i)
                torch.cuda.manual_seed_all(i)

                
            
                model = create_lstm_classifier(input_size=13, hidden_size=128, num_layers=2, num_classes=num_classes, use_conv=True)

                only_name = model_name.split("/")[-1]   


                optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
                criterion = torch.nn.CrossEntropyLoss()

                device = torch.device('cuda')
                # Assuming your model is named ast_model
                

                model.to(device)

                train_dataset = CustomAudioDataset(data_dir, fixed_length=16000, transform=feature_extractor)
                                                


                n_train = len(train_dataset)
                n_val = int(perc * n_train)
                n_test = n_val//2
                n_train = n_train - n_val

                train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [n_train, n_val], generator=torch.Generator().manual_seed(42))
                val_dataset, test_dataset = torch.utils.data.random_split(val_dataset, [n_val-n_test, n_test], generator=torch.Generator().manual_seed(42))

                train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
                test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
                

                labels = set(train_dataset.dataset.labels)
                label_to_index = dict((label, i) for i, label in enumerate(sorted(labels)))
                name = data_dir.split("/")[-2]
                # print(name)
                log_dir = train(model, train_loader, val_loader, num_epochs, optimizer, criterion, device, label_to_index, only_name, log=True, description=f"test_{name}_useConv_{str(use_conv)}_numLstmLayers_{num_layers}_hidden_sizes_{hidden_size}_{i}", lstm=True)
                test(model, test_loader, criterion, device, label_to_index, only_name, log_dir, lstm=True)



use_conv loop:   0%|          | 0/2 [00:00<?, ?it/s]

num_layers loop:   0%|          | 0/4 [00:00<?, ?it/s]

hidden_sizes loop:   0%|          | 0/3 [00:00<?, ?it/s]

Training loop (5 times):   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/10:   0%|          | 0/6187 [00:00<?, ?it/s]

Epoch 1/10, Train Loss: 1.263838587377262


Validation:   0%|          | 0/163 [00:00<?, ?it/s]

Epoch 1/10, Validation Loss: 0.826222360819761, Validation Accuracy: 0.7235023041474654


Epoch 2/10:   0%|          | 0/6187 [00:00<?, ?it/s]