In [1]:
from src.utils import *

In [2]:
# delete 75% of files in the directory
import os
import random

def delete_files(directory, percentage=0.75):
    files = os.listdir(directory)
    files = random.sample(files, int(len(files) * percentage))
    for file in files:
        os.remove(os.path.join(directory, file))

# delete_files('data/train/audio_yes_no/no', 0.5)
# delete_files('data/train/audio_yes_no/yes', 0.5)

In [3]:
import torch
from transformers import AutoFeatureExtractor, ASTForAudioClassification


model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = ASTForAudioClassification.from_pretrained(model_name)

only_name = model_name.split("/")[-1]   


In [4]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import os

class CustomAudioDataset(Dataset):
    def __init__(self, data_dir, transform=None, fixed_length=None):
        self.data_dir = data_dir
        self.file_list, self.labels = self._get_file_list_and_labels()
        self.transform = transform
        self.fixed_length = fixed_length

    def _get_file_list_and_labels(self):
        file_list = []
        labels = []
        for root, dirs, files in os.walk(self.data_dir):
            for file in files:
                if file.endswith(".wav"):  # Adjust file extension if needed
                    file_list.append(root + "/" + file)
                    labels.append(os.path.basename(root))  # Extract label from directory name
        return file_list, labels

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]
        waveform, sample_rate = torchaudio.load(file_path)
        
        
        
        if self.fixed_length:
            waveform = self._pad_waveform(waveform, self.fixed_length)

        label = self.labels[idx]

        

        if self.transform:
            waveform = self.transform(waveform, sampling_rate=sample_rate)['input_values'][0]

        return waveform, sample_rate, label

    def _pad_waveform(self, waveform, target_length):
        length_diff = target_length - waveform.size(1)
        if length_diff > 0:
            padding = torch.zeros((1, length_diff))
            waveform = torch.cat([waveform, padding], dim=1)
        return waveform.squeeze(0)

# Example usage
data_dir = "data/train/audio_yes_no/"
transform = feature_extractor  # You can define transformations if needed
fixed_length = 16000  # Assuming you want to fix the length to 16000 samples
sampling_rate = fixed_length  # Assuming you want to fix the sampling rate to 16000 Hz

# Create custom dataset
dataset = CustomAudioDataset(data_dir, transform=transform, fixed_length=fixed_length)

batch_size = 4


In [5]:
num_classes = 11
num_ftrs = 768

In [6]:
num_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

In [7]:
device = torch.device('cuda')
device

device(type='cuda')

In [8]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Assuming your model is named 'model'
trainable_params = count_trainable_parameters(model)
print("Number of trainable parameters:", trainable_params)

Number of trainable parameters: 86594063


In [9]:
def freeze_layers_except_last_n(model, n):
    # Get all parameters
    parameters = list(model.parameters())
    total_layers = len(parameters)

    # Freeze all layers except the last n
    for i, param in enumerate(parameters):
        if i < total_layers - n:
            param.requires_grad = False

# Assuming your model is named ast_model
freeze_layers_except_last_n(model.audio_spectrogram_transformer.encoder.layer, 4)

model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (de

In [10]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Assuming your model is named 'model'
trainable_params = count_trainable_parameters(model)
print("Number of trainable parameters:", trainable_params)


Number of trainable parameters: 1542671


In [11]:
train_dataset = CustomAudioDataset(data_dir, fixed_length=16000, transform=feature_extractor)
                                    
perc = 0.05

n_train = len(train_dataset)
n_val = int(perc * n_train)
n_test = n_val//2
# n_val = n_val - n_test
n_train = n_train - n_val


print(n_train, n_val, n_test)

1130 59 29


In [12]:
len(train_dataset)

1189

In [13]:
n_val-n_test, n_test

(30, 29)

In [14]:
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [n_train, n_val])
val_dataset, test_dataset = torch.utils.data.random_split(val_dataset, [n_val-n_test, n_test])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
labels = set(train_dataset.dataset.labels)
label_to_index = dict((label, i) for i, label in enumerate(sorted(labels)))
label_to_index

{'no': 0, 'yes': 1}

In [16]:
def train(model, train_loader, val_loader, num_epochs, optimizer, criterion, device, label_to_index, only_name, log = True, description = ""):
    losses = []
    accuracies = []
    model = model.to(device)
    train_losses = []
    for i, epoch in enumerate(range(num_epochs)):
        epoch_losses = []
        for waveforms, sr, labels in tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}"):
            
            optimizer.zero_grad()
            waveforms = waveforms.to(device)
            outputs = model(waveforms.squeeze(1))

            # Convert string labels to integer indices
            target_indices = [label_to_index[label] for label in labels]

            # Convert the list of indices to a tensor
            target_tensor = torch.tensor(target_indices)

            # print(outputs['logits'])

            loss = criterion(outputs['logits'], target_tensor.to(device))
            epoch_losses.append(loss.item())
            loss.backward()
            optimizer.step()
        

        loss = sum(epoch_losses)/len(epoch_losses)
        train_losses.append(loss)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {loss}")
        torch.cuda.empty_cache()

        
        # predict on validation set
        val_losses = []
        val_accuracies = []
        for waveforms, sr, labels in tqdm(val_loader, total=len(val_loader), desc=f"Validation"):
            waveforms = waveforms.to(device)
            outputs = model(waveforms.squeeze(1))
            target_indices = [label_to_index[label] for label in labels]
            target_tensor = torch.tensor(target_indices)
            
            loss = criterion(outputs['logits'], target_tensor.to(device))
            # print(loss)
            val_losses.append(loss.item())
            val_accuracies.append((outputs['logits'].argmax(1) == target_tensor.to(device)).float())
        # break
        # print(val_losses)
        # print(val_accuracies)
        val_losses = [l for l in val_losses]
        # print(val_accuracies)
        val_len = [len(a) for a in val_accuracies]
        val_accuracies = [a.sum().item() for a in val_accuracies]
        
        # print(val_accuracies, val_len)
        

        loss = sum(val_losses)/len(val_losses)
        accuracy = sum(val_accuracies)/sum(val_len)

        losses.append(loss)
        accuracies.append(accuracy)

        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {loss}, Validation Accuracy: {accuracy}")
        torch.cuda.empty_cache()



    if log:
        date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir = f"logs/{only_name}-{date}"
        if description != "":
            log_dir += f"_{description}"
        os.makedirs(log_dir)
        # save acc and loss
        with open(f"{log_dir}/val_acc.txt", "w") as f:
            f.write(str(accuracies))
        with open(f"{log_dir}/val_loss.txt", "w") as f:
            f.write(str(losses))
        with open(f"{log_dir}/train_loss.txt", "w") as f:
            f.write(str(train_losses))

        return log_dir

In [18]:
log_dir = train(model, train_loader, val_loader, num_epochs, optimizer, criterion, device, label_to_index, only_name, log=True, description="testYesNoSmall_0")

Epoch 1/10:   0%|          | 0/283 [00:00<?, ?it/s]

Epoch 1/10, Train Loss: 0.3843248724039025


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1/10, Validation Loss: 0.39477044773229863, Validation Accuracy: 0.8666666666666667


Epoch 2/10:   0%|          | 0/283 [00:00<?, ?it/s]

Epoch 2/10, Train Loss: 0.15384972734997326


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 2/10, Validation Loss: 0.01688660568834166, Validation Accuracy: 1.0


Epoch 3/10:   0%|          | 0/283 [00:00<?, ?it/s]

Epoch 3/10, Train Loss: 0.07228399301972332


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 3/10, Validation Loss: 0.023325057642068714, Validation Accuracy: 1.0


Epoch 4/10:   0%|          | 0/283 [00:00<?, ?it/s]

Epoch 4/10, Train Loss: 0.0686828200041836


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 4/10, Validation Loss: 0.050660979313761345, Validation Accuracy: 0.9666666666666667


Epoch 5/10:   0%|          | 0/283 [00:00<?, ?it/s]

Epoch 5/10, Train Loss: 0.04098724722319375


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 5/10, Validation Loss: 0.08141579715083935, Validation Accuracy: 0.9666666666666667


Epoch 6/10:   0%|          | 0/283 [00:00<?, ?it/s]

Epoch 6/10, Train Loss: 0.04851578006364659


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 6/10, Validation Loss: 0.016466778015228556, Validation Accuracy: 1.0


Epoch 7/10:   0%|          | 0/283 [00:00<?, ?it/s]

Epoch 7/10, Train Loss: 0.07458274154657729


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 7/10, Validation Loss: 0.08525022688263562, Validation Accuracy: 0.9666666666666667


Epoch 8/10:   0%|          | 0/283 [00:00<?, ?it/s]

Epoch 8/10, Train Loss: 0.07308713848095709


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 8/10, Validation Loss: 0.08335635418563925, Validation Accuracy: 0.9666666666666667


Epoch 9/10:   0%|          | 0/283 [00:00<?, ?it/s]

Epoch 9/10, Train Loss: 0.0565040046773039


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 9/10, Validation Loss: 0.012776518699240569, Validation Accuracy: 1.0


Epoch 10/10:   0%|          | 0/283 [00:00<?, ?it/s]

Epoch 10/10, Train Loss: 0.05822439122068698


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 10/10, Validation Loss: 0.009345623905517186, Validation Accuracy: 1.0


In [20]:
def test(model, test_loader, criterion, device, label_to_index, only_name, log_dir, description = "", log = True):
    model.eval()
    losses = []
    accuracies = []
    for waveforms, sr, labels in tqdm(test_loader, total=len(test_loader), desc="Testing"):
        waveforms = waveforms.to(device)
        outputs = model(waveforms.squeeze(1))
        target_indices = [label_to_index[label] for label in labels]
        target_tensor = torch.tensor(target_indices)
        loss = criterion(outputs['logits'], target_tensor.to(device))
        losses.append(loss.item())
        accuracies.append((outputs['logits'].argmax(1) == target_tensor.to(device)).float())
    
    lens = [len(a) for a in accuracies]
    accuracies = [a.sum().item() for a in accuracies]
    
    # print(val_accuracies, val_len)
    

    loss = sum(losses)/len(losses)
    accuracy = sum(accuracies)/sum(lens)

    if log:
        if description != "":
            log_dir += f"_{description}"
       
        try:    
            os.makedirs(log_dir)
        except:
            print("Directory already exists")
        # save acc and loss
        with open(f"{log_dir}/test_acc.txt", "w") as f:
            f.write(str(accuracies))
        with open(f"{log_dir}/test_loss.txt", "w") as f:
            f.write(str(losses))


    print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")
    return accuracy

test(model, test_loader, criterion, device, label_to_index, only_name, log_dir)

Testing:   0%|          | 0/8 [00:00<?, ?it/s]

Directory already exists
Test Loss: 0.5217548457974317, Test Accuracy: 0.9310344827586207


0.9310344827586207