### Set Seed

In [1]:
import random
import torch
import numpy as np

def set_seed(seed_value=42):
    """Set seed for reproducibility for PyTorch and NumPy.

    Args:
        seed_value (int): The seed value to set for random number generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    # Additional steps for deterministic behavior
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set the seed
set_seed(42)  # You can replace 42 with any other seed value of your choice

### Create a Dataset class that returns the raw audio and labels (as tensors) in a single dictonary

In [24]:
from torch.utils.data import Dataset
import librosa
import pandas as pd
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

#import wav2vec model and processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

class CustomAudioDataset(Dataset):
    def __init__(self, csv_path, processor):
        self.dataframe = pd.read_csv(csv_path)
        self.processor = processor

        # Extract column names for labels dynamically
        self.label_columns = list(self.dataframe.columns[:-1])  #Exclude first two columns since these are irrelevant

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        print(f"Index: {idx}")
        audio_path = self.dataframe.iloc[idx]['mp3_path']
        
        # Select label columns based on the dynamically created list. This is grabbing all 188 class label names and converting to tensors.
        labels = torch.tensor(self.dataframe.iloc[idx][self.label_columns], dtype=torch.float32)
        
        # try:
        # Load raw audio data using librosa
        audio_data, _ = librosa.load(audio_path, sr=16000, mono=True, res_type="kaiser_fast")
        
        #Use processor to process audio file and return tensor of input values for model
        input_tensors = self.processor(audio_data, return_tensors="pt", sampling_rate=16000).input_values
        # except:
        #     print(f'Error loading audio file: {audio_path}')
        #     raise
        # Return a dictionary with input data and labels
        return {'labels': labels, 'embeddings': input_tensors}

csv_path_train = 'train_example.csv'
csv_path_val = 'valid_example.csv'
train_example = CustomAudioDataset(csv_path=csv_path_train, processor=processor)
val_example = CustomAudioDataset(csv_path=csv_path_val, processor=processor)
embedding_size = train_example[0]['embeddings'].shape[1] #make sure that we can easily modify embedding size input to model

print(f'We have {len(train_example)} training examples and {len(val_example)} validation examples.')
print(f'The size of our audio embeddings is {embedding_size}.')

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Index: 0
We have 50 training examples and 50 validation examples.
The size of our audio embeddings is 465984.


In [3]:
import librosa

# Replace 'path_to_your_audio_file.wav' with the path to one of your audio files
#audio_path = 'train/aba_structure-epic-01-deep_step-291-320.wav'
audio_path = 'train/aba_structure-epic-01-deep_step-320-349.wav'

try:
    audio_data, _ = librosa.load(audio_path, sr=16000)
    print("Successfully loaded audio file.")
except Exception as e:
    print(f"Error loading audio file: {e}")


Successfully loaded audio file.


In [4]:
#print the labels of the first five training examples

for i in range(5):
    print(f'Labels for training example {i}: {train_example[i]["labels"]}')
    

Labels for training example 0: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Labels for training example 1: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0

### Model Architecture. We first use the wav2vec autoencoder to generate audio embeddings. Then we add a few FF layers. Finally, we add a softmax layer for the output since we are doing multi-class classification.

In [9]:
# # #This is just boilerplate code. We can update this if we want to make it deeper etc.
# import torch.nn as nn

# class CustomAudioModel(nn.Module):
#     def __init__(self, dataset, ff_input_size=embedding_size, ff_output_size=64, num_classes=188):
#         super(CustomAudioModel, self).__init__()
        
#         self.dataset = dataset

#         # Define custom feed-forward layers
#         self.fc1 = nn.Linear(ff_input_size*len(self.dataset), ff_output_size)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(ff_output_size, num_classes)
#         #self.sigmoid = nn.Sigmoid()

#     def forward(self, embeddings, labels=None):
#         # Apply custom feed-forward layers directly to the input_values
#         #embeddings = torch.cat([batch['embeddings'] for batch in train_example], dim=1) #concatenate the embeddings
# #         embeddings = self.dataset['embeddings']
#         print("Input Shape:", embeddings.shape)
        
#         embeddings = embeddings.view(embeddings.size(0), -1) #flatten the embeddings
#         x = self.fc1(embeddings)
#         x = self.relu(x)
#         x = self.fc2(x)

#         if labels is not None:
#             # Calculate the loss if labels are provided
#             # Assuming you are using binary cross-entropy loss
#             loss_fn = nn.BCEWithLogitsLoss()
#             loss = loss_fn(x, labels)
#             return loss
#         else:
#             return x
        
# # class CustomAudioModel(nn.Module):
# #     def __init__(self, dataset, ff_input_size=embedding_size, ff_output_size=64, num_classes=188):
# #         super(CustomAudioModel, self).__init__()
        
# #         self.dataset = dataset

# #         # Define custom feed-forward layers
# #         self.fc1 = nn.Linear(ff_input_size, ff_output_size)
# #         self.relu = nn.ReLU()
# #         self.fc2 = nn.Linear(ff_output_size, num_classes)

# #     def forward(self, embeddings):
        
# #         embeddings = self.dataset['embeddings']  # Grab the audio embeddings from the inputs dictionary
# #         # Flatten the embeddings
# #         embeddings = embeddings.view(embeddings.size(0), -1)

# #         # Apply custom feed-forward layers directly to the flattened embeddings
# #         x = self.fc1(embeddings)
# #         x = self.relu(x)
# #         x = self.fc2(x)

# #         return x

# #     from transformers import Trainer, TrainingArguments
# # from torch.optim import Adam
# # import torch.nn.functional as F
# # #%pip install "transformers[torch]"

# # Instantiate the model
# #model = CustomAudioModel(ff_input_size=embedding_size, ff_output_size=64, num_classes=188) 

# # # #These need to be updated
# # # csv_path_train = 'data.csv'
# # # csv_path_val = 'data.csv'

# # # #Creating train and validate datasets
# # # train_dataset = CustomAudioDataset(csv_path_train, processor)
# # # val_dataset = CustomAudioDataset(csv_path_val, processor)

# # # Loss function for multi-label classification
# # # def compute_loss(model, inputs):
# # #     # Your custom loss calculation goes here
# # #     logits = model(inputs['input'])
# # #     loss = F.binary_cross_entropy_with_logits(logits, inputs['labels']) #appropiate loss function for multi-label classification
# # #     return loss

# # #Loss function for binary, multi-label classification

# # #loss_fn = nn.BCEWithLogitsLoss() #appropiate loss function for multi-label classification where each label is binary. 

# # # Optimizer
# # optimizer = Adam(model.parameters(), lr=.001)

# # Training arguments -- these need to be adjusted
# # training_args = TrainingArguments(
# #     output_dir='./results',                     # output directory
# #     num_train_epochs=3,                         # total number of training epochs
# #     per_device_train_batch_size=32,             # batch size per device during training
# #     per_device_eval_batch_size=32,              # batch size per device during eval
# #     #weight_decay=0.01,                         # regularization parameter
# #     logging_dir='./logs',                       # directory for storing logs
# #     logging_steps=10,                           # number of steps before logging
# #     evaluation_strategy="steps",                # evaluate every eval_steps
# #     eval_steps=50,                              # number of steps before evaluating
# #     save_total_limit=2,                         # limit the total amount of checkpoints. Deletes the older checkpoints.
# #     save_steps=500,                             # number of updates steps before checkpoint saves                       
# # )

# # # Trainer instance
# # trainer = Trainer(
# #     model=model,
# #     args=training_args,
# #     train_dataset=train_example,
# #     eval_dataset=val_example,
# #     #compute_loss=loss_fn,
# #     #optimizer=optimizer
# # )

# # # Train the model
# # trainer.train()


In [27]:
import torch.nn as nn

class CustomAudioModel(nn.Module):
    def __init__(self, input_size=768, hidden_size=64, num_classes=188):
        super(CustomAudioModel, self).__init__()

        self.fc = nn.Linear(465984, hidden_size)
        self.relu = nn.ReLU()
        self.fc_output = nn.Linear(hidden_size, num_classes)

    def forward(self, embeddings):

        x = self.fc(embeddings)
        x = self.relu(x)
        x = self.fc_output(x)
        return x

### Training module from Huggingface

In [10]:
# from transformers import Trainer, TrainingArguments
# from torch.optim import Adam
# import torch.nn.functional as F
# #%pip install "transformers[torch]"

# # Instantiate the model
# model = CustomAudioModel(dataset = train_example, ff_input_size=embedding_size, ff_output_size=64, num_classes=188) 

# # #These need to be updated
# # csv_path_train = 'data.csv'
# # csv_path_val = 'data.csv'

# # #Creating train and validate datasets
# # train_dataset = CustomAudioDataset(csv_path_train, processor)
# # val_dataset = CustomAudioDataset(csv_path_val, processor)

# # Loss function for multi-label classification
# # def compute_loss(model, inputs):
# #     # Your custom loss calculation goes here
# #     logits = model(inputs['input'])
# #     loss = F.binary_cross_entropy_with_logits(logits, inputs['labels']) #appropiate loss function for multi-label classification
# #     return loss

# #Loss function for binary, multi-label classification

# #loss_fn = nn.BCEWithLogitsLoss() #appropiate loss function for multi-label classification where each label is binary. 

# # Optimizer
# optimizer = Adam(model.parameters(), lr=.001)

# # Training arguments -- these need to be adjusted
# training_args = TrainingArguments(
#     output_dir='./results',                     # output directory
#     num_train_epochs=3,                         # total number of training epochs
#     per_device_train_batch_size=5,             # batch size per device during training
#     per_device_eval_batch_size=5,              # batch size per device during eval
#     #weight_decay=0.01,                         # regularization parameter
#     logging_dir='./logs',                       # directory for storing logs
#     logging_steps=10,                           # number of steps before logging
#     evaluation_strategy="steps",                # evaluate every eval_steps
#     eval_steps=50,                              # number of steps before evaluating
#     save_total_limit=2,                         # limit the total amount of checkpoints. Deletes the older checkpoints.
#     save_steps=500,                             # number of updates steps before checkpoint saves                       
# )

# # Trainer instance
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_example,
#     eval_dataset=val_example,
#     #compute_loss=loss_fn,
#     #optimizer=optimizer
# )

# # Train the model
# trainer.train()

# # Save the model after training
# model_path = "./example_50"
# model.save_pretrained(model_path)
# processor.save_pretrained(model_path)

Index: 38
Index: 33
Index: 41
Index: 22
Index: 36
Index: 30
Index: 26
Index: 4
Index: 25
Index: 13
Input Shape: torch.Size([5, 1, 465984])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (5x465984 and 23299200x64)

In [26]:
from transformers import Trainer, TrainingArguments
from torch.optim import Adam
import torch.nn.functional as F

# input_size = 768  # Update with the actual size of the embeddings
# hidden_size = 64  # Adjust based on your architecture
# num_classes = 188  # Adjust based on the number of classes

model = CustomAudioModel(input_size, hidden_size, num_classes)

class MyTrainer (Trainer):
    def compute_loss(self, model, inputs):
        print(inputs)
        logits = model(train_example['embeddings'])
        loss = F.binary_cross_entropy_with_logits(logits, train_example['labels'], reduction='mean')
        return loss

# Set up your training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    save_total_limit=2,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=100,
    learning_rate=2e-5,
)
# Instantiate the Trainer
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=train_example,  # Assuming you have train_example defined
    eval_dataset=val_example,  # Assuming you have val_example defined
    data_collator=None,  # You can customize the data collator if needed
    compute_metrics=None, # You can define your own metrics function if needed
)

# Train the model
trainer.train()

# Save the model after training
model_path = "./example_model"
model.save_pretrained(model_path)

Index: 38
Index: 33
Index: 41
Index: 22
Index: 36
Index: 30
Index: 26
Index: 4
Index: 25
Index: 13
Index: 24
Index: 16
Index: 12
Index: 7
Index: 37
Index: 2
Index: 45
Index: 42
Index: 20
Index: 31
Index: 46
Index: 34
Index: 11
Index: 19
Index: 28
Index: 9
Index: 49
Index: 15
Index: 6
Index: 18
Index: 8
Index: 23
Index: 3
Index: 17
Index: 10
Index: 14
Index: 29
Index: 43
Index: 40
Index: 27
Index: 39
Index: 35
Index: 48
Index: 0
Index: 1
Index: 47
Index: 21
Index: 32
Index: 5
Index: 44
{'embeddings': tensor([[[-5.0209e-04, -5.0209e-04, -5.0209e-04,  ..., -5.0209e-04,
          -5.0209e-04, -5.0209e-04]],

        [[-1.5877e-04, -1.5877e-04, -1.5877e-04,  ..., -1.5877e-04,
          -1.5877e-04, -1.5877e-04]],

        [[-5.7970e-04, -5.7970e-04, -5.7970e-04,  ..., -5.7970e-04,
          -5.7970e-04, -5.7970e-04]],

        ...,

        [[-4.7962e-04, -4.7962e-04, -4.7962e-04,  ..., -4.7962e-04,
          -4.7962e-04, -4.7962e-04]],

        [[-4.2237e-06, -4.2237e-06, -4.2237e-06,  ...

TypeError: Cannot index by location index with a non-integer key

In [None]:
# class CustomAudioModel(nn.Module):
#     def __init__(self, wav2vec_model_name="facebook/wav2vec2-base-960h", output_size=188, ff_output_size=64):
#         super(CustomAudioModel, self).__init__()

#         # Load the Wav2Vec 2.0 model and processor
#         self.wav2vec_model = AutoModelForCTC.from_pretrained(wav2vec_model_name)
#         self.processor = AutoProcessor.from_pretrained(wav2vec_model_name)

#         # Define custom feed-forward layers
#         self.fc1 = nn.Linear(768, ff_output_size)  # Adjust input size based on Wav2Vec 2.0 model's hidden size
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(ff_output_size, output_size)  # Adjust output size based on your task
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, audio_data):
#         # Process audio data using the Wav2Vec 2.0 model
#         input_tensors = self.processor(audio_data, return_tensors="pt", sampling_rate=16000).input_values
#         with torch.no_grad():
#             embeddings = self.wav2vec_model(input_tensors).last_hidden_state.mean(dim=1)

#         # Apply custom feed-forward layers
#         x = self.fc1(embeddings)
#         x = self.relu(x)
#         x = self.fc2(x)
#         output = self.softmax(x)  # Apply sigmoid activation for multi-label classification

#         return output

#Instantiate the model
# model = CustomAudioModel()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# loss_fn = nn.CrossEntropyLoss()
# num_epochs = 3

# audio_data = torch.randn(1, 16000)  # Replace with your actual audio data
# output = model(audio_data)
# print(output.shape)  # This will be (batch_size, output_size), where output_size is 188 in your case


### Train the model

In [None]:
# for epoch in range(num_epochs):
#     for batch in data_loader:
#         inputs = batch['input']
#         labels = batch['labels']

#         # Zero the gradients
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = model(inputs)

#         # Compute the loss
#         loss = loss_fn(outputs, labels)

#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()

#     print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')


#### Processing the train files

In [None]:
# import os
# from datasets import load_dataset
# from torch.utils.data import Dataset, DataLoader
# from transformers import AutoProcessor, AutoModelForCTC
# import torch
# import librosa
# import pandas as pd

# # Load your dataset from the CSV file
# csv_path = 'path/to/your/csv/file.csv'
# df = pd.read_csv(csv_path)

# # Load pretrained model and processor
# model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
# processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")

# class CustomAudioDataset(Dataset):
#     def __init__(self, dataframe, processor):
#         self.dataframe = dataframe
#         self.processor = processor

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         audio_path = self.dataframe.iloc[idx]['audio_file']
#         labels = self.dataframe.iloc[idx]['labels']

#         # Load audio file and process using the Wav2Vec processor
#         audio_data, _ = librosa.load(audio_path, sr=16000)
#         input_tensors = self.processor(audio_data, return_tensors="pt", sampling_rate=16000).input_values

#         return {'input': input_tensors, 'labels': torch.tensor(labels, dtype=torch.float32)}

# # Create an instance of your custom dataset
# audio_dataset = CustomAudioDataset(df, processor)

# # Create a PyTorch DataLoader for batching and shuffling
# batch_size = 32  # Adjust as needed
# data_loader = DataLoader(audio_dataset, batch_size=batch_size, shuffle=True)

# # Example usage in a training loop
# for batch in data_loader:
#     inputs = batch['input']
#     labels = batch['labels']

    # Forward pass, loss calculation, backward pass, optimization, etc.
    # Your training code goes here


In [None]:
# #Path to the directory containing the audio files
# train_audio = 'train'

# #list all files in the directory
# audio_files = [os.path.join(train_audio, file) for file in os.listdir(train_audio)]

# #Define number of files to process
# num_files = 10

# #iterate over audio file and extract embeddings.

# for i, audio_file in enumerate(audio_files):
#   embeddings = extract_audio_embeddings(audio_file)
  
#   #save embeddings in a numpy array
#   if i == 0:
#     embeddings_array = embeddings
#   else:
#     embeddings_array = np.vstack((embeddings_array, embeddings))
    
#   #Check if number of files to process has been reached
#   if i + 1 == num_files:
#     print(f'Processed {num_files} files. Stopping the iteration.')
#     break