### Set Seed

In [1]:
import random
import torch
import numpy as np

def set_seed(seed_value=42):
    """Set seed for reproducibility for PyTorch and NumPy.
s
    Args:
        seed_value (int): The seed value to set for random number generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    # Additional steps for deterministic behavior
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set the seed
set_seed(42)  # You can replace 42 with any other seed value of your choice
print("done")

done


### Create a Dataset class that returns audio embeddings (using wav2vec) and labels (as tensors)

In [2]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import librosa
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

#import wav2vec model and processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

class CustomAudioDataset(Dataset):
    def __init__(self, csv_path, processor):
        self.dataframe = pd.read_csv(csv_path)
        self.processor = processor

        # Extract column names for labels dynamically
        self.label_columns = list(self.dataframe.columns[:-1])  #Exclude first two columns since these are irrelevant

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):

        # Get the audio path
        audio_path = self.dataframe.iloc[idx]['mp3_path']
        
        # Select label columns based on the dynamically created list. This is grabbing all 188 class label names and converting to tensors.
        #labels = self.dataframe.iloc[idx][self.label_columns]
        labels_array = self.dataframe.iloc[idx].loc[self.label_columns].astype('float').values
        labels = torch.tensor(labels_array, dtype=torch.float32)
        #labels = torch.tensor(self.dataframe.iloc[idx].loc[self.label_columns].astype('float').values, dtype=torch.float32)
        
        # Load raw audio data using librosa
        audio_data, _ = librosa.load(audio_path, sr=16000, mono=True, res_type="kaiser_fast")
        
        #Use processor to process audio file and return tensor of input values for model
        input_tensors = self.processor(audio_data, return_tensors="pt", sampling_rate=16000).input_values

        return input_tensors, labels
    
    # Define collate_fn to handle the varying lengths of audio files and labels. This will help with batching.
    def collate_fn(self, batch):
        input_tensors_batch, labels_batch = zip(*batch)
        return torch.stack(input_tensors_batch), torch.stack(labels_batch)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You sho

Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


### Creating Custom Datasets for Train/Validate, Instantiating DataLoaders for Batching, Capturing Input_Size

In [3]:
#set the correct paths to the csv files
csv_path_train = 'train_example.csv'
csv_path_val = 'valid_example.csv'
csv_path_test = 'test_example.csv'

#initialize datasets
train_example = CustomAudioDataset(csv_path=csv_path_train, processor=processor)
val_example = CustomAudioDataset(csv_path=csv_path_val, processor=processor)
test_example = CustomAudioDataset(csv_path=csv_path_test, processor=processor)

#set the batch size for all dataloaders
batch_size = 32

#Dataloader for training
train_loader = DataLoader(train_example, batch_size=batch_size, shuffle=True, collate_fn=train_example.collate_fn)

#Dataloader for validation
val_loader = DataLoader(val_example, batch_size=batch_size, shuffle=False, collate_fn=val_example.collate_fn)

#Dataloader for testing
test_loader = DataLoader(test_example, batch_size=batch_size, shuffle=False, collate_fn=test_example.collate_fn)

#get the size of the audio embeddings
input_size = train_example[0][0].shape[1] #make sure that we can easily modify embedding size input to model

#Check the size of the datasets and the audio embeddings
print(f'We have {len(train_example)} training examples and {len(val_example)} validation examples.')
print(f'We have {len(train_loader)} batches in the training set and {len(val_loader)} batches in the validation set.')
print(f'The size of our audio embeddings is {input_size}.')

We have 50 training examples and 25 validation examples.
We have 2 batches in the training set and 1 batches in the validation set.
The size of our audio embeddings is 465984.


### Preparing batched training data for parameter grid_search

In [4]:
# Initialize empty lists to gather X_train and y_train
X_train = []
y_train = []

# Iterate over the DataLoader to gather all training data
for batch_input, batch_labels in train_loader:
    X_train.append(batch_input)
    y_train.append(batch_labels)

# Concatenate the lists to create tensors
X_train = torch.cat(X_train, dim=0)
y_train = torch.cat(y_train, dim=0)

# Convert PyTorch tensors to NumPy arrays
X_train_numpy = X_train.numpy()
y_train_numpy = y_train.numpy()

#check length of X_train and y_train

print(f'X_train has {len(X_train_numpy)} audio embeddings.')
print(f'y_train has {len(y_train_numpy)} labels and {y_train_numpy.shape[1]} classes.')

X_train has 50 audio embeddings.
y_train has 50 labels and 188 classes.


### Model Architecture: Custom FFNN with two layers, ReLU activation, Dropout, and Softmax layer

In [5]:
# Update if we want to make it deeper etc.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

# class CustomAudioModel(nn.Module):
#     def __init__(self, input_size, output_size, num_classes, dropout_rate=0.5):
#         super(CustomAudioModel, self).__init__()

#         # Define custom feed-forward layers
#         self.fc1 = nn.Linear(input_size, output_size)
#         self.dropout = nn.Dropout(dropout_rate)
#         self.fc2 = nn.Linear(output_size, num_classes)
#         #self.softmax = nn.Softmax(dim=1)

#     def forward(self, embeddings):
#     # Apply custom feed-forward layers directly to the input_values
#         x = self.fc1(embeddings)
#         x = F.relu(x)
#         x = self.dropout(x)
#         x = self.fc2(x)
#         #x = self.softmax(x)
        
#         #remove extra dimension
#         x = x.squeeze(1)
        
#         return x

class CustomAudioModel(nn.Module):
    def __init__(self, input_size, num_classes, dropout_rate=0.5):
        super(CustomAudioModel, self).__init__()

        # Define convolutional layers
        self.conv1 = nn.Conv1d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(64, 128, kernel_size=3, stride=1, padding=1)

        # Define pooling layer
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

        # Calculate the size of the flattened output after convolution and pooling
        conv_output_size = input_size // 8 * 128

        # Define fully connected layers
        self.fc1 = nn.Linear(conv_output_size, 512)
        self.fc2 = nn.Linear(512, num_classes)

        # Define dropout layer
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # Add a channel dimension to the input
        x = x.unsqueeze(1)

        # Apply convolutional and pooling layers
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))

        # Flatten the output for fully connected layers
        x = x.view(x.size(0), -1)

        # Apply fully connected layers with dropout
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        return x

### Addressing Class Imbalance in Train, Val, Test data by passing new weights into Loss Functions

In [6]:
# Function to generate class weights to be applied to the minority class ('1') in each label column to address class imbalance.

def class_weights(data_df):
    class_weights = []
    total_samples = len(data_df)
    
    for col in data_df.columns:
        class_counts = data_df[col].sum()
        if class_counts == 0:
            imbalance_ratio = 1.0 #if there are no positive samples, set imbalance ratio to 1
        else:
            imbalance_ratio = (total_samples - class_counts) / class_counts #calculate imbalance ratio. This is the ratio of negative to positive samples.
        class_weights.append(torch.tensor([1.0, imbalance_ratio], dtype=torch.float32)) #append to list of class weights
    class_weights_tensor = torch.tensor([weight[1] for weight in class_weights], dtype=torch.float32).unsqueeze(0) #extract the second value of each tensor in the list since this is the minority class weight. Then convert to tensor.
    return class_weights_tensor

#In training data, generate weights specific to label column distribution to apply to each of the 188 labels to account for class imbalance.
train_df = pd.read_csv('train_example.csv')
train_df = train_df.drop(['mp3_path'], axis=1)
train_weights_balanced = class_weights(train_df)
print(f"We have {(train_weights_balanced.shape[1])} labels in our training set. The updated weights for the minority label ('1') within each column are: ", train_weights_balanced)

#In validation data, generate weights specific to label column distribution to apply to each of the 188 labels to account for class imbalance.
val_df = pd.read_csv('valid_example.csv')
val_df = val_df.drop(['mp3_path'], axis=1)
val_weights_balanced = class_weights(val_df)
print(f"We have {(val_weights_balanced.shape[1])} labels in our validation set. The updated weights for the minority label ('1') within each column are: ", val_weights_balanced)

#In test data, generate weights specific to label column distribution to apply to each of the 188 labels to account for class imbalance.
test_df = pd.read_csv('test_example.csv')
test_df = test_df.drop(['mp3_path'], axis=1)
test_weights_balanced = class_weights(test_df)
print(f"We have {(test_weights_balanced.shape[1])} labels in our test set. The updated weights for the minority label ('1') within each column are: ", test_weights_balanced)


We have 188 labels in our training set. The updated weights for the minority label ('1') within each column are:  tensor([[15.6667,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
          1.0000,  1.0000,  1.0000,  1.0000,  1.0000, 49.0000, 15.6667,  1.0000,
          1.0000,  1.0000,  6.1429,  1.0000,  1.0000,  1.0000, 49.0000, 49.0000,
          1.0000,  1.0000,  1.0000,  1.0000, 49.0000, 49.0000,  1.0000,  1.0000,
          1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000, 49.0000,  1.0000,
          1.0000,  1.0000,  1.0000,  1.0000, 49.0000,  7.3333,  9.0000,  2.8462,
          1.0000, 49.0000,  1.0000,  1.0000, 49.0000, 15.6667,  1.0000,  1.0000,
         49.0000,  1.0000,  1.0000,  1.0000, 15.6667,  1.0000,  1.0000, 49.0000,
          1.0000, 49.0000,  1.0000,  1.0000,  1.0000,  1.0000,  4.5556,  1.5000,
          1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000, 11.5000,  1.0000,
          1.0000, 15.6667,  1.0000,  1.0000,  5.2500,  1.0000,  1.0000, 49.0

### Initialize Model, specify hyperparameters, train the model and print the loss

In [None]:
# # %pip install skorch
# # import skorch
# # from skorch import NeuralNetClassifier
# # from sklearn.model_selection import RandomizedSearchCV
# # from scipy.stats import loguniform
# from sklearn.model_selection import GridSearchCV

# #define hyperparameters grid or distribution

# num_classes = 188 #number of classes is 188
# weight_decay = .01 #weight decay is .01
# dropout_rate= 0.5 #dropout rate is 0.5
# hidden_size= 512 #hidden size is 512
# output_size= hidden_size//2 #output size is half of hidden size
# criterion_train = nn.BCEWithLogitsLoss(pos_weight=train_weights_balanced)  #use BCEWithLogitsLoss with class weights for training
# # optimizer = optim.Adam(model.parameters(), lr=.01, weight_decay=weight_decay) #use Adam optimizer with learning rate of .01, and L2 regularization with weight decay of 1e-5
# num_epochs = 5 

# param_grid = dict(hidden_size = [256,512])

# def create_model(lr, hidden_size, dropout_rate):
#     model = CustomAudioModel(
#         input_size=input_size,
#         hidden_size=hidden_size,
#         output_size=output_size,
#         num_classes=num_classes,
#         dropout_rate=dropout_rate
#     )
#     optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
#     return model, optimizer

# model = CustomAudioModel(
#     input_size=input_size,
#     hidden_size=hidden_size,
#     output_size=output_size,
#     num_classes=num_classes,
#     dropout_rate=dropout_rate
# )


# # #initialize model
# # model = CustomAudioModel(input_size, hidden_size=hidden_size, output_size=output_size, num_classes=num_classes, dropout_rate=dropout_rate)

# # grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
# # grid_result = grid.fit(X_train_numpy, y_train_numpy)
# # skorch_model = NeuralNetClassifier(
# #     CustomAudioModel,
# #     criterion=criterion_train,
# #     max_epochs=num_epochs,
# #     module__input_size=input_size,
# #     module__hidden_size=hidden_size,
# #     module__output_size=output_size,
# #     module__num_classes=num_classes,
# #     module__dropout_rate=dropout_rate,
# #     optimizer=optimizer,
# # )

# # param_dist = {
# #     'lr': loguniform(1e-4, 1e-1),
# #     'module__hidden_size': [256, 512],
# #     'module__dropout_rate': [0.3, 0.5, 0.7],
# # }

# # random_search = RandomizedSearchCV(
# #     skorch_model,
# #     param_distributions=param_dist,
# #     n_iter=10,
# #     scoring='average_precision',
# #     cv=3,
# #     n_jobs=-1,
# # )

# # # Fit the RandomizedSearchCV
# # random_search.fit(X_train_numpy, y_train_numpy)  # Provide your training data here

# # # Print the best parameters and corresponding score
# # print("Best Parameters: ", random_search.best_params_)
# # print("Best Score: ", random_search.best_score_)

In [None]:
# # Wrap the PyTorch model in a scikit-learn estimator
# from sklearn.base import is_classifier
# from sklearn.utils import check_X_y
# from sklearn.utils.validation import check_is_fitted
# from sklearn.base import BaseEstimator, ClassifierMixin

# class PyTorchEstimator(BaseEstimator, ClassifierMixin):
#     def __init__(self, model, criterion, optimizer, num_epochs):
#         self.model = model
#         self.criterion = criterion
#         self.optimizer = optimizer
#         self.num_epochs = num_epochs

#     def fit(self, X, y):
#         # Convert input data to torch.Tensor if not already
#         X, y = check_X_y(X, y, device='cuda' if torch.cuda.is_available() else 'cpu', dtype=torch.float32)
        
#         self.model, self.optimizer = create_model(
#             self.input_size, self.hidden_size, self.output_size, self.num_classes, self.dropout_rate
#         )
#         # Train the PyTorch model
#         for epoch in range(self.num_epochs):
#             # Your training loop here
#             pass

#     def predict(self, X):
#         # Implement prediction logic
#         pass

# # Create the PyTorchEstimator
# pytorch_estimator = PyTorchEstimator(model, criterion_train, optim.Adam(model.parameters()), num_epochs)

# # Create the GridSearchCV object
# grid_search = GridSearchCV(pytorch_estimator, param_grid, scoring='average_precision', cv=3, n_jobs=-1)

# # Fit the GridSearchCV
# grid_search.fit(X_train_numpy, y_train_numpy)

# # Print the best parameters and corresponding score
# print("Best Parameters: ", grid_search.best_params_)
# print("Best Score: ", grid_search.best_score_)

In [7]:
# hidden_size=512
# output_size=hidden_size//2 #this is the size of the hidden layer output from first feed-forward layer
num_classes=188 #this is the number of labels we have
dropout_rate=0.7 #this is the dropout rate for the dropout layer
criterion_train = nn.BCEWithLogitsLoss(pos_weight=train_weights_balanced) #use this loss function since we are doing multilabel classification. pos_weight is the weight to apply to the minority class ('1') in each label column to address class imbalance.
weight_decay = .01 #this is the L2 regularization weight decay
optimizer = optim.Adam(model.parameters(), lr=.01, weight_decay=weight_decay) #use Adam optimizer with learning rate of .01, and L2 regularization with weight decay of 1e-5
num_epochs = 5 

#initialize model
model = CustomAudioModel(input_size,num_classes=num_classes, dropout_rate=dropout_rate)

#training loop
for epoch in range(num_epochs):
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        embeddings, labels = batch
        #embeddings, labels = embeddings.to(device), labels.to(device)
        embeddings = embeddings.squeeze(1)
        outputs = model(embeddings)
        # print("Input shape:", embeddings.shape)
        # print("Labels shape:", labels.shape)
        # print("Output shape:", outputs.shape)
        loss = criterion_train(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        # Print gradients only for the first batch in each epoch
        if batch_idx == 0:
            for name, param in model.named_parameters():
                if param.grad is not None:
                    print(f"Epoch {epoch+1}, Layer: {name}, Gradient mean: {param.grad.mean().item()}, Gradient std: {param.grad.std().item()}")


        optimizer.step()
        
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

Epoch 1, Layer: conv1.weight, Gradient mean: 4.923749656882137e-06, Gradient std: 0.00015924274339340627
Epoch 1, Layer: conv1.bias, Gradient mean: 4.543730756267905e-05, Gradient std: 0.00016900950868148357
Epoch 1, Layer: conv2.weight, Gradient mean: 3.380043926881626e-05, Gradient std: 0.00014743709471076727
Epoch 1, Layer: conv2.bias, Gradient mean: 8.952962525654584e-05, Gradient std: 0.00029948592418804765
Epoch 1, Layer: conv3.weight, Gradient mean: 2.2013147827237844e-05, Gradient std: 0.00012704973050858825
Epoch 1, Layer: conv3.bias, Gradient mean: 0.00017347076209262013, Gradient std: 0.0006632576696574688
Epoch 1, Layer: fc1.weight, Gradient mean: 3.6731373711518245e-06, Gradient std: 6.22087245574221e-05
Epoch 1, Layer: fc1.bias, Gradient mean: 8.619744767202064e-05, Gradient std: 0.00081253657117486
Epoch 1, Layer: fc2.weight, Gradient mean: 3.381079659448005e-05, Gradient std: 9.455285908188671e-05
Epoch 1, Layer: fc2.bias, Gradient mean: 0.0019013748969882727, Gradient 

### Validation Loop

In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix

model.eval()
criterion_val = nn.BCEWithLogitsLoss(pos_weight=val_weights_balanced) #use this loss function since we are doing multilabel classification. pos_weight is the weight to apply to the minority class ('1') in each label column to address class imbalance.
correct_predictions = 0
total_samples = 0

# for epoch in range(num_epochs):
#     val_loss = 0
#     with torch.no_grad():
#         for val_batch in val_loader:
#             embeddings_val, labels_val = val_batch
#             embeddings_val = embeddings_val.squeeze(1)
#             # Forward pass (no optimization in validation)
#             val_outputs = model(embeddings_val)

#             # Compute validation loss
#             loss = criterion_val(val_outputs, labels_val)
#             val_loss += loss.item()
            
#             #Convert probabilities to binary predictions using a threshold of 0.5
#             predictions = (torch.sigmoid(val_outputs) > 0.5).float()
            
#             #Collect predictions and true labels for precision, recall and F1 score
#             val_pred.append(predictions.cpu().numpy())
#             val_true.append(labels_val.cpu().numpy())
            
#             #Compute validation accuracy
#             correct_predictions += (predictions == labels_val).sum().item()
#             total_samples += labels_val.numel()
#             # Print validation loss after each epoch
#         print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {loss.item()}")

# # Calculate average validation loss
# # avg_val_loss = val_loss / len(val_loader)

# #Concatenate predictions and true labels along the sample axis
# val_pred = np.concatenate(val_pred, axis=0)
# val_true = np.concatenate(val_true, axis=0)

# #Calculate overall validation accuracy
# val_accuracy = round(((correct_predictions / total_samples)*100), 1)

# #Compute precision, recall and F1 score
# precision = precision_score(val_true, val_pred, average='micro') #micro is used to account for class imbalance
# recall = recall_score(val_true, val_pred, average='micro') #micro is used to account for class imbalance
# f1 = f1_score(val_true, val_pred, average='micro') #micro is used to account for class imbalance

# #compute confusion matrix

# conf_matrix_val = confusion_matrix(val_true.flatten(), val_pred.flatten())

# # Print training and validation metrics after each epoch
# print(f"Overall Validation Accuracy: {val_accuracy}%, Validation Precision: {precision}, Validation Recall: {recall}, Validation F1: {f1}")
# print("Confusion Matrix:")
# print(conf_matrix_val)
# print(f'The model has a true negative count of: {conf_matrix_val[0,0]}')
# print(f'The model has a false positive count of: {conf_matrix_val[0,1]}')
# print(f'The model has a true positive count of: {conf_matrix_val[1,1]}')
# print(f'The model has a false negative count of: {conf_matrix_val[1,0]}')

val_pred = []
val_true = []

for epoch in range(num_epochs):
    val_loss = 0
    with torch.no_grad():
        for val_batch in val_loader:
            embeddings_val, labels_val = val_batch
            embeddings_val = embeddings_val.squeeze(1)
            # Forward pass (no optimization in validation)
            val_outputs = model(embeddings_val)

            # Compute validation loss
            loss = criterion_val(val_outputs, labels_val)
            val_loss += loss.item()

            # Convert probabilities to binary predictions using a threshold of 0.5
            predictions = (torch.sigmoid(val_outputs) > 0.5).float()

            # Collect predictions and true labels for precision, recall, and F1 score
#             val_pred.append(predictions)
#             val_true.append(labels_val)
            np.append(val_pred, predictions)
            np.append(val_true, labels_val)

            # Compute validation accuracy
            correct_predictions += (predictions == labels_val).sum().item()
            total_samples += labels_val.numel()

        # Calculate average validation loss
        avg_val_loss = val_loss / len(val_loader)
        
        # Concatenate predictions and true labels along the sample axis
        print(val_pred)
        print(val_true)
        val_pred = np.concatenate(val_pred, axis=0)
        val_true = np.concatenate(val_true, axis=0)

        # Calculate overall validation accuracy
        # Note: Accuracy is not a common metric for multilabel classification, consider using precision, recall, and F1 score.
        
        # Compute precision, recall, and F1 score
        precision = precision_score(val_true, val_pred, average='micro')
        recall = recall_score(val_true, val_pred, average='micro')
        f1 = f1_score(val_true, val_pred, average='micro')

        # Compute confusion matrix
        conf_matrix_val = confusion_matrix(val_true.flatten(), val_pred.flatten())

        # Print training and validation metrics after each epoch
        print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss}")
        print(f"Validation Precision: {precision}, Validation Recall: {recall}, Validation F1: {f1}")
        print("Confusion Matrix:")
        print(conf_matrix_val)
        print(f'The model has a true negative count of: {conf_matrix_val[0,0]}')
        print(f'The model has a false positive count of: {conf_matrix_val[0,1]}')
        print(f'The model has a true positive count of: {conf_matrix_val[1,1]}')
        print(f'The model has a false negative count of: {conf_matrix_val[1,0]}')

        
#Save model
model_path = "./model_50N"
torch.save(model.state_dict(), model_path)

[]
[]


ValueError: need at least one array to concatenate

### Test Loop

In [None]:
#load model
model.load_state_dict(torch.load(model_path))
model.eval()

criterion_test = nn.BCEWithLogitsLoss(pos_weight=test_weights_balanced) #use this loss function since we are doing multilabel classification. pos_weight is the weight to apply to the minority class ('1') in each label column to address class imbalance.
correct_predictions = 0
total_samples = 0
y_pred = []
y_true = []

with torch.no_grad():
    for epoch in range(num_epochs):
        test_loss = 0.0 #initialize validation loss
        for test_batch in test_loader:
            embeddings_test, labels_test = test_batch
            
            # Forward pass (no optimization in validation)
            test_outputs = model(embeddings_test)

            # Compute validation loss
            loss = criterion_test(test_outputs, labels_test)
            test_loss += loss.item()
            
            #Convert probabilities to binary predictions using a threshold of 0.5
            predictions = (torch.sigmoid(test_outputs) > 0.5).float()
            
            #Collect predictions and true labels for precision, recall and F1 score
            y_pred.append(predictions.cpu().numpy())
            y_true.append(labels_test.cpu().numpy())
            
            #Compute validation accuracy
            correct_predictions += (predictions == labels_test).sum().item()
            total_samples += labels_test.numel()
            #print test loss after each epoch
        print(f"Epoch {epoch + 1}/{num_epochs}, Test Loss: {loss.item()}")

# Calculate average validation loss
avg_test_loss = test_loss / len(test_loader)

#Concatenate predictions and true labels along the sample axis
y_pred = np.concatenate(y_pred, axis=0)
y_true = np.concatenate(y_true, axis=0)

#Calculate overall validation accuracy
test_accuracy = round(((correct_predictions / total_samples)*100), 1)

#Compute precision, recall and F1 score
precision = precision_score(y_true, y_pred, average='micro') #micro is used to account for class imbalance
recall = recall_score(y_true, y_pred, average='micro') #micro is used to account for class imbalance
f1 = f1_score(y_true, y_pred, average='micro') #micro is used to account for class imbalance

#compute confusion matrix
conf_matrix_test = confusion_matrix(y_true.flatten(), y_pred.flatten())
print("Confusion Matrix:")
print(conf_matrix_test)
print(f'The model has a true negative count of: {conf_matrix_test[0,0]}')
print(f'The model has a false positive count of: {conf_matrix_test[0,1]}')
print(f'The model has a true postive count of: {conf_matrix_test[1,1]}')
print(f'The model has a false negative count of: {conf_matrix_test[1,0]}')

# Print training and validation metrics after each epoch
print(f"Overall Test Accuracy: {test_accuracy}%, Test Precision: {precision}, Test Recall: {recall}, Test F1: {f1}")

#Save model
model_path = "./final_model"
torch.save(model.state_dict(), model_path)

In [None]:

# csv_path_test = 'test_example.csv'
# test_example = CustomAudioDataset(csv_path=csv_path_test, processor=processor)
# test_loader = DataLoader(test_example, batch_size=8, shuffle=False, collate_fn=test_example.collate_fn)

# #load model
# model.load_state_dict(torch.load(model_path))
# model.eval()

# #initialize lists to store labels and predictions

# y_true = []
# y_pred = []

# with torch.no_grad():
#     for batch in test_loader:
#         embeddings_test, labels_test = batch

#         # Forward pass (no optimization in validation)
#         test_outputs = model(embeddings_test)
        
#         # Compute test loss
#         test_loss = criterion(test_outputs, labels_test)

#         # Convert predictions to binary (0 or 1)
#         predicted_labels = (test_outputs > 0.5).float()

#         # Print predicted labels
#         all_predictions.append(predicted_labels.numpy())
#         print("Predicted Labels:")
#         print(predicted_labels.numpy())  

#         # Print true labels
#         all_labels.append(labels_test.numpy())
#         print("True Labels:")
#         print(labels_test.numpy())  

#         #get predictions
#         preds = torch.round(test_outputs)
        
#         #store predictions and labels for later use
#         y_true.extend(labels_test)
#         y_pred.extend(preds)

# #print loss and accuracy

# print(f"Test Loss: {test_loss.item()}")

### Training with Loop

In [None]:
# import torch
# import torch.optim as Adam
# import torch.nn as nn

# # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model = AudioTaggingModel(input_size, 188)

# criterion = nn.BCELoss()
# optimizer = Adam.Adam(model.parameters(), lr=.001)

# num_epochs = 5 
# for epoch in range(num_epochs):
#     model.train()
#     for embeddings, labels in train_example:
# #         embeddings, labels = embeddings.to(device), labels.to(device)
#         outputs = model(embeddings).flatten()
#         loss = criterion(outputs, labels)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
        
#     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

### Model Evaluation with Validate Data

In [None]:
# model.eval()
# val_loss = 0.0
# with torch.no_grad():
#     for embeddings_val, labels_val in val_example:
# #         embeddings_val, labels_val = embeddings_val.to(device), labels_val.to(device)

#         # Forward pass (no optimization in validation)
#         outputs_val = model(embeddings_val).flatten()

#         # Compute validation loss
#         loss_val = criterion(outputs_val, labels_val)
#         val_loss += loss_val.item()

# model_path = "./model_path"
# torch.save(model.state_dict(), model_path)
# # Calculate average validation loss
# avg_val_loss = val_loss / len(val_example)

# # Print training and validation loss after each epoch
# print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {loss.item()}, Validation Loss: {avg_val_loss}")

### Load and Test Model Performance on Test Data

In [None]:
# import torch
# import torch.nn as nn
# from torch.utils.data import DataLoader
# from sklearn.metrics import precision_score, recall_score, classification_report


# csv_path_test = 'test_example.csv'
# test_example = CustomAudioDataset(csv_path=csv_path_test, processor=processor)
# # Assuming you have test_loader defined


# # Assuming you have already trained and saved your model
# # If not, load your pre-trained model here
# model = AudioTaggingModel(465984, 188)
# model_path = "./model_path"
# model.load_state_dict(torch.load(model_path))
# model.eval()

# # Define your criterion (loss function)
# criterion = nn.BCELoss()

# # Test
# test_loss = 0.0
# all_predictions = []
# all_labels = []

# with torch.no_grad():
#     for embeddings_test, labels_test in test_example:
# #         inputs_test, labels_test = inputs_test.to(device), labels_test.to(device)

#         # Forward pass
#         outputs_test = model(embeddings_test).flatten()

#         # Compute test loss
#         loss_test = criterion(outputs_test, labels_test)
#         test_loss += loss_test.item()

#         # Convert predictions to binary (0 or 1)
#         predicted_labels = (outputs_test > 0.5).float()

#         # Print predicted labels
#         all_predictions.append(predicted_labels.numpy())
#         print("Predicted Labels:")
#         print(predicted_labels.numpy())  

#         # Print true labels
#         all_labels.append(labels_test.numpy())
#         print("True Labels:")
#         print(labels_test.numpy())  

# # Calculate average test loss
# avg_test_loss = test_loss / len(test_example)

# print(f"Test Loss: {avg_test_loss}")

# all_predictions = np.array(all_predictions)
# all_labels = np.array(all_labels)

# # Convert probabilities to binary predictions
# binary_preds = np.argmax(all_preds, axis=1)
# binary_labels = np.argmax(all_labels, axis=1)

# # Print classification report
# print("Classification Report:")
# print(classification_report(binary_labels, binary_preds))


### Model Graveyard

In [None]:
# import torch
# import torch.nn as nn
# class CustomAudioModel(nn.Module):
#     def __init__(self, ff_output_size, dataset, ff_input_size=input_size):
#         super(CustomAudioModel, self).__init__()
        
#         self.dataset = dataset

#         # Define custom feed-forward layers
#         self.fc1 = nn.Linear(ff_input_size*len(self.dataset), ff_output_size)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(ff_output_size, num_classes)
#         #self.sigmoid = nn.Sigmoid()

#     def forward(self, embeddings, labels=None):
#         # Apply custom feed-forward layers directly to the input_values
#         #embeddings = torch.cat([batch['embeddings'] for batch in train_example], dim=1) #concatenate the embeddings
# #         embeddings = self.dataset['embeddings']
#         print("Input Shape:", embeddings.shape)
        
#         embeddings = embeddings.view(embeddings.size(0), -1) #flatten the embeddings
#         x = self.fc1(embeddings)
#         x = self.relu(x)
#         x = self.fc2(x)

#         if labels is not None:
#             # Calculate the loss if labels are provided
#             # Assuming you are using binary cross-entropy loss
#             loss_fn = nn.BCEWithLogitsLoss()
#             loss = loss_fn(x, labels)
#             return loss
#         else:
#             return x
        
# import torch
# import torch.nn as nn

# class AudioTaggingModel(nn.Module):
#     def __init__(self, ff_embedding_size, ff_output_size):
#         super(AudioTaggingModel, self).__init__()
#         self.fc1 = nn.Linear(ff_embedding_size, 512)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(512, ff_output_size)
#         self.softmax = nn.Softmax(dim=1)
        
#     def forward(self, embeddings):
#         embeddings = self.fc1(embeddings)
#         embeddings = self.relu(embeddings)
#         embeddings = self.fc2(embeddings)
#         embeddings = self.softmax(embeddings)
#         return embeddings

# class AudioTaggingModel(nn.Module):
#     def __init__(self, ff_embedding_size, ff_output_size):
#         super(AudioTaggingModel, self).__init__()
#         self.fc1 = nn.Linear(ff_embedding_size, 512)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(512, ff_output_size)
#         self.softmax = nn.Softmax(dim=1)
        
#     def forward(self, embeddings):
#         embeddings = self.fc1(embeddings)
#         embeddings = self.relu(embeddings)
#         embeddings = self.fc2(embeddings)
#         embeddings = self.softmax(embeddings)
#         return embeddings

# import torch.nn as nn

# class CustomAudioModel(nn.Module):
#     def __init__(self, input_size=768, hidden_size=64, num_classes=188):
#         super(CustomAudioModel, self).__init__()

#         self.fc = nn.Linear(465984, hidden_size)
#         self.relu = nn.ReLU()
#         self.fc_output = nn.Linear(hidden_size, num_classes)

#     def forward(self, embeddings):

#         x = self.fc(embeddings)
#         x = self.relu(x)
#         x = self.fc_output(x)
#         return x

# # class CustomAudioModel(nn.Module):
# #     def __init__(self, dataset, ff_input_size=embedding_size, ff_output_size=64, num_classes=188):
# #         super(CustomAudioModel, self).__init__()
        
# #         self.dataset = dataset

# #         # Define custom feed-forward layers
# #         self.fc1 = nn.Linear(ff_input_size, ff_output_size)
# #         self.relu = nn.ReLU()
# #         self.fc2 = nn.Linear(ff_output_size, num_classes)

# #     def forward(self, embeddings):
        
# #         embeddings = self.dataset['embeddings']  # Grab the audio embeddings from the inputs dictionary
# #         # Flatten the embeddings
# #         embeddings = embeddings.view(embeddings.size(0), -1)

# #         # Apply custom feed-forward layers directly to the flattened embeddings
# #         x = self.fc1(embeddings)
# #         x = self.relu(x)
# #         x = self.fc2(x)

# #         return x

# #     from transformers import Trainer, TrainingArguments
# # from torch.optim import Adam
# # import torch.nn.functional as F
# # #%pip install "transformers[torch]"

# # Instantiate the model
# #model = CustomAudioModel(ff_input_size=embedding_size, ff_output_size=64, num_classes=188) 

# # # #These need to be updated
# # # csv_path_train = 'data.csv'
# # # csv_path_val = 'data.csv'

# # # #Creating train and validate datasets
# # # train_dataset = CustomAudioDataset(csv_path_train, processor)
# # # val_dataset = CustomAudioDataset(csv_path_val, processor)

# # # Loss function for multi-label classification
# # # def compute_loss(model, inputs):
# # #     # Your custom loss calculation goes here
# # #     logits = model(inputs['input'])
# # #     loss = F.binary_cross_entropy_with_logits(logits, inputs['labels']) #appropiate loss function for multi-label classification
# # #     return loss

# # #Loss function for binary, multi-label classification

# # #loss_fn = nn.BCEWithLogitsLoss() #appropiate loss function for multi-label classification where each label is binary. 

# # # Optimizer
# # optimizer = Adam(model.parameters(), lr=.001)

# # Training arguments -- these need to be adjusted
# # training_args = TrainingArguments(
# #     output_dir='./results',                     # output directory
# #     num_train_epochs=3,                         # total number of training epochs
# #     per_device_train_batch_size=32,             # batch size per device during training
# #     per_device_eval_batch_size=32,              # batch size per device during eval
# #     #weight_decay=0.01,                         # regularization parameter
# #     logging_dir='./logs',                       # directory for storing logs
# #     logging_steps=10,                           # number of steps before logging
# #     evaluation_strategy="steps",                # evaluate every eval_steps
# #     eval_steps=50,                              # number of steps before evaluating
# #     save_total_limit=2,                         # limit the total amount of checkpoints. Deletes the older checkpoints.
# #     save_steps=500,                             # number of updates steps before checkpoint saves                       
# # )

# # # Trainer instance
# # trainer = Trainer(
# #     model=model,
# #     args=training_args,
# #     train_dataset=train_example,
# #     eval_dataset=val_example,
# #     #compute_loss=loss_fn,
# #     #optimizer=optimizer
# # )

# # # Train the model
# # trainer.train()

In [None]:
# import librosa

# # Replace 'path_to_your_audio_file.wav' with the path to one of your audio files
# #audio_path = 'train/aba_structure-epic-01-deep_step-291-320.wav'
# audio_path = 'train/aba_structure-epic-01-deep_step-320-349.wav'

# try:
#     audio_data, _ = librosa.load(audio_path, sr=16000)
#     print("Successfully loaded audio file.")
# except Exception as e:
#     print(f"Error loading audio file: {e}")


In [None]:
# from transformers import Trainer, TrainingArguments
# from torch.optim import Adam
# import torch.nn.functional as F
# #%pip install "transformers[torch]"

# # Instantiate the model
# # model = CustomAudioModel(dataset = train_example, ff_input_size=embedding_size, ff_output_size=64, num_classes=188) 
# model = AudioTaggingModel(465984, 188)
# # #These need to be updated
# # csv_path_train = 'data.csv'
# # csv_path_val = 'data.csv'

# # #Creating train and validate datasets
# # train_dataset = CustomAudioDataset(csv_path_train, processor)
# # val_dataset = CustomAudioDataset(csv_path_val, processor)

# # Loss function for multi-label classification
# # def compute_loss(model, inputs):
# #     # Your custom loss calculation goes here
# #     logits = model(inputs['input'])
# #     loss = F.binary_cross_entropy_with_logits(logits, inputs['labels']) #appropiate loss function for multi-label classification
# #     return loss

# #Loss function for binary, multi-label classification
# class CustomTrainer(Trainer):
#     loss_fn = nn.BCEWithLogitsLoss() #appropiate loss function for multi-label classification where each label is binary. 

#     # Optimizer
#     optimizer = Adam(model.parameters(), lr=.001)

# # Training arguments -- these need to be adjusted
# training_args = TrainingArguments(
#     output_dir='./results',                     # output directory
#     num_train_epochs=3,                         # total number of training epochs
#     per_device_train_batch_size=5,             # batch size per device during training
#     per_device_eval_batch_size=5,              # batch size per device during eval
#     #weight_decay=0.01,                         # regularization parameter
#     logging_dir='./logs',                       # directory for storing logs
#     logging_steps=10,                           # number of steps before logging
#     evaluation_strategy="steps",                # evaluate every eval_steps
#     eval_steps=50,                              # number of steps before evaluating
#     save_total_limit=2,                         # limit the total amount of checkpoints. Deletes the older checkpoints.
#     save_steps=500,                             # number of updates steps before checkpoint saves                       
# )

# # Trainer instance
# trainer = CustomTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_example,
#     eval_dataset=val_example,
# #     compute_loss=loss_fn,
#     #optimizer=optimizer
# )

# # Train the model
# trainer.train()

# # Save the model after training
# model_path = "./example_50"
# model.save_pretrained(model_path)
# processor.save_pretrained(model_path)

In [None]:
# from transformers import Trainer, TrainingArguments
# from torch.optim import Adam
# import torch.nn.functional as F

# # input_size = 768  # Update with the actual size of the embeddings
# # hidden_size = 64  # Adjust based on your architecture
# # num_classes = 188  # Adjust based on the number of classes

# # model = AudioTaggingModel(dataset = train_example, ff_input_size=embedding_size, ff_output_size=64, num_classes=188) 

# model = AudioTaggingModel(465984, 188)

# # Set up your training arguments
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=32,
#     save_total_limit=2,
#     save_steps=500,
#     evaluation_strategy="steps",
#     eval_steps=100,
#     learning_rate=2e-5,
# )
# # Instantiate the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_example,  # Assuming you have train_example defined
#     eval_dataset=val_example,  # Assuming you have val_example defined
#     data_collator=None,  # You can customize the data collator if needed
#     compute_metrics=None, # You can define your own metrics function if needed
# )

# # Train the model
# trainer.train()

# # Save the model after training
# model_path = "./example_model"
# model.save_pretrained(model_path)

In [None]:
# class CustomAudioModel(nn.Module):
#     def __init__(self, wav2vec_model_name="facebook/wav2vec2-base-960h", output_size=188, ff_output_size=64):
#         super(CustomAudioModel, self).__init__()

#         # Load the Wav2Vec 2.0 model and processor
#         self.wav2vec_model = AutoModelForCTC.from_pretrained(wav2vec_model_name)
#         self.processor = AutoProcessor.from_pretrained(wav2vec_model_name)

#         # Define custom feed-forward layers
#         self.fc1 = nn.Linear(768, ff_output_size)  # Adjust input size based on Wav2Vec 2.0 model's hidden size
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(ff_output_size, output_size)  # Adjust output size based on your task
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, audio_data):
#         # Process audio data using the Wav2Vec 2.0 model
#         input_tensors = self.processor(audio_data, return_tensors="pt", sampling_rate=16000).input_values
#         with torch.no_grad():
#             embeddings = self.wav2vec_model(input_tensors).last_hidden_state.mean(dim=1)

#         # Apply custom feed-forward layers
#         x = self.fc1(embeddings)
#         x = self.relu(x)
#         x = self.fc2(x)
#         output = self.softmax(x)  # Apply sigmoid activation for multi-label classification

#         return output

#Instantiate the model
# model = CustomAudioModel()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# loss_fn = nn.CrossEntropyLoss()
# num_epochs = 3

# audio_data = torch.randn(1, 16000)  # Replace with your actual audio data
# output = model(audio_data)
# print(output.shape)  # This will be (batch_size, output_size), where output_size is 188 in your case


### Train the model

In [None]:
# for epoch in range(num_epochs):
#     for batch in data_loader:
#         inputs = batch['input']
#         labels = batch['labels']

#         # Zero the gradients
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = model(inputs)

#         # Compute the loss
#         loss = loss_fn(outputs, labels)

#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()

#     print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')


#### Processing the train files

In [None]:
# import os
# from datasets import load_dataset
# from torch.utils.data import Dataset, DataLoader
# from transformers import AutoProcessor, AutoModelForCTC
# import torch
# import librosa
# import pandas as pd

# # Load your dataset from the CSV file
# csv_path = 'path/to/your/csv/file.csv'
# df = pd.read_csv(csv_path)

# # Load pretrained model and processor
# model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
# processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")

# class CustomAudioDataset(Dataset):
#     def __init__(self, dataframe, processor):
#         self.dataframe = dataframe
#         self.processor = processor

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         audio_path = self.dataframe.iloc[idx]['audio_file']
#         labels = self.dataframe.iloc[idx]['labels']

#         # Load audio file and process using the Wav2Vec processor
#         audio_data, _ = librosa.load(audio_path, sr=16000)
#         input_tensors = self.processor(audio_data, return_tensors="pt", sampling_rate=16000).input_values

#         return {'input': input_tensors, 'labels': torch.tensor(labels, dtype=torch.float32)}

# # Create an instance of your custom dataset
# audio_dataset = CustomAudioDataset(df, processor)

# # Create a PyTorch DataLoader for batching and shuffling
# batch_size = 32  # Adjust as needed
# data_loader = DataLoader(audio_dataset, batch_size=batch_size, shuffle=True)

# # Example usage in a training loop
# for batch in data_loader:
#     inputs = batch['input']
#     labels = batch['labels']

    # Forward pass, loss calculation, backward pass, optimization, etc.
    # Your training code goes here


In [None]:
# #Path to the directory containing the audio files
# train_audio = 'train'

# #list all files in the directory
# audio_files = [os.path.join(train_audio, file) for file in os.listdir(train_audio)]

# #Define number of files to process
# num_files = 10

# #iterate over audio file and extract embeddings.

# for i, audio_file in enumerate(audio_files):
#   embeddings = extract_audio_embeddings(audio_file)
  
#   #save embeddings in a numpy array
#   if i == 0:
#     embeddings_array = embeddings
#   else:
#     embeddings_array = np.vstack((embeddings_array, embeddings))
    
#   #Check if number of files to process has been reached
#   if i + 1 == num_files:
#     print(f'Processed {num_files} files. Stopping the iteration.')
#     break