<a href="https://colab.research.google.com/github/archit436/Birds_Classifier/blob/main/Models/Main_Models/Main_Model_Audio_Main_File.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


0. Setup


In [2]:
# Start by importing the relevant libraries.
# Copied from Archit's Lab 3 Submission and then some more.
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torchvision.transforms as transforms
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor

  from .autonotebook import tqdm as notebook_tqdm


1. Data Processing - WAV Tensors


In [3]:
# Load the per-class waveform tensors stored as .pt files (one file per class).
# The numeric file stem (e.g. "123.pt" -> 123) is used as the class id of every
# row in that file.
wav_tensors_dir = '../../Data/Xeno_Canto_WAV_Tensors'
# Per-class feature tensors and their matching label tensors, in load order.
all_features = []
all_labels = []

# os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting
# fixes the order in which classes are concatenated later, so the seeded
# train/val/test split selects the same samples on every machine.
for file in sorted(os.listdir(wav_tensors_dir)):
    # Skip anything that is not a .pt tensor file.
    if not file.endswith('.pt'):
        continue
    # weights_only=True restricts unpickling to tensor data: it avoids executing
    # arbitrary pickled code and silences torch.load's FutureWarning.
    features_tensor = torch.load(os.path.join(wav_tensors_dir, file), weights_only=True)
    # A non-numeric file stem raises ValueError here (same as before).
    label_value = int(os.path.splitext(file)[0])
    # One label per row of the feature tensor.
    labels_tensor = torch.full((features_tensor.shape[0],), label_value, dtype=torch.long)
    all_features.append(features_tensor)
    all_labels.append(labels_tensor)

  features_tensor = torch.load(pt_file[0])


In [3]:
# Report how many classes were loaded. The loading loop appends exactly one labels
# tensor per processed .pt file, so the list length equals the number of class files.
num_classes = len(all_labels)
print(f"Number of classes for which data has been extracted: {num_classes}")

Number of classes for which data has been extracted: 106


In [4]:
# Sanity check: display the dtype of the first class's feature tensor.
# Only the first entry is inspected; the remaining tensors are not checked here.
all_features[0].dtype

torch.float32

In [5]:
# Merge the per-class tensors into one dataset-wide features tensor and one
# labels tensor (still holding the raw class ids taken from the file names).
features_tensor = torch.cat(all_features, dim=0)
labels_tensor = torch.cat(all_labels, dim=0)

# Encode the raw class ids as contiguous indices 0..num_classes-1 for training.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels_tensor.numpy())
# Build the mappings from the encoder's sorted class list using plain Python ints.
# Zipping over the tensor itself yields 0-dim tensors, which hash by object
# identity: that produced one dictionary entry per *sample* and keys that can
# never be looked up by an integer id.
id_to_index = {int(class_id): index for index, class_id in enumerate(label_encoder.classes_)}
index_to_id = {index: class_id for class_id, index in id_to_index.items()}
# Persist the mappings so predictions can be translated back to class ids later.
with open('label_mappings.pkl', 'wb') as f:
    pickle.dump({'id_to_index': id_to_index, 'index_to_id': index_to_id}, f)

# From here on labels_tensor holds the encoded indices, not the raw ids.
labels_tensor = torch.from_numpy(encoded_labels)

# Print out stats.
print(f"Shape of features tensor: {features_tensor.shape}")
print(f"Shape of labels tensor: {labels_tensor.shape}")

Shape of features tensor: torch.Size([19675, 320000])
Shape of labels tensor: torch.Size([19675])


In [6]:
# Stratified train / validation / test partition, so every split keeps the same
# class proportions. Two chained 80:20 splits give 64% train, 16% val, 20% test.

# Labels as a numpy array, plus one positional index per sample.
labels_np = labels_tensor.numpy()
indices = np.arange(labels_np.shape[0])

# Stage 1: hold out 20% of all samples as the test set.
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
temp_idx, test_idx = next(sss1.split(indices, labels_np))

# Stage 2: split the remaining 80% again (80:20) into train and validation.
# The splitter works on positions *within* the remainder, hence the fresh arange.
temp_labels_np = labels_np[temp_idx]
temp_indices = np.arange(temp_idx.shape[0])
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_temp_idx, val_temp_idx = next(sss2.split(temp_indices, temp_labels_np))

# Translate positions within the remainder back to indices into the full dataset.
train_idx, val_idx = temp_idx[train_temp_idx], temp_idx[val_temp_idx]

In [7]:
# Pair every waveform row with its encoded label in a single dataset.
full_dataset = TensorDataset(features_tensor, labels_tensor)

# Carve out the train / validation / test views using the stratified split indices.
train_dataset, val_dataset, test_dataset = (
    torch.utils.data.Subset(full_dataset, split_indices)
    for split_indices in (train_idx, val_idx, test_idx)
)

2. Wav2Vec as Feature Extractor & CNN Implementation


In [4]:
"""FOR MACBOOK LOCAL SETUP USERS ONLY """
# Toggle for Apple-silicon (MPS) acceleration.
use_mps = True
# Use the MPS backend when requested and available, otherwise fall back to CPU.
# NOTE: CUDA is not probed here, so on non-Apple hardware this resolves to CPU.
if use_mps and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [None]:
# Dataset wrapper that turns raw waveforms into wav2vec features on access.
class Wav2VecFeatureDataset(Dataset):
    """Wrap an (audio, label) dataset and return (wav2vec features, label) pairs.

    Each item of ``original_dataset`` must unpack into ``(audio, label)``. The
    audio is passed through the wav2vec feature extractor and model, and the
    model's ``last_hidden_state`` (with the leading batch dimension removed) is
    returned together with the untouched label.

    Args:
        original_dataset: Indexable dataset yielding ``(audio, label)``.
        cache_features: When True, computed features are stored per index and
            reused on later accesses instead of re-running the model.
        model_name: Hugging Face checkpoint used for both the feature extractor
            and the model.
        sampling_rate: Sampling rate (Hz) declared to the feature extractor.

    NOTE(review): cached features stay on the device the model runs on, so a
    large dataset with caching enabled can need a lot of memory there.
    """

    # (model_name, device) -> (feature_extractor, model). Shared by all instances so
    # the train/val/test wrappers reuse one copy of the weights instead of loading
    # the checkpoint once per wrapper.
    _shared_backbones = {}

    def __init__(self, original_dataset, cache_features=True,
                 model_name="facebook/wav2vec2-base", sampling_rate=16000):
        self.original_dataset = original_dataset
        self.cache_features = cache_features
        self.cached_features = {} if cache_features else None
        self.sampling_rate = sampling_rate
        # Pin the device at construction time. Reading the global on every access
        # would send inputs to a different device than the model if the global
        # were rebound after this wrapper was built.
        self.device = device

        backbone_key = (model_name, str(self.device))
        if backbone_key not in Wav2VecFeatureDataset._shared_backbones:
            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
            wav2vec_model = Wav2Vec2Model.from_pretrained(model_name)
            wav2vec_model.to(self.device)
            # Evaluation mode: the model is only used as a frozen feature extractor.
            wav2vec_model.eval()
            Wav2VecFeatureDataset._shared_backbones[backbone_key] = (feature_extractor, wav2vec_model)
        self.feature_extractor, self.wav2vec_model = Wav2VecFeatureDataset._shared_backbones[backbone_key]

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.original_dataset)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``, using the cache if possible."""
        audio, label = self.original_dataset[idx]

        # Cache hit: skip the model entirely.
        if self.cache_features and idx in self.cached_features:
            return self.cached_features[idx], label

        # The feature extractor is given a numpy array when the audio is a tensor.
        waveform = audio.cpu().numpy() if isinstance(audio, torch.Tensor) else audio
        with torch.no_grad():
            inputs = self.feature_extractor(
                waveform,
                sampling_rate=self.sampling_rate,
                return_tensors="pt"
            )
            # Inputs must live on the same device as the model.
            inputs = {key: value.to(self.device) for key, value in inputs.items()}
            outputs = self.wav2vec_model(**inputs)
            # Drop the batch dimension added by the feature extractor.
            features = outputs.last_hidden_state.squeeze(0)

        if self.cache_features:
            self.cached_features[idx] = features

        return features, label

In [6]:
# Create DataLoaders with wav2vec feature extraction
def create_feature_dataloaders(train_dataset, val_dataset, test_dataset, batch_size, num_workers=0):
    """Wrap each split in ``Wav2VecFeatureDataset`` and build its DataLoader.

    Args:
        train_dataset: Training split yielding ``(audio, label)``.
        val_dataset: Validation split yielding ``(audio, label)``.
        test_dataset: Test split yielding ``(audio, label)``.
        batch_size: Number of samples per batch for all three loaders.
        num_workers: Worker processes per loader. Defaults to 0 (load in the main
            process). ``Wav2VecFeatureDataset`` runs the wav2vec model inside
            ``__getitem__`` and keeps its feature cache on the dataset instance;
            worker processes operate on copies of the dataset, so features cached
            in a worker never reach the parent process, and a model that lives on
            an accelerator is not safe to use from worker subprocesses.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``. Only the training
        loader shuffles.
    """
    def _make_loader(split_dataset, shuffle):
        # One construction path for all splits keeps their settings in sync.
        return DataLoader(
            Wav2VecFeatureDataset(split_dataset),
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers
        )

    train_loader = _make_loader(train_dataset, shuffle=True)
    val_loader = _make_loader(val_dataset, shuffle=False)
    test_loader = _make_loader(test_dataset, shuffle=False)

    return train_loader, val_loader, test_loader

In [None]:
# Build the train / validation / test loaders over the wav2vec-wrapped splits.
batch_size = 32
train_loader, val_loader, test_loader = create_feature_dataloaders(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=batch_size,
)