In [10]:
# Move imports to utility
import os, gc, random
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import IPython.display as ipd
from IPython.display import display, clear_output
import ipywidgets as widgets

import librosa
import librosa.display
import soundfile as sf

import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

NOTE
- See if the imports, along with the preprocessing and augmentation functions, can easily be moved to the utility folder


In [11]:
class Config:
    """Central place for the notebook's tunable constants and dataset paths."""

    # --- Reproducibility ---
    SEED = 42

    # --- Audio ---
    SAMPLE_RATE = 32_000

    # --- Mel spectrogram parameters ---
    N_FFT = 2048
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 50
    FMAX = 14_000

    # --- Clip length (5 s, chosen to align with the submission policy) ---
    TARGET_DURATION_S = 5
    TARGET_SAMPLES = SAMPLE_RATE * TARGET_DURATION_S

    # --- Dataset locations ---
    DATA_PATH = Path("/kaggle/input/birdclef-2025")
    TRAIN_AUDIO_PATH = DATA_PATH / "train_audio"
    TRAIN_METADATA_PATH = DATA_PATH / "train.csv"
    TAXONOMY_PATH = DATA_PATH / "taxonomy.csv"
    # Additional paths (not enabled yet):
    # TRAIN_SOUNDSCAPES_PATH = DATA_PATH / "train_soundscapes"
    # TEST_SOUNDSCAPES_PATH = DATA_PATH / "test_soundscapes"

In [12]:
# Function to seed everything to ensure reproducibility
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only affects subprocesses spawned afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)  # Python's own RNG (e.g. random.shuffle / random.choice)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover every GPU, not just the current one
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False # Change to true if input sizes are kept constant

In [13]:
# Setup: build the config, fix the RNG seeds, then pick the compute device
cfg = Config()
seed_everything(cfg.SEED)
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [14]:
# Load the taxonomy table (for context) and summarise how species split across classes
if not cfg.TAXONOMY_PATH.exists():
    print(f"File not found at {cfg.TAXONOMY_PATH}")
else:
    taxonomy_df = pd.read_csv(cfg.TAXONOMY_PATH)
    print("Taxonomy data loaded")
    print(taxonomy_df.head())
    # Number of taxonomy rows per taxonomic class
    class_counts = taxonomy_df['class_name'].value_counts()
    print("\nClass distribution across taxa: ")
    print(class_counts)

Taxonomy data loaded
  primary_label  inat_taxon_id               scientific_name  \
0       1139490        1139490          Ragoniella pulchella   
1       1192948        1192948         Oxyprora surinamensis   
2       1194042        1194042           Copiphora colombiae   
3        126247         126247       Leptodactylus insularum   
4       1346504        1346504  Neoconocephalus brachypterus   

                    common_name class_name  
0          Ragoniella pulchella    Insecta  
1         Oxyprora surinamensis    Insecta  
2           Copiphora colombiae    Insecta  
3        Spotted Foam-nest Frog   Amphibia  
4  Neoconocephalus brachypterus    Insecta  

Class distribution across taxa: 
class_name
Aves        146
Amphibia     34
Insecta      17
Mammalia      9
Name: count, dtype: int64


Move to the utility file once this is tested

In [15]:
# Mel-spectrogram transform configured from the spectrogram parameters in Config
mel_spectrogram_tfms = T.MelSpectrogram(
    sample_rate=cfg.SAMPLE_RATE,
    n_fft=cfg.N_FFT,
    hop_length=cfg.HOP_LENGTH,
    n_mels=cfg.N_MELS,
    f_min=cfg.FMIN,
    f_max=cfg.FMAX,
)
# Move the transform to the selected device (the GPU when one is available)
mel_spectrogram_tfms = mel_spectrogram_tfms.to(device)

In [17]:
# Convert a power spectrogram to the decibel scale (top_db=80)
amp_to_db_tfms = T.AmplitudeToDB(stype='power', top_db=80)
# Keep the transform on the same device as the Mel-spectrogram transform
amp_to_db_tfms = amp_to_db_tfms.to(device)

In [None]:
# Another one for the utility file
# Then clean up
def pre_process_audio_file(file_path: Path, target_sr=cfg.SAMPLE_RATE,
                          target_samples=cfg.TARGET_SAMPLES) -> list:
    """Load an audio file and convert it into per-clip Mel spectrograms (dB).

    The waveform is loaded with torchaudio, resampled to ``target_sr`` when its
    native rate differs, averaged down to mono, and cut into consecutive
    non-overlapping clips of ``target_samples`` samples. Each clip is passed
    through the module-level ``mel_spectrogram_tfms`` and ``amp_to_db_tfms``
    transforms. Trailing samples that do not fill a whole clip are dropped.

    Args:
        file_path: Path of the audio file to load.
        target_sr: Sample rate the waveform is resampled to. Note that
            ``mel_spectrogram_tfms`` is built with ``cfg.SAMPLE_RATE``, so
            passing a different value here leaves the two out of sync.
        target_samples: Number of samples per clip.

    Returns:
        A list with one tensor of shape [n_mels, time] per full clip, located
        on ``device``. The list is empty when the file is shorter than one
        clip, or when loading/processing fails (the error is printed).
    """
    spectrograms = []
    try:
        # Returns a tensor [channels, time] and the file's native sample rate
        waveform, sr = torchaudio.load(str(file_path))
        # Resample if needed
        if sr != target_sr:
            resampler = T.Resample(orig_freq=sr, new_freq=target_sr).to(waveform.device)
            waveform = resampler(waveform)
        # Convert to mono if there is more than one channel
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        # Ensure the waveform lives on the same device as the transforms
        waveform = waveform.to(device)
        # Only full clips are used; the remainder is discarded
        num_full_clips = waveform.shape[1] // target_samples

        for i in range(num_full_clips):
            start_sample = i * target_samples
            end_sample = start_sample + target_samples
            clip = waveform[:, start_sample:end_sample]  # keep channel dim for tfms
            # Mel spectrogram -> [channel, n_mels, time]
            mel_spec = mel_spectrogram_tfms(clip)
            # Convert to dB scale -> [channel, n_mels, time]
            mel_spec_db = amp_to_db_tfms(mel_spec)
            # Drop the channel dim -> [n_mels, time]
            spectrograms.append(mel_spec_db.squeeze(0))

        # TODO: to also train on the trailing partial clip, zero-pad
        # waveform[:, num_full_clips * target_samples:] up to target_samples
        # with torch.nn.functional.pad before running the transforms.
    except Exception as e:
        # Best effort: report the failing file and return no spectrograms
        print(f"Error processing {Path(file_path).name}: {e}")
        return []
    return spectrograms

In [None]:
# Test on some samples
if 'train_df' in globals