<a href="https://colab.research.google.com/github/bachaudhry/kaggle_birdCLEF_25/blob/main/BirdCLEF25_03_Baseline_Precomputed_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

from google.colab import drive

# Mount Google Drive; the project data and helper code referenced below live under it.
drive.mount('/content/drive')

# Point the Kaggle tooling at the config directory kept on Drive.
os.environ["KAGGLE_CONFIG_DIR"] = "/content/drive/MyDrive/Kaggle"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q kaggle

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys, gc, random, math, time, copy , zipfile, tarfile, shutil, subprocess, json
from pathlib import Path
from tqdm.notebook import tqdm
import IPython.display as ipd
from IPython.display import display, clear_output
import ipywidgets as widgets

import librosa
import librosa.display
import soundfile as sf

import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import torch.amp as amp

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, average_precision_score
from sklearn.preprocessing import LabelEncoder

In [4]:
# Make the project's helper module (stored on Drive) importable.
# Guarded so that re-running this cell does not append duplicate sys.path entries.
UTILS_DIR = '/content/drive/MyDrive/Kaggle/Bird_CLEF25/utils'
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)
from utils import Config, BirdClefDataset, create_target_tensor, seed_everything, process_gzipped

In [5]:
# Project configuration object (defined in the project's utils module); the
# Drive-specific locations used by this notebook are attached to it below.
cfg = Config()
# Path to original train.csv, audio and metadata
cfg.BASE_DATA_PATH = Path("/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/birdclef-2025")
# Path to npy files (zipped archive on Drive; repointed to the local extraction
# directory later in the notebook)
cfg.PRECOMPUTED_SPECS_PATH = Path("/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/precomputed-specs-np-zipped")
# Path to local specs (currently disabled)
#cfg.LOCAL_SPECS_PATH = Path("/content/precomputed_spectrograms")
# Training meta data
cfg.TRAIN_METADATA_PATH = Path("/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/birdclef-2025/train.csv")

In [6]:
# --- Set Device & Seed ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
seed_everything(cfg.SEED)

Using device: cuda
Seeded everything with: 42


In [7]:
# Worker processes are only created when NUM_WORKERS > 0, so the start method
# only matters in that case.
# NOTE(review): 'spawn' is presumably required because the workers touch CUDA — confirm.
if cfg.NUM_WORKERS > 0:
    try:
        # Compare the configured start-method *name*. The previous isinstance check
        # against mp.SpawnContext could never be true (in torch.multiprocessing that
        # class is the handle returned by torch.multiprocessing.spawn, not a start-method
        # context), and mp.get_context(None) pins the default context as a side effect.
        if mp.get_start_method(allow_none=True) != 'spawn':
            mp.set_start_method('spawn', force=True)
            print("Set multiprocessing start method to 'spawn'.")
        else:
            print("Multiprocessing start method already set to 'spawn'.")
    except RuntimeError as e:
        print(f"Could not set start method (might be already set or first run): {e}")

Set multiprocessing start method to 'spawn'.


In [8]:
# --- Load Metadata ---
if not cfg.TRAIN_METADATA_PATH.exists():
    print(f"ERROR: Metadata file not found at {cfg.TRAIN_METADATA_PATH}")
    # Stop execution or handle
else:
    train_df = pd.read_csv(cfg.TRAIN_METADATA_PATH)
    print(f"Train metadata loaded. Shape: {train_df.shape}")

Train metadata loaded. Shape: (28564, 13)


In [None]:
# NOTE: Disabled experiment, kept for reference only. Every line in this cell must stay
# commented out; two continuation lines (`check=True)`) were previously left live and made
# the cell raise IndentationError when executed.
#import gdown
## Testing improved download and unzip function
#def process_gzippedV2(input_path, output_path=None):
#  local_temp_dir = "/content/temp_data"
  #local_extract_path = os.path.join(local_temp_dir, "extracted")
 # os.makedirs(local_extract_path, exist_ok=True)
  #os.makedirs(local_temp_dir, exist_ok=True)

  # Download using gdown - timeout issues
  #print("Downloading compressed file from Google Drive...")
  #url = f'https://drive.google.com/uc?id={file_id.split("/")[-2]}'
  #compressed_path = gdown.download(url, output=local_temp_dir, quiet=False)
  # Ensure compressed_path is a file, not the directory
  #compressed_path = os.path.join(local_temp_dir, compressed_path)  # Corrected line

    # Get filename and local paths
  #filename = os.path.basename(input_path)
  #local_compressed_path = os.path.join(local_temp_dir, filename)
  #local_extract_path = os.path.join(local_temp_dir, 'extracted')
  #os.makedirs(local_extract_path, exist_ok=True)

  # Download file from Drive to Colab
  #print(f"Copying {filename} from Drive to Colab...")
  #drive_path = os.path.join('/content/drive/MyDrive/', input_path)
  #shutil.copy2(drive_path, local_compressed_path) # Slower but more robust

  # Extract with parallel decompression (if possible/available)
  #print(f"Extracting {os.path.basename(local_compressed_path)}...")
  #try:
    # Using pigz for parallel decompression
    #subprocess.run(['pigz', '--version'], check=True)
    #subprocess.run(['tar', '-I', 'pigz', '-xf', local_compressed_path, '-C', local_extract_path],
                   #check=True)
  #except:
    # Fallback to tar
    #subprocess.run(['tar', '-xzf', local_compressed_path, '-C', local_extract_path],
                   #check=True)

  # Clean up compressed file
  #os.remove(local_compressed_path)

  # Optional upload to drive
  #if output_path:
       #print("⏫ Starting Drive upload...")
        #drive_output_path = os.path.join('/content/drive/MyDrive', output_path)

        # Use parallel upload with rsync
        #subprocess.run([
            #'rsync', '-a', '--info=progress2',
            #local_extract_path + '/',
            #drive_output_path
        #], check=True)

  #print("✅ All operations completed!")
  #return local_extract_path


In [None]:
#file_id = 'https://drive.google.com/file/d/1Ji5acgpHlyyhd8vI8gyQlN1nkjh16MwN/view?usp=drive_link'
#input_path= "/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/precomputed-specs-np-zipped"
#local_extract = process_gzippedV2(input_path)

Copying precomputed-specs-np-zipped from Drive to Colab...
Extracting precomputed-specs-np-zipped...
✅ All operations completed!


In [10]:
# Download zipped folder and extract to local
input_path = "/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/precomputed-specs-np-zipped"
local_extract = process_gzipped(input_path)

Copying precomputed-specs-np-zipped from Drive to Colab...
Extracting precomputed-specs-np-zipped...


Extracting files: 100%|██████████| 187905/187905 [04:55<00:00, 635.29it/s]


Operation completed successfully!


In [11]:
# Update precomputed specs path
# Sanity check on the extraction: count the spectrogram arrays that landed on local disk.
# (cfg.PRECOMPUTED_SPECS_PATH is repointed to this directory in the next cell.)
local_specs_path = Path("/content/temp_data/extracted/kaggle/working/precomputed_specs_np")

# Non-recursive scan: only .npy files directly inside the directory are collected.
all_precomputed_files = [npy_file for npy_file in local_specs_path.glob("*.npy")]

print(f"Found {len(all_precomputed_files)} precomputed .npy files.")

Found 187904 precomputed .npy files.


In [12]:
# Repoint the config from the zipped archive on Drive to the locally extracted
# .npy directory that was checked in the previous cell.
cfg.PRECOMPUTED_SPECS_PATH = local_specs_path

In [13]:
# Create label mappings
unique_labels = sorted(train_df['primary_label'].unique())
cfg.NUM_CLASSES = len(unique_labels)
cfg.LABEL_TO_INT = {label: i for i, label in enumerate(unique_labels)}
cfg.INT_TO_LABEL = {i: label for label, i in cfg.LABEL_TO_INT.items()}
train_df['primary_label_int'] = train_df['primary_label'].map(cfg.LABEL_TO_INT)
print(f"{cfg.NUM_CLASSES} unique classes found.")

206 unique classes found.




---



In [None]:
#manifest_path = Path("/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/")

The next step creates a manifest file that maps each precomputed spectrogram back to its original recording in the training metadata file (`train.csv`).

If done correctly, this cell only needs to be run once: it creates a `manifest.csv` file, which is then stored in Google Drive. Afterwards, the cell can be commented out and replaced with a new cell that simply loads the manifest file from its stored location.



---



In [None]:
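# --- CREATE MANIFEST REVISED  ---
# NOTE: This cell is intentionally disabled; it is the one-off manifest generation step and
# the output captured below comes from its last real run.
# As written it saves the manifest to /content/temp_data/manifest.csv, while the loading cell
# reads it from Google Drive — presumably the file was copied there manually (TODO confirm).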

# --- Load Metadata ---
#if not cfg.TRAIN_METADATA_PATH.exists():
#    print(f"ERROR: Metadata file not found at {cfg.TRAIN_METADATA_PATH}")
#    raise FileNotFoundError(f"Metadata missing: {cfg.TRAIN_METADATA_PATH}")

#train_df = pd.read_csv(cfg.TRAIN_METADATA_PATH)
#print(f"Train metadata loaded. Shape: {train_df.shape}")

# --- Create Label Mappings ---
#unique_labels = sorted(train_df['primary_label'].unique())
#cfg.NUM_CLASSES = len(unique_labels)
#cfg.LABEL_TO_INT = {label: i for i, label in enumerate(unique_labels)}
#cfg.INT_TO_LABEL = {i: label for label, i in cfg.LABEL_TO_INT.items()}
#train_df['primary_label_int'] = train_df['primary_label'].map(cfg.LABEL_TO_INT)
#print(f"{cfg.NUM_CLASSES} unique classes found.")

# --- Scan Precomputed Files and Create clip_samples & Manifest ---
#all_precomputed_files = list(cfg.PRECOMPUTED_SPECS_PATH.glob("*.npy"))
#print(f"Found {len(all_precomputed_files)} precomputed .npy files.")

#clip_samples = []
#manifest_data = []

#if not all_precomputed_files:
#    print("ERROR: No precomputed files found.")
#else:
#    print("Generating clip_info_list and manifest from precomputed files and train_df...")

    # --- Create the lookup dictionary ---
    # Key: The part of the .npy filename BEFORE "_clipIDX"
    # Value: (primary_label_int, original_filename_from_train_df)
#    filename_to_label_map = {}
#    for _, row in train_df.iterrows():
#        original_filename_from_train_df = row['filename']
        # THIS IS THE KEY ASSUMPTION:
        # The key should be the original filename as it appears in train_df,
        # if your .npy files are like "TRAIN_DF_FILENAME_clipIDX.npy"
        # If train_df filenames have '/', and .npy files have them replaced with '_', adjust here.
        # For now, assume .npy naming directly uses train_df['filename'] (potentially with slashes replaced)

        # Let's assume your .npy files look like: "original_filename_from_train_df_WITH_SLASHES_REPLACED_clipIDX.npy"
        # AND that ".ogg" is part of this stem in the .npy file.
#        key_for_map = original_filename_from_train_df.replace('/', '_') # Example: "subdir_file.ogg"

#        filename_to_label_map[key_for_map] = (row['primary_label_int'], original_filename_from_train_df)

    # --- DEBUG: Print some keys from the map ---
 #   print(f"Generated {len(filename_to_label_map)} keys for lookup map.")
 #   print("Sample keys from filename_to_label_map (first 5):")
 #   for i, key in enumerate(filename_to_label_map.keys()):
 #       if i < 5:
 #           print(f"  '{key}' -> {filename_to_label_map[key]}")
 #       else:
 #           break
    # --- END DEBUG ---

 #   processed_count = 0
 #   for spec_path in tqdm(all_precomputed_files, desc="Mapping precomputed files"):
 #       try:
 #           npy_filename_stem_full = spec_path.stem # e.g., "21211_XC934741.ogg_clip0"
 #           parts = npy_filename_stem_full.rsplit('_clip', 1) # Use rsplit to split on the *last* occurrence

 #           if len(parts) != 2:
 #               print(f"Warning: Could not parse clip index from '{spec_path.name}' using '_clip'. Skipping. Full stem: '{npy_filename_stem_full}'")
 #               continue

 #           parsed_name_stem_from_npy = parts[0] # This should be the key we look up, e.g., "21211_XC934741.ogg"
                                                # or "ebird_code_XC12345.ogg"

            # --- DEBUG: Print parsed stem from .npy ---
 #           if processed_count < 5: # Print for the first 5 .npy files
 #                print(f"  Attempting to match .npy parsed stem: '{parsed_name_stem_from_npy}'")
            # --- END DEBUG ---

 #           clip_idx_str = parts[1]
 #           if not clip_idx_str.isdigit():
 #               print(f"Warning: Clip index part '{clip_idx_str}' is not a digit in '{spec_path.name}'. Skipping.")
 #               continue
 #           clip_idx = int(clip_idx_str)


#            if parsed_name_stem_from_npy in filename_to_label_map:
#                primary_label_int, original_full_filename = filename_to_label_map[parsed_name_stem_from_npy]

#                clip_entry = {
#                    'original_filename': original_full_filename,
#                    'spec_npy_filename': spec_path.name,
#                    'clip_index': clip_idx,
#                    'primary_label_int': primary_label_int
#                }
#                clip_samples.append(clip_entry)
#                manifest_data.append(clip_entry)
#            else:
#                if processed_count < 20: # Print warning only for the first few mismatches to avoid flooding
#                    print(f"Warning: No match in train_df lookup for parsed .npy stem '{parsed_name_stem_from_npy}' from file '{spec_path.name}'")

#            processed_count += 1

#        except Exception as e:
#            print(f"Error parsing or mapping {spec_path.name}: {e}")

#    cfg.TOTAL_CLIPS = len(clip_samples)
#    print(f"Created {cfg.TOTAL_CLIPS} clip samples from precomputed files.")

#    if not clip_samples:
#         print("CRITICAL ERROR: clip_samples list is empty. Check mapping logic and .npy filenames based on debug output.")
#    else:
#        manifest_df = pd.DataFrame(manifest_data)
#        manifest_save_path = "/content/temp_data/manifest.csv"
#        try:
#            manifest_df.to_csv(manifest_save_path, index=False)
#            print(f"Manifest file saved to: {manifest_save_path}")
#            print(manifest_df.head())
#        except Exception as e:
#            print(f"Error saving manifest to {manifest_save_path}: {e}")
#            print("Attempting to save to /content/manifest.csv instead...")
#            manifest_save_path_content = Path("/content/manifest.csv")
#            manifest_df.to_csv(manifest_save_path_content, index=False)
#            print(f"Manifest file saved to: {manifest_save_path_content}")
#            print(manifest_df.head())



Train metadata loaded. Shape: (28564, 13)
206 unique classes found.
Found 187904 precomputed .npy files.
Generating clip_info_list and manifest from precomputed files and train_df...
Generated 28564 keys for lookup map.
Sample keys from filename_to_label_map (first 5):
  '1139490_CSA36385.ogg' -> (0, '1139490/CSA36385.ogg')
  '1139490_CSA36389.ogg' -> (0, '1139490/CSA36389.ogg')
  '1192948_CSA36358.ogg' -> (1, '1192948/CSA36358.ogg')
  '1192948_CSA36366.ogg' -> (1, '1192948/CSA36366.ogg')
  '1192948_CSA36373.ogg' -> (1, '1192948/CSA36373.ogg')


Mapping precomputed files:   0%|          | 0/187904 [00:00<?, ?it/s]

  Attempting to match .npy parsed stem: 'strowl1_XC48736.ogg'
  Attempting to match .npy parsed stem: 'roahaw_XC113755.ogg'
  Attempting to match .npy parsed stem: 'bkcdon_XC250791.ogg'
  Attempting to match .npy parsed stem: 'bkmtou1_XC821017.ogg'
  Attempting to match .npy parsed stem: 'yebsee1_XC216248.ogg'
Created 187904 clip samples from precomputed files.
Manifest file saved to: /content/temp_data/manifest.csv
      original_filename                spec_npy_filename  clip_index  \
0   strowl1/XC48736.ogg    strowl1_XC48736.ogg_clip1.npy           1   
1   roahaw/XC113755.ogg    roahaw_XC113755.ogg_clip1.npy           1   
2   bkcdon/XC250791.ogg    bkcdon_XC250791.ogg_clip2.npy           2   
3  bkmtou1/XC821017.ogg  bkmtou1_XC821017.ogg_clip17.npy          17   
4  yebsee1/XC216248.ogg  yebsee1_XC216248.ogg_clip11.npy          11   

   primary_label_int  
0                175  
1                144  
2                 70  
3                 71  
4                197  


In [15]:
# Load manifest data
manifest_load_path = Path("/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/manifest.csv")
if manifest_load_path.exists():
    manifest_df = pd.read_csv(manifest_load_path)
    print(f"Manifest loaded from {manifest_load_path}")
    print(manifest_df.head(15))

    # Ensure correct data types
    manifest_df['clip_index'] = manifest_df['clip_index'].astype(int)
    manifest_df['primary_label_int'] = manifest_df['primary_label_int'].astype(int)

    clip_samples = manifest_df.to_dict('records') # Convert DF rows to list of dicts
    cfg.TOTAL_CLIPS = len(clip_samples)
    print(f"Loaded {cfg.TOTAL_CLIPS} clip samples from mainfest.")
    print("Sample loaded clip_info: ", clip_samples[0] if clip_samples else "N/A")
else:
  print(f"Error: Manifest not found at {manifest_load_path}")
  raise FileNotFoundError(f"Manifest missing: {manifest_load_path}")

Manifest loaded from /content/drive/MyDrive/Kaggle/Bird_CLEF25/data/manifest.csv
          original_filename                  spec_npy_filename  clip_index  \
0       strowl1/XC48736.ogg      strowl1_XC48736.ogg_clip1.npy           1   
1       roahaw/XC113755.ogg      roahaw_XC113755.ogg_clip1.npy           1   
2       bkcdon/XC250791.ogg      bkcdon_XC250791.ogg_clip2.npy           2   
3      bkmtou1/XC821017.ogg    bkmtou1_XC821017.ogg_clip17.npy          17   
4      yebsee1/XC216248.ogg    yebsee1_XC216248.ogg_clip11.npy          11   
5       roahaw/XC925789.ogg     roahaw_XC925789.ogg_clip16.npy          16   
6   grbhaw1/iNat1156713.ogg  grbhaw1_iNat1156713.ogg_clip0.npy           0   
7       grnkin/XC452703.ogg      grnkin_XC452703.ogg_clip1.npy           1   
8      paltan1/XC375634.ogg    paltan1_XC375634.ogg_clip27.npy          27   
9      cotfly1/XC817211.ogg     cotfly1_XC817211.ogg_clip2.npy           2   
10     wbwwre1/XC456410.ogg     wbwwre1_XC456410.ogg_clip0.np

## Training / Validation Split

In [None]:
if 'clip_samples' not in globals() or not clip_samples:
  print("Error: clip_samples is not loaded, run the previous steps first!")
  raise ValueError("clip_samples not loaded. Can't proceed with data splitting.")
else:
  clip_df_for_split = pd.DataFrame(clip_samples)

  # Filter classes with only one sample - for stratification
  label_counts = clip_df_for_split['primary_label_int'].value_counts()
  single_sample_labels = label_counts[label_counts == 1].index.tolist()

  if single_sample_labels:
    print(f"Found {len(single_sample_labels)} classes with only 1 precomputed sample clip.")
    # Classes being removed
    clip_df_filtered = clip_df_for_split[~clip_df_for_split['primary_label_int'].isin(single_sample_labels)].copy()
    removed_count = len(clip_df_for_split) - len(clip_df_filtered)
    print(f"Removed {removed_count} clips belonging to single sample classes")
    print(f"Remaining clips for splitting: {len(clip_df_filtered)}")
  else:
    clip_df_filtered = clip_df_for_split.copy()
    print("No classes with only 1 precomputed sample clip found. No filtering applied.")

  if not clip_df_filtered.empty:
    # Indeces of the filteredd dataframe for splitting
    features = clip_df_filtered.index
    labels = clip_df_filtered['primary_label_int']

    try:
      train_indeces, val_indeces = train_test_split(
          features, # split on the df index
          test_size=0.2,
          random_state=cfg.SEED,
          stratify=labels
      )
      # Train and validation lists of clip info dicts
      train_clip_info = clip_df_filtered.loc[train_indeces].to_dict('records')
      val_clip_info = clip_df_filtered.loc[val_indeces].to_dict('records')

      print(f"Training clips: {len(train_clip_info)}")
      print(f"Validation clips: {len(val_clip_info)}")

      # Verify stratification - check distributions in train and val
      train_labels_dist = pd.Series([d['primary_label_int'] for d in train_clip_info]).value_counts(normalize=True)
      val_labels_dist = pd.Series([d['primary_label_int'] for d in val_clip_info]).value_counts(normalize=True)
      print("Example class proportions")
