<a href="https://colab.research.google.com/github/bachaudhry/kaggle_birdCLEF_25/blob/main/BirdCLEF25_03_Baseline_Precomputed_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem; every /content/drive/... path used
# later in this notebook depends on this mount.
from google.colab import drive
drive.mount('/content/drive')

import os
# Point the Kaggle tooling at a config directory kept on Drive
# (presumably where kaggle.json lives — confirm).
os.environ["KAGGLE_CONFIG_DIR"] = "/content/drive/MyDrive/Kaggle"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q kaggle

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys, gc, random, math, time, copy , zipfile, tarfile, shutil, subprocess, json
from pathlib import Path
from tqdm.notebook import tqdm
import IPython.display as ipd
from IPython.display import display, clear_output
import ipywidgets as widgets

import librosa
import librosa.display
import soundfile as sf

import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import torch.amp as amp

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, average_precision_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Make the project's helper directory on Drive importable, then pull in the shared helpers.
# NOTE(review): `utils` is resolved from that directory — presumably a utils.py (or
# package) inside it; confirm. Requires Drive to be mounted first.
sys.path.append('/content/drive/MyDrive/Kaggle/Bird_CLEF25/utils')
from utils import Config, BirdClefDataset, create_target_tensor, seed_everything, process_gzipped

In [None]:
# Notebook-wide configuration object; the Drive locations below are attached to it
# as attributes so later cells read paths from one place.
cfg = Config()
# Root of the original competition data on Drive (train.csv, audio and metadata)
cfg.BASE_DATA_PATH = Path("/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/birdclef-2025")
# Drive location of the precomputed .npy spectrograms (stored zipped, per the folder name)
cfg.PRECOMPUTED_SPECS_PATH = Path("/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/precomputed-specs-np-zipped")
# Path to a local (Colab disk) copy of the specs — currently disabled
#cfg.LOCAL_SPECS_PATH = Path("/content/precomputed_spectrograms")
# Training metadata CSV
cfg.TRAIN_METADATA_PATH = Path("/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/birdclef-2025/train.csv")

In [None]:
# --- Pick the compute device, then seed the RNGs ---
# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
# Seed value comes from the shared config object.
seed_everything(cfg.SEED)

Using device: cpu
Seeded everything with: 42


In [None]:
# The start method only matters when worker processes are requested.
if cfg.NUM_WORKERS > 0:
    try:
        # Compare the start-method *name*. `mp` is torch.multiprocessing, whose
        # `SpawnContext` is the (deprecated) process handle returned by
        # `torch.multiprocessing.spawn`, not a multiprocessing context class, so an
        # isinstance check against it can never match.
        # allow_none=True queries the method without fixing the platform default
        # as a side effect.
        if mp.get_start_method(allow_none=True) != 'spawn':
            mp.set_start_method('spawn', force=True)
            print("Set multiprocessing start method to 'spawn'.")
        else:
            print("Multiprocessing start method already set to 'spawn'.")
    except RuntimeError as e:
        # Best effort: report and keep going with whatever method is active.
        print(f"Could not set start method (might be already set or first run): {e}")

Set multiprocessing start method to 'spawn'.


In [None]:
# --- Load Metadata ---
# Fail fast when the CSV is missing: later cells use `train_df`, so merely printing
# a message here would only resurface as a NameError further down the notebook.
if not cfg.TRAIN_METADATA_PATH.exists():
    raise FileNotFoundError(f"Metadata file not found at {cfg.TRAIN_METADATA_PATH}")

train_df = pd.read_csv(cfg.TRAIN_METADATA_PATH)
print(f"Train metadata loaded. Shape: {train_df.shape}")

Train metadata loaded. Shape: (28564, 13)


In [None]:
!pip install gdown
!apt-get install pigz rsync -qq



In [None]:
import gdown
## Testing improved download and unzip function
def _drive_file_id(file_ref):
    """Return the bare Google Drive file ID for a share URL or an already-bare ID.

    Handles ``.../file/d/<id>/view?...`` and ``...?id=<id>`` style links. A string
    without any ``/`` is assumed to already be an ID and is returned unchanged.

    Raises:
        ValueError: if no ID can be found in a URL-like string.
    """
    from urllib.parse import parse_qs, urlparse

    if "/" not in file_ref:
        return file_ref

    parsed = urlparse(file_ref)
    segments = [part for part in parsed.path.split("/") if part]
    # Share links look like /file/d/<id>/view — the ID is the segment after "d".
    if "d" in segments[:-1]:
        return segments[segments.index("d") + 1]
    # Direct-download links carry the ID in the query string instead.
    query_ids = parse_qs(parsed.query).get("id")
    if query_ids:
        return query_ids[0]
    raise ValueError(f"Could not find a Google Drive file ID in: {file_ref!r}")


def process_gzippedV2(file_id, output_path=None):
    """Download a gzipped tar archive from Google Drive and unpack it on local disk.

    Args:
        file_id: Google Drive share URL (``.../file/d/<id>/view``), a ``?id=<id>``
            link, or a bare file ID.
        output_path: Optional path relative to ``/content/drive/MyDrive``. When
            given, the extracted tree is additionally copied there with rsync.

    Returns:
        The local extraction directory (``/content/temp_data/extracted``).

    Raises:
        ValueError: if no file ID can be parsed from ``file_id``.
        RuntimeError: if gdown signals a failed download by returning ``None``.
        subprocess.CalledProcessError: if extraction or the rsync copy fails.
        Exceptions raised by gdown itself (e.g. quota errors) propagate unchanged.
    """
    local_temp_dir = "/content/temp_data"
    local_extract_path = os.path.join(local_temp_dir, "extracted")
    os.makedirs(local_extract_path, exist_ok=True)

    # Download using gdown
    print("Downloading compressed file from Google Drive...")
    url = f'https://drive.google.com/uc?id={_drive_file_id(file_id)}'
    # The trailing separator tells gdown that `output` is a directory, so the file
    # keeps its remote name inside it instead of colliding with the directory path.
    downloaded = gdown.download(url, output=local_temp_dir + os.sep, quiet=False)
    if downloaded is None:
        raise RuntimeError(f"gdown could not download {url}")
    compressed_path = os.path.abspath(downloaded)

    # Extract, preferring pigz for parallel decompression when it is installed
    print(f"Extracting {os.path.basename(compressed_path)}...")
    plain_tar_cmd = ['tar', '-xzf', compressed_path, '-C', local_extract_path]
    if shutil.which('pigz'):
        try:
            subprocess.run(['tar', '-I', 'pigz', '-xf', compressed_path, '-C', local_extract_path],
                           check=True)
        except subprocess.CalledProcessError:
            # Best effort: retry with tar's built-in gzip support.
            subprocess.run(plain_tar_cmd, check=True)
    else:
        subprocess.run(plain_tar_cmd, check=True)

    # Clean up the archive only after a successful extraction, so a failed run
    # does not force another download.
    os.remove(compressed_path)

    # Optional copy of the extracted tree back to Drive
    if output_path:
        print("⏫ Starting Drive upload...")
        drive_output_path = os.path.join('/content/drive/MyDrive', output_path)
        # rsync only creates the last path component, so make sure parents exist.
        os.makedirs(drive_output_path, exist_ok=True)

        # Trailing '/' on the source copies the directory's contents, not the directory.
        subprocess.run([
            'rsync', '-a', '--info=progress2',
            local_extract_path + '/',
            drive_output_path
        ], check=True)

    print("✅ All operations completed!")
    return local_extract_path


In [None]:
# Share link of the archive on Google Drive; process_gzippedV2 derives the direct
# download URL from it.
# NOTE: this download can fail with gdown's FileURLRetrievalError when Drive's
# per-file download quota is exhausted; the next cell uses process_gzipped on the
# mounted Drive folder instead.
file_id = 'https://drive.google.com/file/d/1Ji5acgpHlyyhd8vI8gyQlN1nkjh16MwN/view?usp=drive_link'
local_extract = process_gzippedV2(file_id)

Downloading compressed file from Google Drive...


FileURLRetrievalError: Failed to retrieve file url:

	Too many users have viewed or downloaded this file recently. Please
	try accessing the file again later. If the file you are trying to
	access is particularly large or is shared with many people, it may
	take up to 24 hours to be able to view or download the file. If you
	still can't access a file after 24 hours, contact your domain
	administrator.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1Ji5acgpHlyyhd8vI8gyQlN1nkjh16MwN

but Gdown can't. Please check connections and permissions.

In [None]:
# Copy the zipped spec folder from the mounted Drive to the Colab disk and extract it there.
# NOTE(review): this literal repeats the value assigned to cfg.PRECOMPUTED_SPECS_PATH — keep
# the two in sync. What process_gzipped returns is not visible here — confirm before
# relying on `local_extract`.
input_path = "/content/drive/MyDrive/Kaggle/Bird_CLEF25/data/precomputed-specs-np-zipped"
local_extract = process_gzipped(input_path)

Copying precomputed-specs-np-zipped from Drive to Colab...
Extracting precomputed-specs-np-zipped...


Extracting files: 100%|██████████| 187905/187905 [06:03<00:00, 517.40it/s]


Operation completed successfully!


In [None]:
# Repointing cfg at a local specs path is currently disabled:
#cfg.PRECOMPUTED_SPECS_PATH = cfg.LOCAL_SPECS_PATH
# Sanity check: collect the .npy files sitting directly inside the extracted folder
# (non-recursive) and report how many were found.
local_specs_path = Path("/content/temp_data/extracted/kaggle/working/precomputed_specs_np")
all_precomputed_files = [*local_specs_path.glob("*.npy")]
print(f"Found {len(all_precomputed_files)} precomputed .npy files.")

Found 187904 precomputed .npy files.


In [None]:
# Build the class vocabulary from the primary labels and store both lookup
# directions on cfg; indices follow the sorted label order.
unique_labels = sorted(train_df['primary_label'].unique())
cfg.NUM_CLASSES = len(unique_labels)
cfg.LABEL_TO_INT = dict(zip(unique_labels, range(len(unique_labels))))
cfg.INT_TO_LABEL = dict(enumerate(unique_labels))
# Integer-encoded copy of the primary label, added as a new column on train_df.
train_df['primary_label_int'] = train_df['primary_label'].map(cfg.LABEL_TO_INT)
print(f"{cfg.NUM_CLASSES} unique classes found.")

206 unique classes found.
