In [None]:
import pandas as pd


df = pd.read_csv('detections_master.csv')

c = df[df['confidence'] > 0.94]
print(c['species_common'].unique())



["Richard's Pipit" 'Black-crowned Night-Heron' 'Caspian Tern'
 'Asian Palm-Swift' 'White-breasted Waterhen' 'Oriental Scops-Owl'
 'Cattle Egret' 'Common Greenshank']


In [14]:
import pandas as pd
import os
from pydub import AudioSegment
from pathlib import Path

# Configuration
CONFIDENCE_THRESHOLD = 0.94
NUM_CLIPS_PER_SPECIES = 5
AUDIO_DIR = Path('data/all-audio')
OUTPUT_DIR = Path('high_confidence_clips')

# Target species
TARGET_SPECIES = [
    "Richard's Pipit",
    'Black-crowned Night-Heron',
    'Caspian Tern',
    'Asian Palm-Swift',
    'White-breasted Waterhen',
    'Oriental Scops-Owl',
    'Cattle Egret',
    'Common Greenshank'
]

def extract_audio_clip(audio_path, start_sec, end_sec, output_path):
    """Extract a segment from an audio file."""
    try:
        # Load the audio file
        audio = AudioSegment.from_wav(audio_path)

        # Extract the segment (pydub uses milliseconds)
        start_ms = int(start_sec * 1000)
        end_ms = int(end_sec * 1000)
        clip = audio[start_ms:end_ms]

        # Export the clip
        clip.export(output_path, format='wav')
        return True
    except Exception as e:
        print(f"Error extracting clip from {audio_path}: {e}")
        return False

def main():
    # Read the detections CSV
    df = pd.read_csv('detections_master.csv')

    # Filter for high confidence detections
    high_conf_df = df[df['confidence'] > CONFIDENCE_THRESHOLD]

    # Create output directory
    OUTPUT_DIR.mkdir(exist_ok=True)

    # Process each target species
    for species in TARGET_SPECIES:
        print(f"\n{'='*60}")
        print(f"Processing: {species}")
        print(f"{'='*60}")

        # Filter for this species
        species_df = high_conf_df[high_conf_df['species_common'] == species]

        if len(species_df) == 0:
            print(f"  No high-confidence detections found for {species}")
            continue

        # Sort by confidence (descending) and take top N
        top_detections = species_df.nlargest(NUM_CLIPS_PER_SPECIES, 'confidence')

        print(f"  Found {len(species_df)} high-confidence detections")
        print(f"  Extracting top {len(top_detections)} clips")

        # Create species directory
        species_dir = OUTPUT_DIR / species.replace("'", "").replace(" ", "_")
        species_dir.mkdir(exist_ok=True)

        # Extract each clip
        for idx, row in top_detections.iterrows():
            # Construct the full path to the audio file
            audio_file_path = AUDIO_DIR / row['file_name']

            if not audio_file_path.exists():
                print(f"  WARNING: Audio file not found: {audio_file_path}")
                continue

            # Create output filename with metadata
            species_name_clean = row['species_common'].replace(' ', '_').replace("'", '')
            output_filename = (
                f"{species_name_clean}_"
                f"conf{row['confidence']:.3f}_"
                f"{row['date']}_"
                f"{row['hour']:02d}{row['minute']:02d}_"
                f"site{row['site_id']}_"
                f"sensor{row['sensor_id']}.wav"
            )

            output_path = species_dir / output_filename

            # Extract the clip
            success = extract_audio_clip(
                audio_file_path,
                row['start_time_sec'],
                row['end_time_sec'],
                output_path
            )

            if success:
                print(f"  ✓ Extracted: {output_filename}")
                print(f"    Confidence: {row['confidence']:.4f}")
                print(f"    Time: {row['start_time_sec']:.1f}s - {row['end_time_sec']:.1f}s")
            else:
                print(f"  ✗ Failed: {output_filename}")

    print(f"\n{'='*60}")
    print(f"Extraction complete! Clips saved to: {OUTPUT_DIR.absolute()}")
    print(f"{'='*60}")


main()



Processing: Richard's Pipit
  Found 1 high-confidence detections
  Extracting top 1 clips
  ✓ Extracted: Richards_Pipit_conf0.962_2025-02-04_0100_site3_sensor7901.wav
    Confidence: 0.9624
    Time: 135.0s - 138.0s

Processing: Black-crowned Night-Heron
  Found 480 high-confidence detections
  Extracting top 5 clips
  ✓ Extracted: Black-crowned_Night-Heron_conf1.000_2025-02-05_0515_site4_sensor7902.wav
    Confidence: 0.9995
    Time: 354.0s - 357.0s
  ✓ Extracted: Black-crowned_Night-Heron_conf0.999_2025-02-04_1815_site3_sensor7901.wav
    Confidence: 0.9994
    Time: 372.0s - 375.0s
  ✓ Extracted: Black-crowned_Night-Heron_conf0.999_2025-02-04_1800_site3_sensor7901.wav
    Confidence: 0.9994
    Time: 117.0s - 120.0s
  ✓ Extracted: Black-crowned_Night-Heron_conf0.999_2025-02-05_0530_site4_sensor7902.wav
    Confidence: 0.9991
    Time: 378.0s - 381.0s
  ✓ Extracted: Black-crowned_Night-Heron_conf0.999_2025-02-04_1800_site3_sensor7901.wav
    Confidence: 0.9989
    Time: 63.0s - 66.

main()