In [10]:
import pandas as pd
from pathlib import Path
from opensoundscape import Audio


In [11]:
# --- Run configuration -------------------------------------------------------
# Upper bound on how many clips are kept per class
num_clips_default = 80

# Input: BirdNET selection table for the model run being validated
annotations_file = '/mnt/d/retraining_BirdNET_2025/iterative_training/results/model_10_M12/BirdNET_SelectionTable.txt'
# Output: root folder that receives the extracted validation segments
# NOTE(review): the input path names "model_10_M12" while this one names
# "it_M10" -- confirm both refer to the same training iteration.
output_dir = '/mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/it_M10/'

In [12]:
# Read the tab-separated BirdNET selection table, keeping only the columns
# listed in the mapping below, and give each one a short working name.
column_aliases = {
    'Begin Path': 'file_path',
    'File Offset (s)': 'offset',
    'Common Name': 'class',
    'Confidence': 'score',
}
predictions = pd.read_csv(
    annotations_file,
    sep='\t',
    usecols=list(column_aliases),
).rename(columns=column_aliases)


In [13]:
# Construct `offset_filename` using ONLY offset and filename with extension,
# i.e. "<offset truncated to an integer>_<file name>".
# A plain comprehension is used instead of a row-wise `DataFrame.apply`:
# on an empty frame `apply(axis=1)` does not return a Series, so the column
# assignment would fail when the table holds no detections. Iterating the two
# columns directly also avoids building a Series object for every row.
predictions['offset_filename'] = [
    f"{int(offset_s)}_{Path(file_path).name}"
    for offset_s, file_path in zip(predictions['offset'], predictions['file_path'])
]

In [14]:
def extract_core_filename(filename):
    """Return tokens 1 through 5 (0-based) of the underscore-split file stem.

    The final extension is removed, the remaining stem is split on ``_``,
    the first token is discarded and at most the next five tokens are
    re-joined with ``_``. In this notebook the function is applied to
    ``offset_filename`` values, whose first token is the integer offset.

    Example: ``"12_A_B_C_D_E_F.wav"`` -> ``"A_B_C_D_E"``.
    Returns an empty string when the stem contains no underscore.
    """
    tokens = Path(filename).stem.split('_')
    # Drop the leading token, then keep no more than five of what follows.
    core_tokens = tokens[1:][:5]
    return '_'.join(core_tokens)

In [15]:
# Derive the comparison key for every prediction from its `offset_filename`
predictions['core_filename'] = predictions['offset_filename'].map(extract_core_filename)

# Report how many predictions were loaded
print(f"Total predictions: {len(predictions)} clips")

# For every class except 'nocall' (matched case-insensitively), keep the
# `num_clips_default` highest-scoring rows. One DataFrame per class is
# collected; they are combined in a later cell.
selected_clips = [
    class_rows.sort_values(by='score', ascending=False).head(num_clips_default)
    for class_label, class_rows in predictions.groupby('class')
    if class_label.lower() != 'nocall'
]


Total predictions: 48616 clips


In [16]:
# Combine all selected clips into a single DataFrame and write each one as a
# short audio segment to <output_dir>/<class>/<score>_<offset>_<source file name>.
clip_duration = 3  # length in seconds of every exported segment

if selected_clips:
    selected_df = pd.concat(selected_clips, ignore_index=True)
    output_root = Path(output_dir)

    # Create each class folder once up front rather than once per clip.
    for class_name in selected_df['class'].unique():
        (output_root / class_name).mkdir(parents=True, exist_ok=True)

    for _, row in selected_df.iterrows():
        class_dir = output_root / row['class']
        score = row['score']

        # Keep the exact annotated start time for loading the audio; only the
        # file name uses the truncated integer. Loading from the truncated
        # value would shift the window whenever the offset is fractional.
        start_s = float(row['offset'])
        offset = int(start_s)

        audio_path = Path(row['file_path'])  # Use full path directly
        output_filename = f"{score}_{offset}_{audio_path.name}"

        try:
            audio = Audio.from_file(audio_path, offset=start_s, duration=clip_duration)
            audio.save(class_dir / output_filename)
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error processing {audio_path}: {e}")
else:
    print("No clips selected for validation.")