In [3]:
import pandas as pd
from pathlib import Path
from opensoundscape import Audio

  from tqdm.autonotebook import tqdm


In [4]:
# --- Inputs -----------------------------------------------------------------
# Tab-separated BirdNET selection table holding the model predictions
annotations_file = '/mnt/d/retraining_BirdNET_2025/iterative_training/results/model_0/BirdNET_SelectionTable.txt'
# CSV listing the clips already in the training set (read for its 'iteration' and 'file' columns)
tracker_file = '/mnt/d/retraining_BirdNET_2025/model_train/training_set_tracker.csv'
# Per-class precision CSV ('class', 'precision'); skipped when the file does not exist
precision_file = '/mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/model_0/precision_per_class.csv'
# Directory containing the source recordings the clips are cut from
audio_files_dir = '/mnt/d/retraining_BirdNET_2025/iterative_training/input_20%/'

# --- Output -----------------------------------------------------------------
# Root directory; one sub-directory per class is created beneath it
output_dir = '/mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/model_1/uncertainty/'

# --- Parameters -------------------------------------------------------------
# Number of clips selected per class unless the class's precision raises it
num_clips_default = 32

In [5]:
# Read only the needed BirdNET selection-table columns and give them short names.
# The mapping's keys double as the list of columns to load.
column_names = {
    'Begin Path': 'file_path',
    'File Offset (s)': 'offset',
    'Common Name': 'class',
    'Confidence': 'score',
}
predictions = (
    pd.read_csv(annotations_file, sep='\t', usecols=list(column_names))
    .rename(columns=column_names)
)

In [6]:
# Construct `offset_filename` using ONLY offset and filename with extension,
# i.e. "<integer offset>_<basename of file_path>".
# Built with a plain list comprehension rather than a row-wise DataFrame.apply:
# it yields the same strings, avoids creating a Series per row, and still works
# when `predictions` is empty (row-wise apply on an empty frame returns a
# DataFrame, which cannot be assigned to a single column).
predictions['offset_filename'] = [
    f"{int(offset)}_{Path(file_path).name}"  # Keep original extension
    for offset, file_path in zip(predictions['offset'], predictions['file_path'])
]

Try an alternative filtering condition: compare clips by the first five underscore-separated fields of their filenames.

In [7]:
# Load training set tracker
def extract_core_filename(filename):
    """Return the first five underscore-separated fields of a filename's stem.

    The final extension is dropped before splitting, so
    'a_b_c_d_e_f.wav' becomes 'a_b_c_d_e'. Names with fewer than five
    fields are returned whole (minus the extension).

    Defined at cell level, outside the tracker check, because the filtering
    cell calls it on the predictions whether or not a tracker file exists.
    """
    parts = Path(filename).stem.split('_')  # Remove extension and split filename
    return '_'.join(parts[:5])  # Keep only first 5 elements


if Path(tracker_file).exists():
    tracker = pd.read_csv(tracker_file)

    # Only consider files added in iteration 1 and onwards. Copy the filtered
    # rows so the column assignment below writes to an independent frame
    # instead of a slice of the loaded CSV.
    tracker = tracker[tracker['iteration'] >= 1].copy()

    # NOTE(review): predictions are keyed as "<offset>_<filename>". If the
    # tracker's 'file' entries carry an extra leading field (e.g. the
    # "<score>_" prefix used for exported clips), the first five fields will
    # not line up and nothing will be filtered. Confirm against the tracker.
    tracker['core_filename'] = tracker['file'].apply(lambda x: extract_core_filename(Path(x).name))
    tracker_filenames = tracker['core_filename'].tolist()
else:
    # No tracker yet: nothing to exclude
    tracker_filenames = []

In [8]:
# Reduce every prediction to the same "core" key used for the tracker entries
predictions['core_filename'] = predictions['offset_filename'].map(extract_core_filename)

# Drop clips whose core key is already in the training set (score and
# annotation are ignored in this comparison)
already_in_training = predictions['core_filename'].isin(tracker_filenames)
predictions = predictions[~already_in_training]

# Report how many candidate clips are left
print(f"Filtered predictions: {len(predictions)} clips remaining")

Filtered predictions: 196076 clips remaining


In [9]:
# Per-class precision lookup (class name -> precision). It stays empty when no
# precision file exists, so every class falls back to the default clip count.
precision_dict = {}
if Path(precision_file).exists():
    precision_data = pd.read_csv(precision_file)
    precision_dict = {
        cls: precision
        for cls, precision in zip(precision_data['class'], precision_data['precision'])
    }

In [10]:
# For each predicted class keep its highest-scoring clips
selected_clips = []
for class_name, class_predictions in predictions.groupby('class'):
    # 'nocall' predictions (any letter case) are never selected
    if class_name.lower() != 'nocall':
        # Classes whose recorded precision exceeds 0.5 get 64 clips;
        # all others (including classes with no recorded precision) get the default
        is_high_precision = precision_dict.get(class_name, 0.0) > 0.5
        num_clips = 64 if is_high_precision else num_clips_default

        # Highest score first, then take the top `num_clips` rows
        ranked = class_predictions.sort_values(by='score', ascending=False)
        selected_clips.append(ranked.head(num_clips))

In [11]:
# Combine all selected clips into a single DataFrame and export each one as an
# audio segment under <output_dir>/<class>/<score>_<offset>_<filename>.
if selected_clips:
    selected_df = pd.concat(selected_clips, ignore_index=True)
    output_root = Path(output_dir)
    audio_root = Path(audio_files_dir)
    clip_duration_s = 3  # Assuming 3-second segments

    # Create each class directory once up front rather than once per clip
    for class_name in selected_df['class'].unique():
        (output_root / class_name).mkdir(parents=True, exist_ok=True)

    # Clips that could not be exported; tracked so the final message does not
    # claim success when extraction failed
    failed_clips = []

    for _, row in selected_df.iterrows():
        class_name = row['class']
        class_dir = output_root / class_name

        # Construct the output filename (restoring the score for sorting)
        score = row['score']
        offset = int(row['offset'])
        filename = Path(row['file_path']).name
        output_filename = f"{score}_{offset}_{filename}"  # Keep original extension

        # The source recording is looked up by basename inside audio_files_dir
        audio_path = audio_root / filename
        try:
            # Extract the audio segment using Opensoundscape
            audio = Audio.from_file(audio_path, offset=offset, duration=clip_duration_s)
            audio.save(class_dir / output_filename)
        except Exception as e:
            # Best effort: report the failure and carry on with the next clip
            print(f"Error processing {audio_path}: {e}")
            failed_clips.append(output_filename)

    if failed_clips:
        saved_count = len(selected_df) - len(failed_clips)
        print(
            f"Saved {saved_count} of {len(selected_df)} selected clips to: {output_dir} "
            f"({len(failed_clips)} failed)"
        )
    else:
        print(f"Selected clips have been saved to: {output_dir}")
else:
    print("No clips selected for validation.")



Selected clips have been saved to: /mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/model_1/uncertainty/
