In [1]:
import pandas as pd
from pathlib import Path
from opensoundscape import Audio
from collections import Counter
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


**Paths to input files and directories:**

In [2]:
# These paths never change
# CSV tracking the clips already in the training set; its 'iteration' and 'file'
# columns are read further down to exclude those clips from the selection.
tracker_file = '/mnt/d/retraining_BirdNET_2025/model_train/train_set_5/training_set_tracker.csv'
# Directory containing the source recordings from which the selected segments are cut.
audio_files_dir = '/mnt/d/retraining_BirdNET_2025/iterative_training/input_20%_2/'

**Change the paths below**

Remember that during **iteration 1** the results come from **model_0**, during **iteration 2** they come from **model_1**, and so on.

In [3]:
# Change the model number in this path.
# Tab-separated BirdNET selection table holding the model predictions.
annotations_file = '/mnt/d/retraining_BirdNET_2025/iterative_training/results/model_5/BirdNET_SelectionTable.txt'

# Change the path according to the current iteration.
# One sub-directory per class is created under this directory for the exported segments.
output_dir = '/mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/it_6/uncertainty/'

# Useful from iteration 2 onwards. Remember to use the precision computed on the previous iteration.
# CSV with 'class' and 'precision' columns; if the file does not exist, no precision data is used.
precision_file = '/mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/precision_it5.csv'  # Optional after iteration 1

# Parameters
# Clips selected per class unless the class has a recorded precision above 0.5.
num_clips_default = 32  # Default number of clips to select

In [4]:
# Load BirdNET predictions: read only the four columns needed downstream
# and give them short, code-friendly names.
predictions = pd.read_csv(
    annotations_file,
    sep='\t',
    usecols=['Begin Path', 'File Offset (s)', 'Common Name', 'Confidence'],
).rename(
    columns={
        'Begin Path': 'file_path',
        'File Offset (s)': 'offset',
        'Common Name': 'class',
        'Confidence': 'score',
    }
)

In [5]:
# Construct `offset_filename` using ONLY offset and filename with extension.
# Built with a comprehension over the two columns instead of a row-wise
# DataFrame.apply(axis=1): it is much faster on millions of rows and also works
# when `predictions` is empty (row-wise apply returns a DataFrame in that case,
# which cannot be assigned to a single column).
predictions['offset_filename'] = [
    f"{int(offset)}_{Path(file_path).name}"  # Keep original extension
    for offset, file_path in zip(predictions['offset'], predictions['file_path'])
]

In [6]:
# Function to extract the core of a clip filename (elements 1 to 5) for comparison
def extract_core_filename(filename):
    """Return the part of a clip filename used to compare clips with each other.

    The extension (and any directory) is removed, the remaining name is split
    on '_', the first element is ignored and up to the next five elements are
    joined back together with '_'.

    Parameters
    ----------
    filename : str or path-like
        Clip filename; the first '_'-separated element is presumably the
        score the clip was saved with -- TODO confirm against the tracker file.

    Returns
    -------
    str or None
        The joined elements, or None when the name has fewer than 5 elements.
    """
    tokens = Path(filename).stem.split('_')
    if len(tokens) < 5:
        return None
    # A name with exactly 5 elements yields only 4 of them here
    return '_'.join(tokens[1:6])
    

# Load training set tracker
if Path(tracker_file).exists():
    tracker = pd.read_csv(tracker_file)

    # Only consider files added in iteration 1 and onwards.
    # .copy() makes the filtered frame independent of the original one, so adding
    # a column below does not raise pandas' SettingWithCopyWarning.
    tracker = tracker[tracker['iteration'] >= 1].copy()

    # Extract core filename (extract_core_filename already discards the
    # directory part and the extension, so the raw value can be passed as is)
    tracker['core_filename'] = tracker['file'].apply(extract_core_filename)

    # Names too short to be parsed come back as None; drop them so the list
    # only contains real core filenames to compare against.
    tracker_filenames = tracker['core_filename'].dropna().tolist()
else:
    # No tracker file: nothing will be excluded from the predictions
    tracker_filenames = []

In [7]:
# Add a column with offset_filename but without extension, for comparison with tracker_filenames
predictions['core_filename'] = predictions['offset_filename'].map(lambda name: Path(name).stem)

# Filter out clips already in the training set
already_in_training = predictions['core_filename'].isin(tracker_filenames)
predictions = predictions.loc[~already_in_training]

# Display result for verification: after filtering, the count should be
# lower than the full number of model predictions.
print(f"Filtered predictions: {len(predictions)} clips remaining")

Filtered predictions: 2502255 clips remaining


In [8]:
# Optional: Load precision data for iteration > 1.
# Start from an empty mapping and fill it only when the precision file exists.
precision_dict = {}
if Path(precision_file).exists():
    precision_data = pd.read_csv(precision_file)
    # Map each class name to its precision value
    precision_dict = {
        name: value
        for name, value in zip(precision_data['class'], precision_data['precision'])
    }

In [9]:
# Group by class and select top-scoring clips with site ID limit

selected_clips = []
for class_name, group in predictions.groupby('class'):
    # 'nocall' predictions are never selected
    if class_name.lower() == 'nocall':
        continue

    # Quota and per-site cap: larger values for classes whose recorded precision
    # is above 0.5, defaults for every other class (including classes that have
    # no precision value at all)
    if class_name in precision_dict and precision_dict[class_name] > 0.5:
        num_clips, site_limit = 64, 40
    else:
        num_clips, site_limit = num_clips_default, 20

    # Highest scores first
    group = group.sort_values(by='score', ascending=False)

    site_counter = Counter()  # clips accepted so far, per site ID
    selected = []

    for _, row in group.iterrows():
        # The site ID is the second '_'-separated element of core_filename
        site_id = row['core_filename'].split('_')[1]

        # Accept the clip only while its site is still below the cap
        if site_counter[site_id] < site_limit:
            site_counter[site_id] += 1
            selected.append(row)

        # Stop once we have enough clips
        if len(selected) == num_clips:
            break

    # Add the selected clips for this class to the final list
    selected_clips += selected


In [10]:
# Combine all selected clips into a single DataFrame and export each one as an
# audio segment inside a directory named after its class
if selected_clips:
    selected_df = pd.DataFrame(selected_clips)

    # Segments that could not be exported, kept so the final report is accurate
    failed_clips = []

    # Save selected clips into class-specific directories
    for _, row in selected_df.iterrows():
        # Extract class and output directory
        class_name = row['class']
        class_dir = Path(output_dir) / class_name
        class_dir.mkdir(parents=True, exist_ok=True)

        # Construct the output filename (restoring the score for sorting)
        score = row['score']
        offset = int(row['offset'])
        filename = Path(row['file_path']).name
        output_filename = f"{score}_{offset}_{filename}"  # Keep original extension

        # Load and save the audio segment
        audio_path = Path(audio_files_dir) / filename
        try:
            audio = Audio.from_file(audio_path, offset=offset, duration=3)  # Assuming 3-second segments
            audio.save(class_dir / output_filename)
        except Exception as e:
            # Best effort: report the problem and carry on with the remaining clips
            print(f"Error processing {audio_path}: {e}")
            failed_clips.append(audio_path)

    print(f"Selected clips have been saved to: {output_dir}")
    if failed_clips:
        # Without this summary the message above would suggest every clip was
        # written, even when some (or all) of them failed
        print(f"WARNING: {len(failed_clips)} of {len(selected_df)} clips could not be saved.")
else:
    print("No clips selected for validation.")



Selected clips have been saved to: /mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/it_6/uncertainty/
