In [1]:
import pandas as pd
from pathlib import Path
from opensoundscape import Audio

  from tqdm.autonotebook import tqdm


**Paths to input files and directories:**

In [11]:
# These paths never change
tracker_file = '/mnt/d/retraining_BirdNET_2025/model_train/train_set_9/training_set_tracker.csv'
audio_files_dir = '/mnt/d/Disco3_Backup/night_recordings/M12-ARU36-2007-0408/'

**Change the paths below**

Remember during **iteration 1** the results will come from **model_0**. On **iteration 2** the results will come from **model_1** and so on.. 

In [12]:
# In this change the model number. 
annotations_file = '/mnt/d/retraining_BirdNET_2025/iterative_training/results/model_10_M12/BirdNET_SelectionTable.txt'

# Change the path according to current iteration. 
output_dir = '/mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/it_M10/'

# Useful from iteration 2 onwards. Remember to use precision computed on the previous iteration. 
precision_file = '/mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/it_1_s1_t0.01/uncertainty/precision_it1.csv'  # Optional after iteration 1

# Parameters
num_clips_default = 80  # Default number of clips to select

In [13]:
# Load BirdNET predictions
predictions = pd.read_csv(annotations_file, sep='\t', usecols=['Begin Path', 'File Offset (s)', 'Common Name', 'Confidence'])
predictions.rename(columns={
    'Begin Path': 'file_path',
    'File Offset (s)': 'offset',
    'Common Name': 'class',
    'Confidence': 'score'
}, inplace=True)

In [14]:
# Construct `offset_filename` using ONLY offset and filename with extension
predictions['offset_filename'] = predictions.apply(
    lambda row: f"{int(row['offset'])}_{Path(row['file_path']).name}", axis=1  # Keep original extension
)

In [15]:
# Function to extract elements 1 to 5 of the filename for comparison
def extract_core_filename(filename):
    parts = Path(filename).stem.split('_')  # Remove extension and split filename
    return '_'.join(parts[1:6])  # Extract elements [1] to [5] 

# Load training set tracker
if Path(tracker_file).exists():
    tracker = pd.read_csv(tracker_file)
    
    # Only consider files added in iteration 1 and onwards
    tracker = tracker[tracker['iteration'] >= 1]
    
    # Extract core filename
    tracker['core_filename'] = tracker['file'].apply(lambda x: extract_core_filename(Path(x).name))
    tracker_filenames = tracker['core_filename'].tolist()
else:
    tracker_filenames = []

In [16]:
# Apply extraction to predictions
predictions['core_filename'] = predictions['offset_filename'].apply(extract_core_filename)

# Filter out clips already in the training set
predictions = predictions[~predictions['core_filename'].isin(tracker_filenames)]

# Display result for verification
print(f"Filtered predictions: {len(predictions)} clips remaining")  #HAVE TO BE AROUND 700,000 or more 

Filtered predictions: 48616 clips remaining


In [17]:
# Optional: Load precision data for iteration > 1
if Path(precision_file).exists():
    precision_data = pd.read_csv(precision_file)
    precision_dict = dict(zip(precision_data['class'], precision_data['precision']))
else:
    precision_dict = {}

In [18]:
# Group by class and select top-scoring clips
selected_clips = []
for class_name, group in predictions.groupby('class'):
    if class_name.lower() == 'nocall':
        continue  # Skip 'nocall' predictions

    # Sort by score in descending order
    group = group.sort_values(by='score', ascending=False)

    # Determine number of clips to select based on precision
    num_clips = num_clips_default
    if class_name in precision_dict and precision_dict[class_name] > 0.5:
        num_clips = 64

    # Select top-scoring clips
    selected_clips.append(group.head(num_clips))

In [19]:
# Combine all selected clips into a single DataFrame
if selected_clips:
    selected_df = pd.concat(selected_clips, ignore_index=True)

    # Save selected clips into class-specific directories
    for _, row in selected_df.iterrows():
        # Extract class and output directory
        class_name = row['class']
        class_dir = Path(output_dir) / class_name
        class_dir.mkdir(parents=True, exist_ok=True)

        # Construct the output filename (restoring the score for sorting)
        score = row['score']
        offset = int(row['offset'])
        filename = Path(row['file_path']).name
        output_filename = f"{score}_{offset}_{filename}"  # Keep original extension

        # Load and save the audio segment
        audio_path = Path(audio_files_dir) / filename
        try:
            # Extract the audio segment using Opensoundscape
            audio = Audio.from_file(audio_path, offset=offset, duration=3)  # Assuming 3-second segments
            audio.save(class_dir / output_filename)
            #print(f"Saved: {output_filename} to {class_dir}")
        except Exception as e:
            print(f"Error processing {audio_path}: {e}")
            
    print(f"Selected clips have been saved to: {output_dir}")
else:
    print("No clips selected for validation.")



Error processing /mnt/d/Disco3_Backup/night_recordings/M12-ARU36-2007-0408/M12_SN11_20220730_182000.WAV: [Errno 2] No such file or directory: '/mnt/d/Disco3_Backup/night_recordings/M12-ARU36-2007-0408/M12_SN11_20220730_182000.WAV'
Error processing /mnt/d/Disco3_Backup/night_recordings/M12-ARU36-2007-0408/M12_SN15_20220804_071000.WAV: [Errno 2] No such file or directory: '/mnt/d/Disco3_Backup/night_recordings/M12-ARU36-2007-0408/M12_SN15_20220804_071000.WAV'
Error processing /mnt/d/Disco3_Backup/night_recordings/M12-ARU36-2007-0408/M12_SN11_20220730_182000.WAV: [Errno 2] No such file or directory: '/mnt/d/Disco3_Backup/night_recordings/M12-ARU36-2007-0408/M12_SN11_20220730_182000.WAV'
Error processing /mnt/d/Disco3_Backup/night_recordings/M12-ARU36-2007-0408/M12_SN08_20220727_182000.WAV: [Errno 2] No such file or directory: '/mnt/d/Disco3_Backup/night_recordings/M12-ARU36-2007-0408/M12_SN08_20220727_182000.WAV'
Error processing /mnt/d/Disco3_Backup/night_recordings/M12-ARU36-2007-0408/M