### Selecting clips for validation per class based on their score 

In [1]:
import pandas as pd
import glob
import os
from opensoundscape import Audio, audio
from pathlib import Path

  from tqdm.autonotebook import tqdm


In [2]:
# Directories
annotations_file = '/mnt/d/retraining_BirdNET/iterative_training_2025/output_results_2025/model_0/BirdNET_SelectionTable.txt' #now is a single file
audio_files_dir = '/mnt/d/retraining_BirdNET/iterative_training_2025/input_20%/'  # Update this if different from annotations directory
output_dir = '/mnt/d/retraining_BirdNET/iterative_training_2025/segments_validation_2025/model_0/'

In [5]:
# Read each annotation file and select the necessary columns
df = pd.read_csv(annotations_file, sep='\t', usecols=['Begin Path', 'File offset (s)', 'Common Name', 'Confidence'])
df.rename(columns={
    'Begin Path': 'file',
    'File offset (s)': 'offset',
    'Common Name': 'class',
    'Confidence': 'score'
}, inplace=True)

df['file'] = Path(table_filename).stem.replace('.BirdNET.selection.table', '')


In [7]:
top_entries_per_class = []

# Assuming 'big_df' is your combined DataFrame with all entries
for class_name, class_df in big_df.groupby(by='class'):
    # Skip the 'nocall' class
    if class_name == 'Noise' or 'Environmental':
        continue
    
    # Sort by score and get the top 32 highest scoring files for this class
    top_32_for_this_class = class_df.sort_values(by='score', ascending=False).head(32)
    # Append the top 32 for this class to the list
    top_entries_per_class.append(top_32_for_this_class)

# Combine all top 32 entries into a single DataFrame
all_top_entries = pd.concat(top_entries_per_class)

print(all_top_entries.info())


<class 'pandas.core.frame.DataFrame'>
Index: 1157 entries, 71546 to 247115
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   start_time  1157 non-null   float64
 1   end_time    1157 non-null   float64
 2   class       1157 non-null   object 
 3   score       1157 non-null   float64
 4   filename    1157 non-null   object 
dtypes: float64(3), object(2)
memory usage: 54.2+ KB
None


In [8]:
# Assuming you have already loaded 'all_top_entries' as per your previous code
for idx, row in all_top_entries.iterrows():
    # Create class-specific directory if not exists
    class_name = row['class']
    class_dir = Path(output_dir) / class_name
    class_dir.mkdir(parents=True, exist_ok=True)
    
    # Construct the filename for saving
    score = row['score']
    offset = int(row['offset'])
    filename = row['filename']
    output_filename = f"{score}_{offset}_{filename}.wav"
    
    # Load and save the audio segment
    audio_path = Path(audio_files_dir) / f"{filename}.wav"
    audio = Audio.from_file(audio_path, offset=start_time, duration=end_time - start_time)
    audio.save(class_dir / output_filename)

print("Clips have been saved in their respective class directories.")




Clips have been saved in their respective class directories.
