### Selecting clips for validation per class based on their score 

In [1]:
import pandas as pd
import glob
import os
from opensoundscape import Audio, audio
from pathlib import Path

  from tqdm.autonotebook import tqdm


In [2]:
# Directories
# NOTE: absolute paths; adjust all three before running on another machine.
# Folder that is searched for the BirdNET prediction tables (*.txt files).
annotations_dir = '/mnt/d/retraining_BirdNET/iterative_training_2025/output_results_2025/model_0/'
# Folder holding the source recordings; clips are cut from "<filename>.wav" found here.
audio_files_dir = '/mnt/d/retraining_BirdNET/iterative_training_2025/input_20%/'  # Update this if different from annotations directory
# Root folder for the extracted clips; one sub-folder per class is created beneath it.
output_dir = '/mnt/d/retraining_BirdNET/iterative_training_2025/segments_validation_2025/model_0/'

In [3]:
# Get list of all text files with BirdNET predictions.
# glob returns matches in arbitrary, filesystem-dependent order, so the list is
# sorted to keep the row order of the combined table (and therefore any
# tie-breaking between equal scores later on) reproducible between runs.
# os.path.join avoids the doubled slash that "dir/" + "/*.txt" produced.
table_filenames = sorted(glob.glob(os.path.join(annotations_dir, "*.txt")))


In [4]:
# List to hold dataframes: one DataFrame per annotation file is appended to it
# by the reading loop. Re-run this cell before re-running that loop, otherwise
# the same tables are appended a second time and every row is duplicated.
list_of_birdnet_dfs = []

In [5]:
# Load every BirdNET table, keeping only the columns needed for clip
# selection, and tag each row with the recording it came from.
# Maps the original column headers to the short names used downstream.
column_names = {
    'File Offset (s)': 'offset',
    'Common Name': 'class',
    'Confidence': 'score',
}

for table_filename in table_filenames:
    # "<recording>.BirdNET.selection.table.txt" -> "<recording>"
    recording_name = Path(table_filename).stem.replace('.BirdNET.selection.table', '')

    table_df = pd.read_csv(table_filename, sep='\t', usecols=list(column_names))
    table_df = table_df.rename(columns=column_names)
    table_df['filename'] = recording_name

    list_of_birdnet_dfs.append(table_df)

KeyboardInterrupt: 

In [None]:
# Stack the per-file tables row-wise into one DataFrame; the original
# per-file indices are discarded and the rows are renumbered from 0.
big_df = pd.concat(objs=list_of_birdnet_dfs, axis=0, ignore_index=True)

In [None]:
# Select, for every class, the detections with the highest score.
# Input : big_df          - combined table with 'class' and 'score' columns.
# Output: all_top_entries - up to CLIPS_PER_CLASS rows per class, grouped by
#                           class and sorted by descending score within a class.
CLIPS_PER_CLASS = 32  # how many top-scoring detections to keep per class
EXCLUDED_CLASSES = ('Noise', 'Environmental')  # classes skipped during selection

top_entries_per_class = []

for class_name, class_df in big_df.groupby(by='class'):
    if class_name in EXCLUDED_CLASSES:
        continue

    # Highest scores first, then keep at most CLIPS_PER_CLASS rows
    top_entries_per_class.append(
        class_df.sort_values(by='score', ascending=False).head(CLIPS_PER_CLASS)
    )

if top_entries_per_class:
    # Combine the per-class selections into a single DataFrame
    all_top_entries = pd.concat(top_entries_per_class)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame with the same columns so later cells simply have nothing to do.
    all_top_entries = big_df.iloc[0:0]
    print("No detections left after excluding classes; nothing was selected.")

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
all_top_entries.info()


In [None]:
# Cut one clip per selected detection and save it as
#   <output_dir>/<class>/<score>_<offset>_<recording>.wav
# Requires 'all_top_entries' (columns: class, score, offset, filename) from
# the selection step. Files that cannot be read or written are reported and
# skipped so that one bad recording does not abort the whole export.
CLIP_DURATION_S = 3  # length of each extracted clip, in seconds

for idx, row in all_top_entries.iterrows():
    # Create class-specific directory if not exists
    class_name = row['class']
    class_dir = Path(output_dir) / class_name
    class_dir.mkdir(parents=True, exist_ok=True)

    # Construct the filename for saving
    score = row['score']
    # Keep the offset as a float: truncating with int() would cut the clip
    # from the wrong start time whenever the table holds fractional offsets
    # (presumably the case when the analysis used overlapping windows).
    offset = float(row['offset'])
    filename = row['filename']
    # Whole-second offsets are written without a decimal part, so file names
    # stay identical to the previous integer-based naming.
    offset_label = int(offset) if offset.is_integer() else offset
    output_filename = f"{score:.3f}_{offset_label}_{filename}.wav"

    # Load and save the audio segment
    audio_path = Path(audio_files_dir) / f"{filename}.wav"
    try:
        # Named 'clip' so the 'audio' module imported from opensoundscape
        # is not overwritten by the loaded segment.
        clip = Audio.from_file(audio_path, offset=offset, duration=CLIP_DURATION_S)
        clip.save(class_dir / output_filename)
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
