### Selecting clips for validation per class based on their score 

In [1]:
import pandas as pd
import glob
import os
from opensoundscape import Audio, audio
from pathlib import Path

  from tqdm.autonotebook import tqdm


In [2]:
# Directories.
# The project root and the model-iteration tag are each defined once so that the
# prediction folder and the clip output folder can never point at different
# iterations; switching to another iteration is a single edit of `model_tag`.
project_root = '/mnt/e/retraining_BirdNET/iterative_training'
model_tag = '2nd_model'

# BirdNET prediction tables (*.txt) produced by the model being validated
annotations_dir = f'{project_root}/output_results/{model_tag}/'
# Source recordings (<filename>.wav) that the predictions refer to
audio_files_dir = f'{project_root}/input/'
# Destination for the extracted clips, one sub-folder per class
output_dir = f'{project_root}/segments_validation/{model_tag}/'

In [3]:
# Collect every BirdNET prediction table (*.txt) in the annotations directory.
# glob returns matches in arbitrary, filesystem-dependent order, so the list is
# sorted to keep the row order of the combined table reproducible between runs.
# os.path.join avoids the doubled '/' when annotations_dir already ends with one.
table_filenames = sorted(glob.glob(os.path.join(annotations_dir, "*.txt")))


In [4]:
# Accumulator for the per-file prediction tables (one DataFrame per table file)
list_of_birdnet_dfs = list()

In [5]:
# Read each annotation file, keeping only the columns needed downstream and
# giving them short names.
column_renames = {
    'Begin Time (s)': 'start_time',
    'End Time (s)': 'end_time',
    'Common Name': 'class',
    'Confidence': 'score',
}

# Start from an empty list inside this cell: re-running the cell would otherwise
# append every table a second time and duplicate all detections.
list_of_birdnet_dfs = []
for table_filename in table_filenames:
    df = pd.read_csv(
        table_filename, sep='\t', usecols=list(column_renames)
    ).rename(columns=column_renames)
    # Path.stem drops the trailing '.txt'; removing the BirdNET suffix leaves the
    # recording's base name, which is later used to locate '<filename>.wav'.
    df['filename'] = Path(table_filename).stem.replace('.BirdNET.selection.table', '')
    list_of_birdnet_dfs.append(df)

In [6]:
# Stack the per-file tables row-wise into one DataFrame, discarding the
# per-file indices in favour of a single new one
big_df = pd.concat(objs=list_of_birdnet_dfs, axis=0, ignore_index=True)

In [7]:
# Number of highest-scoring detections kept per class
top_n_per_class = 40

top_entries_per_class = []

# 'big_df' is the combined DataFrame holding the detections of every file
for class_name, class_df in big_df.groupby(by='class'):
    # Skip the 'nocall' class
    if class_name == 'nocall':
        continue

    # Sort by score and keep the highest-scoring detections for this class.
    # A stable sort keeps tied scores in their original row order, so the
    # selection at the cut-off is deterministic. Classes with fewer rows than
    # the limit simply contribute all of their rows.
    top_for_this_class = class_df.sort_values(
        by='score', ascending=False, kind='stable'
    ).head(top_n_per_class)
    top_entries_per_class.append(top_for_this_class)

# Combine the per-class selections into a single DataFrame
all_top_entries = pd.concat(top_entries_per_class)

# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would add a stray "None" line to the output).
all_top_entries.info()


<class 'pandas.core.frame.DataFrame'>
Index: 1157 entries, 71546 to 247115
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   start_time  1157 non-null   float64
 1   end_time    1157 non-null   float64
 2   class       1157 non-null   object 
 3   score       1157 non-null   float64
 4   filename    1157 non-null   object 
dtypes: float64(3), object(2)
memory usage: 54.2+ KB
None


In [8]:
# Cut every selected detection out of its source recording and save it under
# <output_dir>/<class>/ so the clips can be reviewed class by class.
# Requires 'all_top_entries' from the selection cell above.
for _, row in all_top_entries.iterrows():
    # Create class-specific directory if it does not exist yet
    class_name = row['class']
    class_dir = Path(output_dir) / class_name
    class_dir.mkdir(parents=True, exist_ok=True)

    filename = row['filename']
    score = row['score']
    # Keep the exact detection boundaries: truncating them to int would shift
    # the extracted window whenever the boundaries are fractional.
    start_time = float(row['start_time'])
    end_time = float(row['end_time'])

    # Whole-second boundaries are written without a decimal part, so file names
    # are unchanged for integer times; fractional boundaries keep their decimals.
    start_label = int(start_time) if start_time.is_integer() else start_time
    end_label = int(end_time) if end_time.is_integer() else end_time
    output_filename = f"{score}_{start_label}_{end_label}_{filename}.wav"

    # Load only the detection window and save it. The result is named 'clip'
    # so it does not shadow the 'audio' module imported from opensoundscape.
    audio_path = Path(audio_files_dir) / f"{filename}.wav"
    clip = Audio.from_file(audio_path, offset=start_time, duration=end_time - start_time)
    clip.save(class_dir / output_filename)

print("Clips have been saved in their respective class directories.")




Clips have been saved in their respective class directories.
