## Create a one-hot encoded dataset of true labels for the model test set

In [1]:
# Other utilities and packages
import opensoundscape
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Was indented by two spaces, which is an IndentationError at cell top level.
from tqdm.autonotebook import tqdm


In [2]:
from glob import glob

# Folder containing the Raven annotation tables
raven_files_dir = '/mnt/d/retraining_BirdNET/model_test_2025/input_ready/'
# Folder containing the audio recordings
audio_files_dir = '/mnt/d/retraining_BirdNET/model_test_2025/input_ready/'

# Every ".txt" file in the folder is naively treated as a Raven file.
# Matching on the ".selections.txt" suffix would be a stricter alternative.
raven_files = glob(f"{raven_files_dir}/*.txt")

# Audio files are matched on the ".wav" and ".WAV" extensions only.
audio_files = glob(f"{audio_files_dir}/*.wav")
audio_files += glob(f"{audio_files_dir}/*.WAV")

print(f"found {len(raven_files)} annotation files")
print(f"found {len(audio_files)} audio files")

found 208 annotation files
found 208 audio files


In [3]:
# Table of audio files. `base_name` is the file name without its directory
# and final extension (Path.stem); it is the key the files are paired on.
from pathlib import Path

audio_df = pd.DataFrame({'audio_file': audio_files}).assign(
    base_name=lambda frame: frame['audio_file'].apply(lambda x: Path(x).stem)
)

In [4]:
# Table of Raven annotation files, keyed like the audio table:
# `base_name` is the file name without its directory and final extension
# (Path.stem), so "<recording>.txt" pairs with "<recording>.wav".
raven_df = pd.DataFrame({'raven_files': raven_files}).assign(
    base_name=lambda frame: frame['raven_files'].apply(lambda x: Path(x).stem)
)

In [5]:
# Outer-join audio and annotation tables on `base_name`; a file without a
# partner is kept and shows up as a missing value in the other column.
paired_df = audio_df.merge(raven_df, on='base_name', how='outer')

In [6]:
# Missing values per column; a non-zero count means some audio or annotation
# file was left without a partner by the merge.
na_counts = paired_df.isna().sum()
print(na_counts)

audio_file     0
base_name      0
raven_files    0
dtype: int64


In [7]:
from opensoundscape.annotations import BoxedAnnotations

# Load every Raven table together with the path of its paired recording.
raven_paths = paired_df['raven_files']
audio_paths = paired_df['audio_file']
boxed_annotations = BoxedAnnotations.from_raven_files(raven_paths, audio_paths)

# Preview the first annotations
boxed_annotations.df.head(3)

Unnamed: 0,audio_file,annotation_file,annotation,start_time,end_time,low_f,high_f,View,Lluvia,Channel,Annotation,Selection,Vocalizacion,Observaciones
0,/mnt/d/retraining_BirdNET/model_test_2025/inpu...,/mnt/d/retraining_BirdNET/model_test_2025/inpu...,rooster,3.940505,4.999979,404.1,3151.7,Spectrogram 1,,1,,1,,
1,/mnt/d/retraining_BirdNET/model_test_2025/inpu...,/mnt/d/retraining_BirdNET/model_test_2025/inpu...,rooster,0.647163,2.531127,794.7,1872.1,Spectrogram 1,,1,,1,,
2,/mnt/d/retraining_BirdNET/model_test_2025/inpu...,/mnt/d/retraining_BirdNET/model_test_2025/inpu...,rooster,2.847518,4.999979,417.5,3138.2,Spectrogram 1,,1,,1,,


In [8]:
# Cut each recording into 3.0 s clips and build a one-hot label table with one
# column per class listed in `class_subset`.
# NOTE(review): with final_clip='full' the last clip of a file presumably does
# not always start on a 3 s boundary (the times are adjusted later in this
# notebook) -- confirm against the opensoundscape documentation.
label_df = boxed_annotations.one_hot_clip_labels(
    clip_duration=3.0,  # cut them in 3.0 s
    clip_overlap=0,
    min_label_overlap=0,
    # Each class is listed exactly once: 'M.choliba_song' previously appeared
    # twice, which requested the same label column two times.
    class_subset=['T.furcata_song','T.alba_song', 'M.choliba_song','S.hylophila_song','S.hylophila_call','S.hylophila_call1',
                  'G.brasilianum_song', 'M.atricapilla_song', 'M.sancta_songF','M.sacntacat_song','M.sancta_songDuet','Sanctacat','SanctacatFAgit',
                  'A.harrisii_song','A.rufus', 'A.rufus_song','A.sericocaudatus', 'A.sericocaudatus_song', 'A.stygius_song', 'A.stygius_call', 'A.clamator_song',
                  'A.clamator_call','A.cunicularia_song','A.cunicularia_call','C.huhula_song','C.huhula_call', 'C.huhula_call1', 'C.virgata_song', 'C.virgata_call',
                  'N.aethereus_song','Nyctibius aethereus', 'N.griseus_song','Nyctibius griseus','N.albicollis_song', 'Nyctidromus albicollis', 'N. albicollis',
                  'N.albicollis','N.ocellatus_song', 'Nyctiphrinus ocellatus','L.semitorquatus_song','L. semitorquatus','Lurocalis semitorquatus',
                  'P.koeniswaldiana_song','S.parvula_song','Setopagis parvula', 'S. parvula','Setopagis_parvula', 'B.ruficapillus_song', 'O.capueira_song',
                  'C.obsoletus_song', 'frogs', 'rooster'],
    final_clip='full',
)
label_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,T.furcata_song,T.alba_song,M.choliba_song,S.hylophila_song,S.hylophila_call,S.hylophila_call1,G.brasilianum_song,M.atricapilla_song,M.sancta_songF,M.sacntacat_song,...,Nyctiphrinus ocellatus,L.semitorquatus_song,L. semitorquatus,Lurocalis semitorquatus,O.capueira_song,P.koeniswaldiana_song,S.parvula_song,Setopagis parvula,S. parvula,Setopagis_parvula
file,start_time,end_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
"/mnt/d/retraining_BirdNET/model_test_2025/input_ready/Megascops sanctaecatarinae 16 noviembre 2020, pareja, PP Caa Yari, arroyo Tambero, despejado, calmo, fr.wav",0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"/mnt/d/retraining_BirdNET/model_test_2025/input_ready/Megascops sanctaecatarinae 16 noviembre 2020, pareja, PP Caa Yari, arroyo Tambero, despejado, calmo, fr.wav",3.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"/mnt/d/retraining_BirdNET/model_test_2025/input_ready/Megascops sanctaecatarinae 16 noviembre 2020, pareja, PP Caa Yari, arroyo Tambero, despejado, calmo, fr.wav",6.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Forward-fill the 'file' column when it exists as a regular column
# (the step is skipped otherwise).
# `Series.ffill()` replaces `fillna(method='ffill')`, which is deprecated in
# recent pandas releases; both propagate the last valid value forward.
if 'file' in label_df.columns:
    label_df['file'] = label_df['file'].ffill()

# Now save the DataFrame to a CSV file (index included)
# NOTE(review): the following section reads
# '/mnt/e/retraining_BirdNET/model_test/input_ready/one-hot-encoded_validation.csv',
# which is not the file written here -- confirm which file is meant to be used.
label_df.to_csv('/mnt/d/retraining_BirdNET/model_test_2025/input_ready/one-hot-encoded_validation2.csv', index=True)

I have converted the .xlsx into a .csv, but it still needs fixing: in the `file` column, each file path is listed only in the first row of its segment, and the rows that follow contain NaN.
To fill in the missing file paths, we can use the `fillna` method with `method='ffill'` (equivalent to calling `ffill()`), which propagates the last valid observation forward.

### Modify .csv to match BirdNET predicted classes

I will combine the columns of my DataFrame so that they match the BirdNET prediction classes. For example, I have several label columns for M. sanctaecatarinae (five in the mapping below), whereas BirdNET generates predictions for only one class.

In [10]:
import pandas as pd

# Load your existing DataFrame
df = pd.read_csv('/mnt/e/retraining_BirdNET/model_test/input_ready/one-hot-encoded_validation.csv')

# New (BirdNET) column name -> annotation label columns that are merged into it.
# NOTE(review): 'A.cunicularia_call', 'frogs' and 'rooster' are requested as
# classes earlier in the notebook but are not mapped here, so if they are
# present in the CSV they pass through unchanged -- confirm this is intended.
combine_columns = {
    'Barn Owl': ['T.furcata_song', 'T.alba_song'],'Rusty-barred Owl_song':['S.hylophila_song'],
    'Rusty-barred Owl_call': ['S.hylophila_call'], 'Rusty-barred Owl_call1':['S.hylophila_call1'],
    'Ferruginous Pygmy-Owl':['G.brasilianum_song'], 'Tropical Screech-Owl':['M.choliba_song'],
    'Black-capped Screech-Owl':['M.atricapilla_song'], 'Long-tufted Screech-Owl':['M.sancta_songF', 'M.sacntacat_song','M.sancta_songDuet','Sanctacat','SanctacatFAgit'],
    'Buff-fronted Owl':['A.harrisii_song'],'Rufous Nightjar':['A.rufus_song','A.rufus'],'Silky-tailed Nightjar':['A.sericocaudatus','A.sericocaudatus_song'],
    'Stygian Owl_song':['A.stygius_song'],'Stygian Owl_call':['A.stygius_call'], 'Striped Owl_song':['A.clamator_song'],
    'Striped Owl_call':['A.clamator_call'],'Burrowing Owl':['A.cunicularia_song'], 'Rufous-capped Motmot':['B.ruficapillus_song'],
    'Black-banded Owl':['C.huhula_song','C.huhula_call', 'C.huhula_call1'],'Mottled Owl_song':['C.virgata_song'],'Mottled Owl_call':['C.virgata_call'],
    'Brown Tinamou':['C.obsoletus_song'],'Ocellated Poorwill':['N.ocellatus_song','Nyctiphrinus ocellatus'],'Long-tailed Potoo':['N.aethereus_song','Nyctibius aethereus'],
    'Common Potoo':['N.griseus_song', 'Nyctibius griseus'],'Common Pauraque':['N.albicollis_song', 'Nyctidromus albicollis', 'N. albicollis', 'N.albicollis'],
    'Short-tailed Nighthawk':['L.semitorquatus_song', 'L. semitorquatus', 'Lurocalis semitorquatus'],
    'Spot-winged Wood-Quail':['O.capueira_song'],'Tawny-browed Owl':['P.koeniswaldiana_song'],'Little Nightjar':['S.parvula_song','Setopagis parvula', 'Setopagis_parvula','S. parvula']
}


def combine_label_columns(labels, mapping):
    """Merge groups of one-hot label columns into single columns.

    Parameters
    ----------
    labels : pandas.DataFrame
        Table holding the label columns named in `mapping`.
    mapping : dict
        New column name -> list of existing columns to merge into it.

    Returns
    -------
    pandas.DataFrame
        A copy of `labels` where, for every entry of `mapping`, the new column
        is the row-wise sum of the listed columns capped at 1, and the listed
        columns are removed.

    Raises
    ------
    KeyError
        If a listed column is missing and the group has not been combined yet.
    """
    combined = labels.copy()
    for new_col, old_cols in mapping.items():
        # This cell overwrites the CSV it reads, so on a second run the source
        # columns are already gone. Skip groups that were combined before
        # instead of failing with a KeyError.
        already_combined = new_col in combined.columns and not any(
            col in combined.columns for col in old_cols
        )
        if already_combined:
            continue
        combined[new_col] = combined[old_cols].sum(axis=1).clip(upper=1)
        combined = combined.drop(columns=old_cols)
    return combined


# Process each new column
df = combine_label_columns(df, combine_columns)

# Save the updated DataFrame back to the same CSV file (without the index)
df.to_csv('/mnt/e/retraining_BirdNET/model_test/input_ready/one-hot-encoded_validation.csv', index=False)


### I will fix the start and end times that do not match the expected 3 s sequence

In [11]:
import pandas as pd

# Location of the label table that this section reads
data_path = '/mnt/e/retraining_BirdNET/model_test/input_ready/one-hot-encoded_validation.csv'

# Read the CSV straight into a DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)

# Function to adjust times
def adjust_times(group, clip_duration=3):
    """Snap the last clip of one file onto the `clip_duration` grid.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single file, with 'start_time' and 'end_time'
        columns. The last row is treated as the file's final clip, so rows
        are assumed to be in time order -- TODO confirm for the input CSV.
    clip_duration : int or float, default 3
        Expected clip length in seconds.

    Returns
    -------
    pandas.DataFrame
        A copy of `group` in which:
        * a lone row ending before `clip_duration` gets
          end_time = clip_duration;
        * otherwise, if the last row's end_time is not a multiple of
          `clip_duration`, the last row starts at the previous row's end_time
          (0 when there is no previous row) and ends at the next multiple of
          `clip_duration` after that start.
        All other values are unchanged.
    """
    # Work on a copy: functions passed to groupby.apply must not mutate the
    # group they receive (pandas does not support it), and callers keep their
    # original rows.
    adjusted = group.copy()
    start_col = adjusted.columns.get_loc('start_time')
    end_col = adjusted.columns.get_loc('end_time')
    n_rows = len(adjusted)
    last_end = adjusted.iloc[-1, end_col]

    # A single clip shorter than the expected duration: stretch it to full length
    if n_rows == 1 and last_end < clip_duration:
        adjusted.iloc[0, end_col] = clip_duration
        return adjusted

    # Final clip off the grid: re-anchor it right after the previous clip
    if last_end % clip_duration != 0:
        new_start_time = adjusted.iloc[-2, end_col] if n_rows > 1 else 0
        new_end_time = clip_duration * ((new_start_time // clip_duration) + 1)
        adjusted.iloc[-1, start_col] = new_start_time
        adjusted.iloc[-1, end_col] = new_end_time
    return adjusted

# Run the adjustment once per audio file (rows grouped by the 'file' column)
rows_by_file = df.groupby('file')
adjusted_df = rows_by_file.apply(adjust_times)
print(adjusted_df)

# Write the adjusted table back to the CSV file, without the index
adjusted_df.to_csv(
    '/mnt/e/retraining_BirdNET/model_test/input_ready/one-hot-encoded_validation.csv',
    index=False,
)


                                                                                                      file  \
file                                                                                                         
/mnt/e/retraining_BirdNET/model_test/input_read... 1796  /mnt/e/retraining_BirdNET/model_test/input_rea...   
                                                   1797  /mnt/e/retraining_BirdNET/model_test/input_rea...   
/mnt/e/retraining_BirdNET/model_test/input_read... 1421  /mnt/e/retraining_BirdNET/model_test/input_rea...   
                                                   1422  /mnt/e/retraining_BirdNET/model_test/input_rea...   
/mnt/e/retraining_BirdNET/model_test/input_read... 1458  /mnt/e/retraining_BirdNET/model_test/input_rea...   
...                                                                                                    ...   
/mnt/e/retraining_BirdNET/model_test/input_read... 670   /mnt/e/retraining_BirdNET/model_test/input_rea...   
          

I will count the empty rows, which correspond to the non-event classes.