## Create one hot encoded dataset for BirdNET annotations

In [1]:
#other utilities and packages
import opensoundscape
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from glob import glob
from pathlib import Path

  from tqdm.autonotebook import tqdm


In [2]:
# Folder holding the Raven annotation files
raven_files_dir = '/mnt/e/mock-train_07032024/test_mock/results/'

# Folder holding the audio files
audio_files_dir = '/mnt/e/mock-train_07032024/test_mock/'

# Collect every .txt file in the annotation folder.
# This naively treats any ".txt" file as a Raven file; matching on a more
# specific suffix such as ".selections.txt" would be a safer assumption.
raven_files = glob(f"{raven_files_dir}/*.txt")
print(f"found {len(raven_files)} annotation files")

# Collect the audio files; only the .wav and .WAV extensions are searched
audio_files = [
    audio_path
    for pattern in ("*.wav", "*.WAV")
    for audio_path in glob(f"{audio_files_dir}/{pattern}")
]
print(f"found {len(audio_files)} audio files")

found 205 annotation files
found 205 audio files


In [3]:
# Build a table of audio files with one row per file
audio_df = pd.DataFrame({'audio_file': audio_files})
# The file stem (file name without its final extension) is the key used later
# to match each audio file with its annotation file
audio_df['base_name'] = audio_df['audio_file'].map(lambda audio_path: Path(audio_path).stem)

In [4]:
# Build a table of annotation files with one row per file
raven_df = pd.DataFrame({'raven_file': raven_files})
# Keep only the part of the stem that precedes '.BirdNET.selection.table';
# stems without that marker are kept whole
raven_df['base_name'] = raven_df['raven_file'].map(
    lambda raven_path: Path(raven_path).stem.partition('.BirdNET.selection.table')[0]
)

In [5]:
# Pair audio and annotation files on base_name. The outer join keeps files
# that have no partner, which then show up as missing values.
paired_df = audio_df.merge(raven_df, on='base_name', how='outer')

In [6]:
# Count missing values per column; a non-zero count means some file was left
# without a matching partner
missing_per_column = paired_df.isna().sum()
print(missing_per_column)

audio_file    0
base_name     0
raven_file    0
dtype: int64


In [7]:
# Optional: drop the base_name helper column once the files are paired.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError when the column has
# already been removed.
paired_df = paired_df.drop(columns='base_name', errors='ignore')

In [8]:
# Preview the first five rows of the paired audio/annotation table
paired_df.head(n=5)

Unnamed: 0,audio_file,raven_file
0,/mnt/e/mock-train_07032024/test_mock/0.304_1_2...,/mnt/e/mock-train_07032024/test_mock/results/0...
1,/mnt/e/mock-train_07032024/test_mock/0.388_1_2...,/mnt/e/mock-train_07032024/test_mock/results/0...
2,/mnt/e/mock-train_07032024/test_mock/0.439_2_2...,/mnt/e/mock-train_07032024/test_mock/results/0...
3,/mnt/e/mock-train_07032024/test_mock/0.577_2_2...,/mnt/e/mock-train_07032024/test_mock/results/0...
4,/mnt/e/mock-train_07032024/test_mock/0.725_1_2...,/mnt/e/mock-train_07032024/test_mock/results/0...


In [9]:
from opensoundscape.annotations import BoxedAnnotations

# Load all annotation files, passing the matching audio file paths alongside
# them (both columns come from the same paired table)
boxed_annotations = BoxedAnnotations.from_raven_files(
    paired_df['raven_file'],
    paired_df['audio_file'],
)
# Preview the first six annotation rows
boxed_annotations.df.head(6)

Unnamed: 0,audio_file,annotation_file,annotation,start_time,end_time,low_f,high_f,score,View,Species Code,Channel,File Offset (s),Selection,Begin Path
0,/mnt/e/mock-train_07032024/test_mock/0.304_1_2...,/mnt/e/mock-train_07032024/test_mock/results/0...,nocall,0.0,3.0,0,15000.0,1.0,Spectrogram 1,nocall,1,0.0,1,/mnt/e/mock-train_07032024/test_mock/0.304_1_2...
1,/mnt/e/mock-train_07032024/test_mock/0.388_1_2...,/mnt/e/mock-train_07032024/test_mock/results/0...,Rufous-capped Motmot,3.0,6.0,0,15000.0,0.1192,Spectrogram 1,rucmot2,1,3.0,1,/mnt/e/mock-train_07032024/test_mock/0.388_1_2...
2,/mnt/e/mock-train_07032024/test_mock/0.439_2_2...,/mnt/e/mock-train_07032024/test_mock/results/0...,nocall,0.0,3.0,0,15000.0,1.0,Spectrogram 1,nocall,1,0.0,1,/mnt/e/mock-train_07032024/test_mock/0.439_2_2...


In [10]:
# List the columns of the annotation table; many of them are not needed.
annotation_columns = boxed_annotations.df.columns
print(annotation_columns)

Index(['audio_file', 'annotation_file', 'annotation', 'start_time', 'end_time',
       'low_f', 'high_f', 'score', 'View', 'Species Code', 'Channel',
       'File Offset (s)', 'Selection', 'Begin Path'],
      dtype='object')


In [14]:
# Classes to include as columns in the one-hot label table
target_classes = [
    'Barn Owl',
    'Rusty-barred Owl_song',
    'Rusty-barred Owl_call',
    'Rusty-barred Owl_call1',
    'Ferruginous Pygmy-Owl',
    'Tropical Screech-Owl',
    'Black-capped Screech-Owl',
    'Long-tufted Screech-Owl',
    'Buff-fronted Owl',
    'Rufous Nightjar',
    'Silky-tailed Nightjar',
    'Stygian Owl_song',
    'Stygian Owl_call',
    'Striped Owl_song',
    'Striped Owl_call',
    'Burrowing Owl',
    'Rufous-capped Motmot',
    'Black-banded Owl',
    'Mottled Owl_song',
    'Mottled Owl_call',
    'Brown Tinamou',
    'Ocellated Poorwill',
    'Long-tailed Potoo',
    'Common Potoo',
    'Common Pauraque',
    'Short-tailed Nighthawk',
    'Spot-winged Wood-Quail',
    'Tawny-browed Owl',
    'Little Nightjar',
]

# Build one-hot labels per clip
label_df = boxed_annotations.one_hot_clip_labels(
    clip_duration=3.0,  # clips are cut to a length of 3.0
    clip_overlap=0,
    min_label_overlap=0,
    class_subset=target_classes,
    final_clip='full',
)
# Preview the last three clips
label_df.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Barn Owl,Rusty-barred Owl_song,Rusty-barred Owl_call,Rusty-barred Owl_call1,Ferruginous Pygmy-Owl,Tropical Screech-Owl,Black-capped Screech-Owl,Long-tufted Screech-Owl,Buff-fronted Owl,Rufous Nightjar,...,Mottled Owl_call,Brown Tinamou,Ocellated Poorwill,Long-tailed Potoo,Common Potoo,Common Pauraque,Short-tailed Nighthawk,Spot-winged Wood-Quail,Tawny-browed Owl,Little Nightjar
file,start_time,end_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
/mnt/e/mock-train_07032024/test_mock/overlaid_randomSNR_492558_C.obsoletus.wav,6.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/mnt/e/mock-train_07032024/test_mock/overlaid_randomSNR_492558_C.obsoletus.wav,8.53,11.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/mnt/e/mock-train_07032024/test_mock/ARD3_BO10_20191025_232200_S.hylophila_song_7.wav,0.0,1.661043,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Forward-fill the 'file' column, but only when it exists as a regular column.
# NOTE(review): in the label_df output shown above, 'file' is displayed as an
# index level rather than a column, so this guard is presumably False and the
# fill is skipped -- confirm whether it is still needed.
if 'file' in label_df.columns:
    # Series.ffill() replaces fillna(method='ffill'), whose `method` argument
    # is deprecated in recent pandas releases.
    label_df['file'] = label_df['file'].ffill()

# Save the DataFrame to a CSV file, writing the index as well
label_df.to_csv('/mnt/e/mock-train_07032024/test_mock/results/bn_one-hot-encoded_validation.csv', index=True)

#### This code automatically adjusts the incorrect timestamps of each file's last segment so that they fall on multiples of 3

In [16]:
import pandas as pd

# Read the one-hot label table back from the CSV file
data_path = '/mnt/e/mock-train_07032024/test_mock/results/bn_one-hot-encoded_validation.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Function to adjust times
def adjust_times(group, clip_duration=3):
    """Snap the last clip of one file's rows onto the clip grid.

    The rows are expected to have 'start_time' and 'end_time' columns, and the
    final row is treated as the file's last clip.

    * A single row whose end time is below ``clip_duration`` has its end time
      set to ``clip_duration`` (its start time is left untouched).
    * Otherwise, if the last row's end time is not a multiple of
      ``clip_duration``, the last row is rewritten to start at the previous
      row's end time (or at 0 when there is no previous row) and to end at the
      next multiple of ``clip_duration`` after that start.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one file. It is not modified.
    clip_duration : int or float, default 3
        Length of the clip grid that end times are aligned to.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the last row adjusted where needed.
    """
    # Work on a copy so the caller's frame (e.g. a groupby slice) is never
    # mutated as a side effect.
    adjusted = group.copy()

    # An empty group has no last row to adjust.
    if adjusted.empty:
        return adjusted

    end_col = adjusted.columns.get_loc('end_time')
    last_end = adjusted['end_time'].iloc[-1]

    # Single short clip: only stretch its end up to one full clip.
    if len(adjusted) == 1 and last_end < clip_duration:
        adjusted.iloc[0, end_col] = clip_duration
        return adjusted

    if last_end % clip_duration != 0:
        # Continue from the previous clip when there is one, else from 0.
        new_start_time = adjusted['end_time'].iloc[-2] if len(adjusted) > 1 else 0
        # First grid boundary strictly after the new start time.
        new_end_time = clip_duration * ((new_start_time // clip_duration) + 1)
        adjusted.iloc[-1, adjusted.columns.get_loc('start_time')] = new_start_time
        adjusted.iloc[-1, end_col] = new_end_time
    return adjusted

# Adjust the clips of each audio file independently.
# Iterating over the groups (instead of calling groupby.apply) hands every
# group to adjust_times with its 'file' column intact; recent pandas versions
# deprecate, and later exclude, the grouping column inside groupby.apply,
# which would silently drop 'file' from the saved table.
adjusted_groups = [adjust_times(file_clips) for _, file_clips in df.groupby('file')]
# pd.concat raises on an empty list, so fall back to an untouched copy
adjusted_df = pd.concat(adjusted_groups) if adjusted_groups else df.copy()

# Overwrite the input CSV (data_path) with the adjusted clip times
adjusted_df.to_csv(data_path, index=False)

In [None]:
# Show the (rows, columns) dimensions of the adjusted DataFrame
adjusted_df.shape

Next steps: 1) see if we can create this DataFrame with continuous scores (from the .txt files);
2) see if we can calculate metrics using sklearn directly with these two DataFrames.