## Create continuous score labels' data frame for BirdNET detections

In [1]:
import os
import pandas as pd
import numpy as np
import librosa

In [2]:
# Function to get the duration of the wav file
def get_wav_duration(wav_path):
    return librosa.get_duration(path=wav_path)

In [3]:
# Define the directories containing the .wav files and BirdNET output .txt files
wav_directory = '/mnt/e/retraining_BirdNET/model_test/input_ready/'
txt_directory = '/mnt/e/retraining_BirdNET/model_test/results/2nd_model_alternative_autotune/'

In [4]:
# Create a list of all .wav files
wav_files = [f for f in os.listdir(wav_directory) if f.lower().endswith('.wav')]
# this code transform all extensions into lowecase before comparing them. 

In [5]:
# Define the desired order of columns
desired_order = [
    "Barn Owl", "Rusty-barred Owl_song", "Rusty-barred Owl_call", "Rusty-barred Owl_call1",
    "Ferruginous Pygmy-Owl", "Tropical Screech-Owl", "Black-capped Screech-Owl", "Long-tufted Screech-Owl",
    "Buff-fronted Owl", "Rufous Nightjar", "Silky-tailed Nightjar", "Stygian Owl_song", "Stygian Owl_call",
    "Striped Owl_song", "Striped Owl_call", "Burrowing Owl", "Rufous-capped Motmot", "Black-banded Owl",
    "Mottled Owl_song", "Mottled Owl_call", "Brown Tinamou", "Ocellated Poorwill", "Long-tailed Potoo",
    "Common Potoo", "Common Pauraque", "Short-tailed Nighthawk", "Spot-winged Wood-Quail", "Tawny-browed Owl",
    "Little Nightjar"
]

In [6]:
# Initialize an empty list to store dataframes for each file
all_dfs = []

# Loop through each .wav file
for wav_file in wav_files:
    # Get the duration of the .wav file
    duration = get_wav_duration(os.path.join(wav_directory, wav_file))
    
    # Create a template dataframe with 3-second segments
    segments = np.arange(0, duration, 3)
    end_segments = np.clip(segments + 3, None, duration)
    template_df = pd.DataFrame({
        'file': os.path.join(wav_directory, wav_file),
        'start_time': segments,
        'end_time': end_segments
    })
    
    # Initialize class columns with zeros
    for cls in desired_order:
        template_df[cls] = 0.0
    
    # Append the dataframe for this file to the list
    all_dfs.append(template_df)
    
# Concatenate all dataframes into a single dataframe
labels_df = pd.concat(all_dfs, ignore_index=True)

# Fill NAs with 0 to ensure all segments without detections are filled with zeros
labels_df = labels_df.fillna(0)

# Reorder the columns to match the desired order
labels_df = labels_df[['file', 'start_time', 'end_time'] + desired_order]


In [7]:
# Now read the BirdNET .txt files and update the labels dataframe
# Loop through each .wav file again to process corresponding .txt files
for wav_file in wav_files:
    # Construct the corresponding BirdNET .txt file name
    txt_file = wav_file.replace('.wav', '.BirdNET.selection.table.txt').replace('.WAV', '.BirdNET.selection.table.txt')
    txt_path = os.path.join(txt_directory, txt_file)
    
    # Read the BirdNET output annotation file into a dataframe
    if os.path.exists(txt_path):
        annotations = pd.read_csv(txt_path, delimiter='\t')
        # Extract relevant columns: start_time, end_time, annotation, score
        annotations = annotations[['Begin Time (s)', 'End Time (s)', 'Common Name', 'Confidence']]
        annotations.columns = ['start_time', 'end_time', 'annotation', 'score']
        
        # Update the labels dataframe with the scores
        for _, row in annotations.iterrows():
            # Find the 3-second segment that overlaps with the annotation
            overlapping_segments = labels_df[
                (labels_df['file'] == os.path.join(wav_directory, wav_file)) &
                (labels_df['start_time'] < row['end_time']) & 
                (labels_df['end_time'] > row['start_time'])
            ]
            
            for idx in overlapping_segments.index:
                labels_df.loc[idx, row['annotation']] = row['score']

# Remove the 'nocall' column if it exists
if 'nocall' in labels_df.columns:
    labels_df = labels_df.drop(columns=['nocall'])
#Quizas deba modificar esto para que directamente elimine todas las columnas que no esten en desired order.

# Save the updated labels dataframe to a CSV file
labels_df.to_csv('/mnt/e/retraining_BirdNET/model_test/results/2nd_model_alternative_autotune/predicted_labels.csv', index=False)

# Verify the number of rows in the final dataframe
print(f"Number of rows in final dataframe: {len(labels_df)}")


Number of rows in final dataframe: 4342


Modify end_time column to match 3 s segments

In [8]:
# Load the CSV file directly into a DataFrame
data_path = '/mnt/e/retraining_BirdNET/model_test/results/2nd_model_alternative_autotune/predicted_labels.csv'
df = pd.read_csv(data_path)

# Function to adjust times
def adjust_times(group):
    # For groups with only one segment and the end time is less than 3
    if len(group) == 1 and group.iloc[0]['end_time'] < 3:
        group.iloc[0, group.columns.get_loc('end_time')] = 3
    else:
        # Process as usual for last row adjustments
        last_row = group.iloc[-1]
        if last_row['end_time'] % 3 != 0:
            if len(group) > 1:  # There's a previous row to reference
                new_start_time = group.iloc[-2]['end_time']
            else:  # Single row handling
                new_start_time = 0
            new_end_time = 3 * ((new_start_time // 3) + 1)
            group.iloc[-1, group.columns.get_loc('start_time')] = new_start_time
            group.iloc[-1, group.columns.get_loc('end_time')] = new_end_time
    return group

# Apply the function to each group of audio files
adjusted_df = df.groupby('file').apply(adjust_times)

# Save the adjusted DataFrame to a new CSV file
adjusted_df.to_csv('/mnt/e/retraining_BirdNET/model_test/results/2nd_model_alternative_autotune/predicted_labels.csv', index=False)

Check that true labels df and predicted labels are aligned and have the same lenght

In [9]:
# Load the true labels dataframe
true_labels_df = pd.read_csv('/mnt/e/retraining_BirdNET/model_test/input_ready/one-hot-encoded_validation.csv')

# Ensure the dataframes are aligned
true_labels_df = true_labels_df.sort_values(by=['file', 'start_time']).reset_index(drop=True)
labels_df = labels_df.sort_values(by=['file', 'start_time']).reset_index(drop=True)

# Check if the unique file names match between the true labels and predicted labels dataframes
unique_true_files = set(true_labels_df['file'].unique())
unique_pred_files = set(labels_df['file'].unique())

missing_in_pred = unique_true_files - unique_pred_files
missing_in_true = unique_pred_files - unique_true_files

print(f"Files in true labels not in predicted labels: {missing_in_pred}")
print(f"Files in predicted labels not in true labels: {missing_in_true}")
print(f"Number of unique files in true labels: {len(unique_true_files)}")
print(f"Number of unique files in predicted labels: {len(unique_pred_files)}")

# Align the dataframes if necessary
if missing_in_pred or missing_in_true:
    print("Dataframes are not aligned properly. Please check the missing files and ensure they are processed correctly.")
else:
    print("Dataframes are aligned and ready for mAP calculation.")

Files in true labels not in predicted labels: set()
Files in predicted labels not in true labels: set()
Number of unique files in true labels: 205
Number of unique files in predicted labels: 205
Dataframes are aligned and ready for mAP calculation.
